From 844a28db7b0651eca20cc6319ca40a61d4f25577 Mon Sep 17 00:00:00 2001 From: Ruben Dashyan Date: Mon, 27 Jan 2020 16:20:01 +0100 Subject: [PATCH] [poincare/parsing/tokenizer] Token::Identifiers may contain Greek letters except pi --- ion/include/ion/unicode/code_point.h | 6 ++++ poincare/src/parsing/tokenizer.cpp | 47 +++++++++++++++++++--------- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/ion/include/ion/unicode/code_point.h b/ion/include/ion/unicode/code_point.h index 323a3e405..d05688d55 100644 --- a/ion/include/ion/unicode/code_point.h +++ b/ion/include/ion/unicode/code_point.h @@ -30,6 +30,12 @@ public: bool isCombining() const { return (m_code >= 0x300 && m_code <= 0x036F); } + bool isGreekCapitalLetter() const { + return 0x391 <= m_code && m_code <= 0x3a9 && m_code != 0x3a2; + } + bool isGreekSmallLetter() const { + return 0x3b1 <= m_code && m_code <= 0x3c9; + } private: uint32_t m_code; }; diff --git a/poincare/src/parsing/tokenizer.cpp b/poincare/src/parsing/tokenizer.cpp index 036bd377c..b21a26126 100644 --- a/poincare/src/parsing/tokenizer.cpp +++ b/poincare/src/parsing/tokenizer.cpp @@ -40,8 +40,15 @@ size_t Tokenizer::popWhile(PopTest popTest, CodePoint context) { size_t Tokenizer::popIdentifier() { /* TODO handle combined code points? For now combining code points will - * trigger a syntax error. */ - return popWhile([](CodePoint c, CodePoint context) { return c.isDecimalDigit() || c.isLatinLetter() || c == '_'; }); + * trigger a syntax error. + * This method is used to parse any identifier, reserved or custom, or even + * unit symbols. + * Exceptionally π is always parsed separately so that the user may for + * instance input '2πx' without any danger. 
+ */ + return popWhile([](CodePoint c, CodePoint context) { + return c.isDecimalDigit() || c.isLatinLetter() || c == '_' || c.isGreekCapitalLetter() || (c.isGreekSmallLetter() && c != UCodePointGreekSmallLetterPi); + }); } size_t Tokenizer::popDigits() { @@ -134,12 +141,34 @@ Token Tokenizer::popToken() { if (!nextCodePointIsNeitherDotNorDigit) { return popNumber(); } + if (c == UCodePointGreekSmallLetterPi || + c == UCodePointMathematicalBoldSmallI || + c == UCodePointScriptSmallE) + { + Token result(Token::Constant); + result.setCodePoint(c); + return result; + } if (c == '_') { + /* For now, unit symbols must be prefixed with an underscore. Otherwise, + * common custom identifiers would be systematically parsed as units (for + * instance, A and g). + * TODO The Context of the Parser might be used to decide whether a symbol + * like 'A' should be parsed as a custom identifier, if 'A' already exists in + * the context, or as a unit if not. + * + * Besides, unit symbols may contain Greek letters like μ and Ω. Since there + * is no particular reason to parse unit symbols differently from any other + * reserved or custom identifier, popIdentifier is called in both cases. 
+ */ Token result(Token::Unit); result.setString(start + 1, popIdentifier()); return result; } - if (c.isLatinLetter()) { + if (c.isLatinLetter() || + c.isGreekCapitalLetter() || + c.isGreekSmallLetter()) // Greek small letter pi is matched earlier + { Token result(Token::Identifier); result.setString(start, 1 + popIdentifier()); // We already popped 1 code point return result; @@ -195,17 +224,7 @@ Token Tokenizer::popToken() { if (c == '}') { return Token(Token::RightBrace); } - if (c == UCodePointGreekSmallLetterPi - || c == UCodePointMathematicalBoldSmallI - || c == UCodePointScriptSmallE) - { - Token result(Token::Constant); - result.setCodePoint(c); - return result; - } - if (c == UCodePointSquareRoot - || c == UCodePointGreekSmallLetterTheta) - { + if (c == UCodePointSquareRoot) { Token result(Token::Identifier); result.setString(start, UTF8Decoder::CharSizeOfCodePoint(c)); return result;