From 844a28db7b0651eca20cc6319ca40a61d4f25577 Mon Sep 17 00:00:00 2001
From: Ruben Dashyan <ruben.dashyan@numworks.com>
Date: Mon, 27 Jan 2020 16:20:01 +0100
Subject: [PATCH] [poincare/parsing/tokenizer] Token::Identifiers may contain
 Greek letters except pi

---
 ion/include/ion/unicode/code_point.h |  6 ++++
 poincare/src/parsing/tokenizer.cpp   | 47 +++++++++++++++++++---------
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/ion/include/ion/unicode/code_point.h b/ion/include/ion/unicode/code_point.h
index 323a3e405..d05688d55 100644
--- a/ion/include/ion/unicode/code_point.h
+++ b/ion/include/ion/unicode/code_point.h
@@ -30,6 +30,12 @@ public:
   bool isCombining() const {
     return (m_code >= 0x300 && m_code <= 0x036F);
   }
+  bool isGreekCapitalLetter() const { // U+0391..U+03A9, skipping U+03A2
+    return m_code >= 0x391 && m_code <= 0x3a9 && m_code != 0x3a2;
+  }
+  bool isGreekSmallLetter() const { // U+03B1..U+03C9 inclusive
+    return m_code >= 0x3b1 && m_code <= 0x3c9;
+  }
 private:
   uint32_t m_code;
 };
diff --git a/poincare/src/parsing/tokenizer.cpp b/poincare/src/parsing/tokenizer.cpp
index 036bd377c..b21a26126 100644
--- a/poincare/src/parsing/tokenizer.cpp
+++ b/poincare/src/parsing/tokenizer.cpp
@@ -40,8 +40,15 @@ size_t Tokenizer::popWhile(PopTest popTest, CodePoint context) {
 
 size_t Tokenizer::popIdentifier() {
   /* TODO handle combined code points? For now combining code points will
-   * trigger a syntax error. */
-  return popWhile([](CodePoint c, CodePoint context) { return c.isDecimalDigit() || c.isLatinLetter() || c == '_'; });
+   * trigger a syntax error.
+   * Every identifier (reserved or custom) and every unit symbol is popped
+   * here. π is the one Greek letter left out: it is always tokenized on its
+   * own, so that the user may input '2πx' without any danger. */
+  return popWhile([](CodePoint c, CodePoint context) {
+      if (c == UCodePointGreekSmallLetterPi) { return false; }
+      const bool isBasic = c.isDecimalDigit() || c.isLatinLetter() || c == '_';
+      return isBasic || c.isGreekCapitalLetter() || c.isGreekSmallLetter();
+      });
 }
 
 size_t Tokenizer::popDigits() {
@@ -134,12 +141,34 @@ Token Tokenizer::popToken() {
   if (!nextCodePointIsNeitherDotNorDigit) {
     return popNumber();
   }
+  if (c == UCodePointGreekSmallLetterPi ||
+      c == UCodePointMathematicalBoldSmallI ||
+      c == UCodePointScriptSmallE)
+  {
+    Token result(Token::Constant);
+    result.setCodePoint(c);
+    return result;
+  }
   if (c == '_') {
+    /* For now, unit symbols must be prefixed with an underscore. Otherwise,
+     * common custom identifiers would be systematically parsed as units (for
+     * instance, A and g).
+     * TODO The Context of the Parser might be used to decide whether a symbol
+     * such as 'A' should be parsed as a custom identifier, if 'A' already
+     * exists in the context, or as a unit if not.
+     *
+     * Besides, unit symbols may contain Greek letters such as μ and Ω. Since
+     * there is no particular reason to parse unit symbols differently from
+     * any other reserved or custom identifier, popIdentifier is called in
+     * both cases. */
     Token result(Token::Unit);
     result.setString(start + 1, popIdentifier());
     return result;
   }
-  if (c.isLatinLetter()) {
+  if (c.isLatinLetter() ||
+      c.isGreekCapitalLetter() ||
+      c.isGreekSmallLetter()) // Greek small letter pi is matched earlier
+  {
     Token result(Token::Identifier);
     result.setString(start, 1 + popIdentifier()); // We already popped 1 code point
     return result;
@@ -195,17 +224,7 @@ Token Tokenizer::popToken() {
   if (c == '}') {
     return Token(Token::RightBrace);
   }
-  if (c == UCodePointGreekSmallLetterPi
-      || c == UCodePointMathematicalBoldSmallI
-      || c == UCodePointScriptSmallE)
-  {
-    Token result(Token::Constant);
-    result.setCodePoint(c);
-    return result;
-  }
-  if (c == UCodePointSquareRoot
-      || c == UCodePointGreekSmallLetterTheta)
-  {
+  if (c == UCodePointSquareRoot) {
     Token result(Token::Identifier);
     result.setString(start, UTF8Decoder::CharSizeOfCodePoint(c));
     return result;