From 46f2cc29dcaeb8b90080a71e6b67b6a69b3f2d32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A9a=20Saviot?= <lea.saviot@numworks.com>
Date: Thu, 10 Jan 2019 15:38:36 +0100
Subject: [PATCH] [poincare/parser] Use the UTF8Decoder in the parser

---
 poincare/src/parsing/tokenizer.cpp | 95 ++++++++++++++++++------------
 poincare/src/parsing/tokenizer.h   |  7 ++-
 2 files changed, 63 insertions(+), 39 deletions(-)

diff --git a/poincare/src/parsing/tokenizer.cpp b/poincare/src/parsing/tokenizer.cpp
index 00698188a..bd2c8e823 100644
--- a/poincare/src/parsing/tokenizer.cpp
+++ b/poincare/src/parsing/tokenizer.cpp
@@ -1,6 +1,7 @@
 #include "tokenizer.h"
 #include <ion/charset.h>
 #include <poincare/number.h>
+#include <kandinsky/unicode/utf8decoder.h>
 
 namespace Poincare {
 
@@ -12,54 +13,70 @@ static inline bool isDigit(const char c) {
   return '0' <= c && c <= '9';
 }
 
+const char Tokenizer::nextChar(PopTest popTest, char context, bool * testResult) {
+  // Pop the next char if popTest(c, context) accepts it. Chars may span more than one byte: use the UTF8Decoder.
+  UTF8Decoder decoder(m_text);
+  Codepoint firstCodepoint = decoder.nextCodepoint();
+  int numberOfBytesForChar = 1; // NOTE(review): starts at 1 whatever the encoded size of firstCodepoint - confirm
+  if (firstCodepoint != Null) {
+    Codepoint codepoint = decoder.nextCodepoint();
+    while (codepoint.isCombining()) {
+      numberOfBytesForChar++; // NOTE(review): adds 1 per combining code point, not per byte - confirm this matches the encoding
+      codepoint = decoder.nextCodepoint();
+    }
+  }
+  char c = *m_text; // Only the first byte is tested and returned. TODO handle combined chars?
+  bool shouldPop = popTest(c, context);
+  if (testResult != nullptr) { // testResult is optional: it reports whether the char was popped
+    *testResult = shouldPop;
+  }
+  if (shouldPop) {
+    m_text+= numberOfBytesForChar; // m_text only advances when the test passed
+  }
+  return c; // c is returned whether or not it was popped
+}
+
 const char Tokenizer::popChar() {
-  const char nextChar = *m_nextCharP;
-  m_nextCharP++;
-  return nextChar;
-  // After returning, m_nextCharP points to the character after nextChar.
+  return nextChar([](char c, char context) { return true; });
+  // m_text now points to the start of the character after the returned char.
 }
 
 bool Tokenizer::canPopChar (const char c) {
-  if (*m_nextCharP == c) {
-    m_nextCharP++;
-    return true;
+  bool didPop = false;
+  nextChar([](char nextC, char context) { return nextC == context; }, c, &didPop);
+  return didPop;
+}
+
+size_t Tokenizer::popWhile(PopTest popTest, char context) {
+  size_t length = 0;
+  bool didPop = true;
+  while (didPop) {
+    nextChar(popTest, context, &didPop);
+    if (didPop) {
+      length++;
+    }
   }
-  return false;
+  return length;
 }
 
 size_t Tokenizer::popIdentifier() {
-  /* Since this method is only called by popToken, currentChar is necessarily a
-   * letter. */
-  size_t length = 1;
-  char nextChar = *m_nextCharP;
-  while (isLetter(nextChar) || isDigit(nextChar) || nextChar == '_') {
-    length++;
-    nextChar = *++m_nextCharP;
-  }
-  return length;
+  return popWhile([](char c, char context) { return isLetter(c) || isDigit(c) || c == context; }, '_');
 }
 
 size_t Tokenizer::popDigits() {
-  size_t length = 0;
-  while (isDigit(*m_nextCharP)) {
-    length++;
-    m_nextCharP++;
-  }
-  return length;
+  return popWhile([](char c, char context) { return isDigit(c); });
 }
 
 Token Tokenizer::popNumber() {
-  /* This method is only called by popToken, after popping a dot or a digit.
-   * Hence the need to get one character back. */
-  m_nextCharP--;
-
-  const char * integralPartText = m_nextCharP;
+  const char * integralPartText = m_text;
   size_t integralPartLength = popDigits();
 
-  const char * fractionalPartText = m_nextCharP;
+  const char * fractionalPartText = m_text;
   size_t fractionalPartLength = 0;
+
+  assert(integralPartLength > 0 || *m_text == '.');
   if (canPopChar('.')) {
-    fractionalPartText = m_nextCharP;
+    fractionalPartText = m_text;
     fractionalPartLength = popDigits();
   }
 
@@ -67,12 +84,12 @@ Token Tokenizer::popNumber() {
     return Token(Token::Undefined);
   }
 
-  const char * exponentPartText = m_nextCharP;
+  const char * exponentPartText = m_text;
   size_t exponentPartLength = 0;
   bool exponentIsNegative = false;
   if (canPopChar(Ion::Charset::Exponent)) {
     exponentIsNegative = canPopChar('-');
-    exponentPartText = m_nextCharP;
+    exponentPartText = m_text;
     exponentPartLength = popDigits();
     if (exponentPartLength == 0) {
       return Token(Token::Undefined);
@@ -88,18 +105,22 @@ Token Tokenizer::popToken() {
   // Skip whitespaces
   while (canPopChar(' ')) {}
 
-  /* Save for later use (since m_nextCharP is altered by popChar, popNumber,
+  /* Save for later use (since m_text is altered by popChar, popNumber,
    * popIdentifier). */
-  const char * start = m_nextCharP;
+  const char * start = m_text;
+
+  /* If the next char is the start of a number, we do not want to pop it because
+   * popNumber needs this char. */
+  bool nextCharIsNeitherDotNorDigit = true;
+  const char currentChar = nextChar([](char c, char context) { return c != context && !isDigit(c); }, '.', &nextCharIsNeitherDotNorDigit);
 
-  const char currentChar = popChar();
   // According to currentChar, recognize the Token::Type.
-  if (currentChar == '.' || isDigit(currentChar)) {
+  if (!nextCharIsNeitherDotNorDigit) {
     return popNumber();
   }
   if (isLetter(currentChar)) {
     Token result(Token::Identifier);
-    result.setString(start, popIdentifier());
+    result.setString(start, 1 + popIdentifier()); // We already popped 1 char
     return result;
   }
   if ('(' <= currentChar && currentChar <= '/') {
diff --git a/poincare/src/parsing/tokenizer.h b/poincare/src/parsing/tokenizer.h
index 05e2017d9..b75d6e863 100644
--- a/poincare/src/parsing/tokenizer.h
+++ b/poincare/src/parsing/tokenizer.h
@@ -13,16 +13,19 @@ namespace Poincare {
 
 class Tokenizer {
 public:
-  Tokenizer(const char * text) : m_nextCharP(text) {}
+  Tokenizer(const char * text) : m_text(text) {}
   Token popToken();
 private:
+  typedef bool (*PopTest)(char c, char context);
+  const char nextChar(PopTest popTest, char context = 0, bool * testResult = nullptr);
   const char popChar();
   bool canPopChar(const char c);
+  size_t popWhile(PopTest popTest, char context = 0);
   size_t popDigits();
   size_t popIdentifier();
   Token popNumber();
 
-  const char * m_nextCharP;
+  const char * m_text;
 };
 
 }