[poincare] Clean parsing with unicodes

2026-03-26 17:20:53 +01:00 · 2019-01-18 09:41:07 +01:00
parent 780579265f
commit 217bbf024e
2 changed files with 26 additions and 13 deletions
--- a/poincare/src/parsing/parser.cpp
+++ b/poincare/src/parsing/parser.cpp
@@ -361,12 +361,14 @@ void Parser::parseSequence(Expression & leftHandSide, const char name, Token::Ty
    if (m_status != Status::Progress) {
    } else if (!popTokenIfType(rightDelimiter)) {
      m_status = Status::Error; // Right delimiter missing.
-    } else if (rank.isIdenticalTo(Symbol::Builder("n",1))) {
-      char sym[5] = {name, '(', 'n', ')', 0};
-      leftHandSide = Symbol::Builder(sym, 4);
-    } else if (rank.isIdenticalTo(Addition::Builder(Symbol::Builder("n",1),Rational::Builder("1")))) {
-      char sym[7] = {name, '(', 'n', '+', '1', ')', 0};
-      leftHandSide = Symbol::Builder(sym, 6);
+    } else if (rank.isIdenticalTo(Symbol::Builder('n'))) {
+      constexpr int symbolNameSize = 5;
+      char sym[symbolNameSize] = {name, '(', 'n', ')', 0};
+      leftHandSide = Symbol::Builder(sym, symbolNameSize);
+    } else if (rank.isIdenticalTo(Addition::Builder(Symbol::Builder('n'), Rational::Builder("1")))) {
+      constexpr int symbolNameSize = 7;
+      char sym[symbolNameSize] = {name, '(', 'n', '+', '1', ')', 0};
+      leftHandSide = Symbol::Builder(sym, symbolNameSize);
    } else {
      m_status = Status::Error; // Unexpected parameter.
    }
@@ -383,8 +385,12 @@ void Parser::parseSpecialIdentifier(Expression & leftHandSide) {
  } else if (m_currentToken.compareTo(Unreal::Name()) == 0) {
    leftHandSide = Unreal::Builder();
  } else if (m_currentToken.compareTo("u_") == 0 || m_currentToken.compareTo("v_") == 0) { // Special case for sequences (e.g. "u_{n}")
+    /* We know that m_currentToken.text()[0] is either 'u' or 'v', so we do not
+     * need to pass a code point to parseSequence. */
    parseSequence(leftHandSide, m_currentToken.text()[0], Token::LeftBrace, Token::RightBrace);
  } else if (m_currentToken.compareTo("u") == 0 || m_currentToken.compareTo("v") == 0) { // Special case for sequences (e.g. "u(n)")
+    /* We know that m_currentToken.text()[0] is either 'u' or 'v', so we do not
+     * need to pass a code point to parseSequence. */
    parseSequence(leftHandSide, m_currentToken.text()[0], Token::LeftParenthesis, Token::RightParenthesis);
  } else if (m_currentToken.compareTo("log_") == 0) { // Special case for the log function (e.g. "log_{2}(8)")
    if (!popTokenIfType(Token::LeftBrace)) {
@@ -426,7 +432,7 @@ void Parser::parseCustomIdentifier(Expression & leftHandSide, const char * name,
    return;
  }
  parameter = parameter.childAtIndex(0);
-  if (parameter.type() == ExpressionNode::Type::Symbol && strncmp(static_cast<SymbolAbstract&>(parameter).name(),name, length) == 0) {
+  if (parameter.type() == ExpressionNode::Type::Symbol && strncmp(static_cast<SymbolAbstract&>(parameter).name(), name, length) == 0) {
    m_status = Status::Error; // Function and variable must have distinct names.
  } else if (!popTokenIfType(Token::RightParenthesis)) {
    m_status = Status::Error; // Right parenthesis missing.
@@ -487,8 +493,7 @@ void Parser::parseMatrix(Expression & leftHandSide, Token::Type stoppingType) {
      return;
    }
    if ((numberOfRows == 0 && (numberOfColumns = row.numberOfChildren()) == 0)
-        ||
-        (numberOfColumns != row.numberOfChildren())) {
+        || (numberOfColumns != row.numberOfChildren())) {
      m_status = Status::Error; // Incorrect matrix.
      return;
    } else {
--- a/poincare/src/parsing/tokenizer.cpp
+++ b/poincare/src/parsing/tokenizer.cpp
@@ -14,16 +14,22 @@ static inline bool isDigit(const CodePoint c) {

 const CodePoint Tokenizer::nextCodePoint(PopTest popTest, CodePoint context, bool * testResult) {
  UTF8Decoder decoder(m_text);
+  const char * currentPointer = m_text;
+  const char * nextPointer = decoder.nextCodePointPointer();
  CodePoint firstCodePoint = decoder.nextCodePoint();
-  size_t numberOfBytesForCodePoint = UTF8Decoder::CharSizeOfCodePoint(firstCodePoint);
+  size_t numberOfBytesForCodePoint = nextPointer - currentPointer;
  if (firstCodePoint != KDCodePointNull) {
+    currentPointer = nextPointer;
+    nextPointer = decoder.nextCodePointPointer();
    CodePoint codePoint = decoder.nextCodePoint();
    while (codePoint.isCombining()) {
-      numberOfBytesForCodePoint+= UTF8Decoder::CharSizeOfCodePoint(codePoint);
+      numberOfBytesForCodePoint+= nextPointer - currentPointer;
+      currentPointer = nextPointer;
+      nextPointer = decoder.nextCodePointPointer();
      codePoint = decoder.nextCodePoint();
    }
  }
-  // TODO handle combined code points?
+  // TODO handle combined code points? For now the combining code points get dropped.
  bool shouldPop = popTest(firstCodePoint, context);
  if (testResult != nullptr) {
    *testResult = shouldPop;
@@ -181,7 +187,9 @@ Token Tokenizer::popToken() {
  if (c == KDCodePointSquareRoot) {
    Token result(Token::Identifier);
    // TODO compute size manually?
-    result.setString(start, UTF8Decoder::CharSizeOfCodePoint(KDCodePointSquareRoot));
+    constexpr int squareRootCharLength = 3;
+    assert(UTF8Decoder::CharSizeOfCodePoint(KDCodePointSquareRoot) == squareRootCharLength);
+    result.setString(start, squareRootCharLength);
    return result;
  }
  if (c == KDCodePointEmpty) {