From 46f2cc29dcaeb8b90080a71e6b67b6a69b3f2d32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9a=20Saviot?= Date: Thu, 10 Jan 2019 15:38:36 +0100 Subject: [PATCH] [poincare/parser] Use the UTF8Decoder in the parser --- poincare/src/parsing/tokenizer.cpp | 95 ++++++++++++++++++------------ poincare/src/parsing/tokenizer.h | 7 ++- 2 files changed, 63 insertions(+), 39 deletions(-) diff --git a/poincare/src/parsing/tokenizer.cpp b/poincare/src/parsing/tokenizer.cpp index 00698188a..bd2c8e823 100644 --- a/poincare/src/parsing/tokenizer.cpp +++ b/poincare/src/parsing/tokenizer.cpp @@ -1,6 +1,7 @@ #include "tokenizer.h" #include #include +#include namespace Poincare { @@ -12,54 +13,70 @@ static inline bool isDigit(const char c) { return '0' <= c && c <= '9'; } +const char Tokenizer::nextChar(PopTest popTest, char context, bool * testResult) { + // Beware of chars spanning over more than one byte: use the UTF8Decoder. + UTF8Decoder decoder(m_text); + Codepoint firstCodepoint = decoder.nextCodepoint(); + int numberOfBytesForChar = 1; + if (firstCodepoint != Null) { + Codepoint codepoint = decoder.nextCodepoint(); + while (codepoint.isCombining()) { + numberOfBytesForChar++; + codepoint = decoder.nextCodepoint(); + } + } + char c = *m_text; // TODO handle combined chars? NOTE(review): numberOfBytesForChar is bumped once per combining codepoint, not per byte - confirm the advance is right for multi-byte chars. + bool shouldPop = popTest(c, context); + if (testResult != nullptr) { + *testResult = shouldPop; + } + if (shouldPop) { + m_text+= numberOfBytesForChar; + } + return c; +} + const char Tokenizer::popChar() { - const char nextChar = *m_nextCharP; - m_nextCharP++; - return nextChar; - // After returning, m_nextCharP points to the character after nextChar. + return nextChar([](char c, char context) { return true; }); + // m_text now points to the start of the character after the returned char. 
} bool Tokenizer::canPopChar (const char c) { - if (*m_nextCharP == c) { - m_nextCharP++; - return true; + bool didPop = false; + nextChar([](char nextC, char context) { return nextC == context; }, c, &didPop); + return didPop; +} + +size_t Tokenizer::popWhile(PopTest popTest, char context) { + size_t length = 0; + bool didPop = true; + while (didPop) { + nextChar(popTest, context, &didPop); + if (didPop) { + length++; + } } - return false; + return length; } size_t Tokenizer::popIdentifier() { - /* Since this method is only called by popToken, currentChar is necessarily a - * letter. */ - size_t length = 1; - char nextChar = *m_nextCharP; - while (isLetter(nextChar) || isDigit(nextChar) || nextChar == '_') { - length++; - nextChar = *++m_nextCharP; - } - return length; + return popWhile([](char c, char context) { return isLetter(c) || isDigit(c) || c == context; }, '_'); } size_t Tokenizer::popDigits() { - size_t length = 0; - while (isDigit(*m_nextCharP)) { - length++; - m_nextCharP++; - } - return length; + return popWhile([](char c, char context) { return isDigit(c); }); } Token Tokenizer::popNumber() { - /* This method is only called by popToken, after popping a dot or a digit. - * Hence the need to get one character back. 
*/ - m_nextCharP--; - - const char * integralPartText = m_nextCharP; + const char * integralPartText = m_text; size_t integralPartLength = popDigits(); - const char * fractionalPartText = m_nextCharP; + const char * fractionalPartText = m_text; size_t fractionalPartLength = 0; + + assert(integralPartLength > 0 || *m_text == '.'); if (canPopChar('.')) { - fractionalPartText = m_nextCharP; + fractionalPartText = m_text; fractionalPartLength = popDigits(); } @@ -67,12 +84,12 @@ Token Tokenizer::popNumber() { return Token(Token::Undefined); } - const char * exponentPartText = m_nextCharP; + const char * exponentPartText = m_text; size_t exponentPartLength = 0; bool exponentIsNegative = false; if (canPopChar(Ion::Charset::Exponent)) { exponentIsNegative = canPopChar('-'); - exponentPartText = m_nextCharP; + exponentPartText = m_text; exponentPartLength = popDigits(); if (exponentPartLength == 0) { return Token(Token::Undefined); @@ -88,18 +105,22 @@ Token Tokenizer::popToken() { // Skip whitespaces while (canPopChar(' ')) {} - /* Save for later use (since m_nextCharP is altered by popChar, popNumber, + /* Save for later use (since m_text is altered by popChar, popNumber, * popIdentifier). */ - const char * start = m_nextCharP; + const char * start = m_text; + + /* If the next char is the start of a number, we do not want to pop it because + * popNumber needs this char. */ + bool nextCharIsNeitherDotNorDigit = true; + const char currentChar = nextChar([](char c, char context) { return c != context && !isDigit(c); }, '.', &nextCharIsNeitherDotNorDigit); - const char currentChar = popChar(); // According to currentChar, recognize the Token::Type. - if (currentChar == '.' 
|| isDigit(currentChar)) { + if (!nextCharIsNeitherDotNorDigit) { return popNumber(); } if (isLetter(currentChar)) { Token result(Token::Identifier); - result.setString(start, popIdentifier()); + result.setString(start, 1 + popIdentifier()); // We already popped 1 char return result; } if ('(' <= currentChar && currentChar <= '/') { diff --git a/poincare/src/parsing/tokenizer.h b/poincare/src/parsing/tokenizer.h index 05e2017d9..b75d6e863 100644 --- a/poincare/src/parsing/tokenizer.h +++ b/poincare/src/parsing/tokenizer.h @@ -13,16 +13,19 @@ namespace Poincare { class Tokenizer { public: - Tokenizer(const char * text) : m_nextCharP(text) {} + Tokenizer(const char * text) : m_text(text) {} Token popToken(); private: + typedef bool (*PopTest)(char c, char context); + const char nextChar(PopTest popTest, char context = 0, bool * testResult = nullptr); const char popChar(); bool canPopChar(const char c); + size_t popWhile(PopTest popTest, char context = 0); size_t popDigits(); size_t popIdentifier(); Token popNumber(); - const char * m_nextCharP; + const char * m_text; }; }