[poincare/parser] Use the UTF8Decoder in the parser

This commit is contained in:
Léa Saviot
2019-01-10 15:38:36 +01:00
committed by Émilie Feral
parent 6380ba6cea
commit 46f2cc29dc
2 changed files with 63 additions and 39 deletions

View File

@@ -1,6 +1,7 @@
#include "tokenizer.h"
#include <ion/charset.h>
#include <poincare/number.h>
#include <kandinsky/unicode/utf8decoder.h>
namespace Poincare {
@@ -12,54 +13,70 @@ static inline bool isDigit(const char c) {
return '0' <= c && c <= '9';
}
const char Tokenizer::nextChar(PopTest popTest, char context, bool * testResult) {
// Beware of chars spaning over more than one byte: use the UTF8Decoder.
UTF8Decoder decoder(m_text);
Codepoint firstCodepoint = decoder.nextCodepoint();
int numberOfBytesForChar = 1;
if (firstCodepoint != Null) {
Codepoint codepoint = decoder.nextCodepoint();
while (codepoint.isCombining()) {
numberOfBytesForChar++;
codepoint = decoder.nextCodepoint();
}
}
char c = *m_text; // TODO handle combined chars?
bool shouldPop = popTest(c, context);
if (testResult != nullptr) {
*testResult = shouldPop;
}
if (shouldPop) {
m_text+= numberOfBytesForChar;
}
return c;
}
const char Tokenizer::popChar() {
const char nextChar = *m_nextCharP;
m_nextCharP++;
return nextChar;
// After returning, m_nextCharP points to the character after nextChar.
return nextChar([](char c, char context) { return true; });
// m_text now points to the start of the character after the returned char.
}
bool Tokenizer::canPopChar (const char c) {
if (*m_nextCharP == c) {
m_nextCharP++;
return true;
bool didPop = false;
nextChar([](char nextC, char context) { return nextC == context; }, c, &didPop);
return didPop;
}
size_t Tokenizer::popWhile(PopTest popTest, char context) {
size_t length = 0;
bool didPop = true;
while (didPop) {
nextChar(popTest, context, &didPop);
if (didPop) {
length++;
}
}
return false;
return length;
}
size_t Tokenizer::popIdentifier() {
/* Since this method is only called by popToken, currentChar is necessarily a
* letter. */
size_t length = 1;
char nextChar = *m_nextCharP;
while (isLetter(nextChar) || isDigit(nextChar) || nextChar == '_') {
length++;
nextChar = *++m_nextCharP;
}
return length;
return popWhile([](char c, char context) { return isLetter(c) || isDigit(c) || c == context; }, '_');
}
size_t Tokenizer::popDigits() {
size_t length = 0;
while (isDigit(*m_nextCharP)) {
length++;
m_nextCharP++;
}
return length;
return popWhile([](char c, char context) { return isDigit(c); });
}
Token Tokenizer::popNumber() {
/* This method is only called by popToken, after popping a dot or a digit.
* Hence the need to get one character back. */
m_nextCharP--;
const char * integralPartText = m_nextCharP;
const char * integralPartText = m_text;
size_t integralPartLength = popDigits();
const char * fractionalPartText = m_nextCharP;
const char * fractionalPartText = m_text;
size_t fractionalPartLength = 0;
assert(integralPartLength > 0 || *m_text == '.');
if (canPopChar('.')) {
fractionalPartText = m_nextCharP;
fractionalPartText = m_text;
fractionalPartLength = popDigits();
}
@@ -67,12 +84,12 @@ Token Tokenizer::popNumber() {
return Token(Token::Undefined);
}
const char * exponentPartText = m_nextCharP;
const char * exponentPartText = m_text;
size_t exponentPartLength = 0;
bool exponentIsNegative = false;
if (canPopChar(Ion::Charset::Exponent)) {
exponentIsNegative = canPopChar('-');
exponentPartText = m_nextCharP;
exponentPartText = m_text;
exponentPartLength = popDigits();
if (exponentPartLength == 0) {
return Token(Token::Undefined);
@@ -88,18 +105,22 @@ Token Tokenizer::popToken() {
// Skip whitespaces
while (canPopChar(' ')) {}
/* Save for later use (since m_nextCharP is altered by popChar, popNumber,
/* Save for later use (since m_text is altered by popChar, popNumber,
* popIdentifier). */
const char * start = m_nextCharP;
const char * start = m_text;
/* If the next char is the start of a number, we do not want to pop it because
* popNumber needs this char. */
bool nextCharIsNeitherDotNorDigit = true;
const char currentChar = nextChar([](char c, char context) { return c != context && !isDigit(c); }, '.', &nextCharIsNeitherDotNorDigit);
const char currentChar = popChar();
// According to currentChar, recognize the Token::Type.
if (currentChar == '.' || isDigit(currentChar)) {
if (!nextCharIsNeitherDotNorDigit) {
return popNumber();
}
if (isLetter(currentChar)) {
Token result(Token::Identifier);
result.setString(start, popIdentifier());
result.setString(start, 1 + popIdentifier()); // We already popped 1 char
return result;
}
if ('(' <= currentChar && currentChar <= '/') {

View File

@@ -13,16 +13,19 @@ namespace Poincare {
class Tokenizer {
public:
Tokenizer(const char * text) : m_nextCharP(text) {}
Tokenizer(const char * text) : m_text(text) {}
Token popToken();
private:
typedef bool (*PopTest)(char c, char context);
const char nextChar(PopTest popTest, char context = 0, bool * testResult = nullptr);
const char popChar();
bool canPopChar(const char c);
size_t popWhile(PopTest popTest, char context = 0);
size_t popDigits();
size_t popIdentifier();
Token popNumber();
const char * m_nextCharP;
const char * m_text;
};
}