[poincare] Complete and clean Parser's Tokenizer

This commit is contained in:
Ruben Dashyan
2018-11-05 17:47:02 +01:00
committed by Émilie Feral
parent 5f734c3f4c
commit c97996696e
2 changed files with 119 additions and 90 deletions

View File

@@ -1,145 +1,168 @@
#include "tokenizer.h"
#include <ion/charset.h>
#include <poincare/empty_expression.h>
#include <poincare/number.h>
#include <poincare/symbol.h>
namespace Poincare {
bool Tokenizer::canPopChar(char c) {
if (currentChar() == c) {
popChar();
static inline bool isLetter(const char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
}
static inline bool isDigit(const char c) {
return '0' <= c && c <= '9';
}
const char Tokenizer::popChar() {
const char nextChar = *m_nextCharP;
m_nextCharP++;
return nextChar;
// Note that, after returning, m_nextCharP points to the character after nextChar.
}
bool Tokenizer::canPopChar (const char c) {
if (*m_nextCharP == c) {
m_nextCharP++;
return true;
}
return false;
}
size_t Tokenizer::popIdentifier() {
// Since this method is only called by popToken,
// currentChar is necessary a letter.
size_t length = 1;
char nextChar = *m_nextCharP;
while (isLetter(nextChar) || isDigit(nextChar) || nextChar == '_') {
length++;
nextChar = *++m_nextCharP;
}
return length;
}
size_t Tokenizer::popDigits() {
size_t length = 0;
char c = currentChar();
while (c >= '0' && c <= '9') {
while (isDigit(*m_nextCharP)) {
length++;
c = popChar();
m_nextCharP++;
}
return length;
}
Token Tokenizer::popNumber() {
const char * integralPartText = m_text;
/* This method is only called by popToken, after popping a dot or a digit.
* Hence one needs to get one character back. */
m_nextCharP--;
const char * integralPartText = m_nextCharP;
size_t integralPartLength = popDigits();
const char * decimalPartText = m_text;
size_t decimalPartLength = 0;
const char * fractionalPartText = m_nextCharP;
size_t fractionalPartLength = 0;
if (canPopChar('.')) {
decimalPartText = m_text;
decimalPartLength = popDigits();
fractionalPartText = m_nextCharP;
fractionalPartLength = popDigits();
}
if (integralPartLength == 0 && decimalPartLength == 0) {
return Token(Token::Type::Undefined);
if (integralPartLength == 0 && fractionalPartLength == 0) {
return Token(Token::Undefined);
}
const char * exponentPartText = m_text;
const char * exponentPartText = m_nextCharP;
size_t exponentPartLength = 0;
bool exponentIsNegative = false;
if (canPopChar(Ion::Charset::Exponent)) {
exponentIsNegative = canPopChar('-');
exponentPartText = m_text;
exponentPartText = m_nextCharP;
exponentPartLength = popDigits();
if (exponentPartLength == 0) {
return Token(Token::Type::Undefined);
return Token(Token::Undefined);
}
}
Token result(Token::Type::Number);
result.setExpression(Number::ParseNumber(integralPartText, integralPartLength, decimalPartText, decimalPartLength, exponentIsNegative, exponentPartText, exponentPartLength));
return result;
}
static inline bool isLetter(char c) {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
Token Tokenizer::popSymbol() {
const char * text = m_text;
int length = 1;
char c = popChar();
while (isLetter(c) || (c >= '0' && c <= '9') || c == '_') {
length++;
c = popChar();
}
Token result(Token::Type::Symbol);
result.setExpression(Symbol(text, length));
Token result(Token::Number);
result.setExpression(Number::ParseNumber(integralPartText, integralPartLength, fractionalPartText, fractionalPartLength, exponentIsNegative, exponentPartText, exponentPartLength));
return result;
}
Token Tokenizer::popToken() {
while (canPopChar(' ')){};
const char c = currentChar();
if ((c == '.') || (c >= '0' && c <= '9')) {
// Skip whitespaces
while (canPopChar(' ')) {}
// Save for later use (since m_nextCharP is altered by popChar, popNumber, popIdentifier).
const char * start = m_nextCharP;
const char currentChar = popChar();
// According to currentChar, recognize the Token::Type.
if (currentChar == '.' || isDigit(currentChar)) {
return popNumber();
}
if (isLetter(c)) {
return popSymbol();
if (isLetter(currentChar)) {
Token result(Token::Identifier);
result.setString(start, popIdentifier());
return result;
}
if (c >= '(' && c <= '/' && c != '.') {
const Token::Type typeForChar[] = {
Token::Type::LeftParenthesis,
Token::Type::RightParenthesis,
Token::Type::Times,
Token::Type::Plus,
Token::Type::Comma,
Token::Type::Minus,
Token::Type::Undefined,
Token::Type::Slash,
if ('(' <= currentChar && currentChar <= '/') {
// Those characters form a contiguous range in the ascii character set,
// so one can make searching faster with this lookup table.
constexpr Token::Type typeForChar[] = {
Token::LeftParenthesis,
Token::RightParenthesis,
Token::Times,
Token::Plus,
Token::Comma,
Token::Minus,
Token::Undefined,
Token::Slash
};
popChar();
return Token(typeForChar[c - '(']);
// The dot character is the second last of that range,
// but it is matched before (with popNumber).
assert(currentChar != '.');
return Token(typeForChar[currentChar - '(']);
}
if (canPopChar(Ion::Charset::MultiplicationSign) || canPopChar(Ion::Charset::MiddleDot)) {
return Token(Token::Type::Times);
if (currentChar == Ion::Charset::MultiplicationSign || currentChar == Ion::Charset::MiddleDot) {
return Token(Token::Times);
}
if (canPopChar('!')) {
return Token(Token::Type::Bang);
if (currentChar == '!') {
return Token(Token::Bang);
}
if (canPopChar('=')) {
return Token(Token::Type::Equal);
if (currentChar == '=') {
return Token(Token::Equal);
}
if (canPopChar('[')) {
return Token(Token::Type::LeftBracket);
if (currentChar == '[') {
return Token(Token::LeftBracket);
}
if (canPopChar(']')) {
return Token(Token::Type::RightBracket);
if (currentChar == ']') {
return Token(Token::RightBracket);
}
if (canPopChar('^')) {
return Token(Token::Type::Caret);
if (currentChar == '^') {
return Token(Token::Caret);
}
if (canPopChar('{')) {
return Token(Token::Type::LeftBrace);
if (currentChar == '{') {
return Token(Token::LeftBrace);
}
if (canPopChar('}')) {
return Token(Token::Type::RightBrace);
if (currentChar == '}') {
return Token(Token::RightBrace);
}
if (c == Ion::Charset::SmallPi || c == Ion::Charset::IComplex || c == Ion::Charset::Exponential || c == Ion::Charset::Root) {
Token result(Token::Type::Symbol);
result.setExpression(Symbol(m_text,1));
popChar();
if (currentChar == Ion::Charset::SmallPi || currentChar == Ion::Charset::IComplex || currentChar == Ion::Charset::Exponential) {
Token result(Token::Constant);
result.setString(start, 1);
return result;
}
if (canPopChar(Ion::Charset::Empty)) {
Token result(Token::Type::Undefined); //FIXME
result.setExpression(EmptyExpression());
if (currentChar == Ion::Charset::Root) {
Token result(Token::Identifier);
result.setString(start, 1);
return result;
}
if (canPopChar(Ion::Charset::Sto)) {
return Token(Token::Type::Store);
if (currentChar == Ion::Charset::Empty) {
return Token(Token::Empty);
}
if (canPopChar(0)) {
return Token(Token::Type::EndOfStream);
if (currentChar == Ion::Charset::Sto) {
return Token(Token::Store);
}
return Token(Token::Type::Undefined);
if (currentChar == 0) {
return Token(Token::EndOfStream);
}
return Token(Token::Undefined);
}
}

View File

@@ -1,23 +1,29 @@
#ifndef POINCARE_PARSING_TOKENIZER_H
#define POINCARE_PARSING_TOKENIZER_H
/* In order to parse a text input into an Expression,
* (an instance of) the Tokenizer reads the successive
* characters of the input, pops the Tokens it recognizes,
* which are then consumed by the Parser.
* For each Token, the Tokenizer determines a Type and
* may save other relevant data intended for the Parser. */
#include "token.h"
namespace Poincare {
class Tokenizer {
public:
Tokenizer(const char * text) : m_text(text) {};
Tokenizer(const char * text) : m_nextCharP(text) {};
Token popToken();
private:
const char popChar() { return *++m_text; }
const char currentChar() const { return *m_text; }
bool canPopChar(char c);
const char popChar();
bool canPopChar(const char c);
size_t popDigits();
size_t popIdentifier();
Token popNumber();
Token popSymbol();
const char * m_text;
const char * m_nextCharP;
};
}