[unicode] Clean decoder use

2026-01-19 00:37:25 +01:00 · 2019-01-30 11:48:31 +01:00
parent fd407ce3d9
commit 7b5636f298
7 changed files with 22 additions and 25 deletions
--- a/apps/code/python_text_area.cpp
+++ b/apps/code/python_text_area.cpp
@@ -119,22 +119,20 @@ void PythonTextArea::ContentView::drawLine(KDContext * ctx, int line, const char
     * basis. This can work, however the MicroPython lexer won't accept a line
     * starting with a whitespace. So we're discarding leading whitespaces
     * beforehand. */
-    UTF8Decoder decoder(text);
-    const char * p = decoder.stringPosition();
-    CodePoint c = decoder.nextCodePoint();
-    while (p < text + byteLength && c  == ' ') {
-        p = decoder.stringPosition();
-        c = decoder.nextCodePoint();
+    const char * firstNonSpace = UTF8Helper::NotCodePointSearch(text, ' ');
+    if (UTF8Helper::CodePointIs(firstNonSpace, UCodePointNull)) {
+      nlr_pop();
+      return;
    }

-    mp_lexer_t * lex = mp_lexer_new_from_str_len(0, p, byteLength - (p - text), 0);
+    mp_lexer_t * lex = mp_lexer_new_from_str_len(0, firstNonSpace, byteLength - (firstNonSpace - text), 0);
    LOG_DRAW("Pop token %d\n", lex->tok_kind);

-    const char * tokenFrom = p;
+    const char * tokenFrom = firstNonSpace;
    size_t tokenLength = 0;
    while (lex->tok_kind != MP_TOKEN_NEWLINE && lex->tok_kind != MP_TOKEN_END) {

-      tokenFrom = p + lex->tok_column - 1;
+      tokenFrom = firstNonSpace + lex->tok_column - 1;
      tokenLength = TokenLength(lex);
      LOG_DRAW("Draw \"%.*s\" for token %d\n", tokenLength, tokenFrom, lex->tok_kind);
      drawStringAt(ctx, line,
@@ -151,7 +149,7 @@ void PythonTextArea::ContentView::drawLine(KDContext * ctx, int line, const char

    tokenFrom += tokenLength;
    if (tokenFrom < text + byteLength) {
-      LOG_DRAW("Draw comment \"%.*s\" from %d\n", byteLength - (tokenFrom - text), p, tokenFrom);
+      LOG_DRAW("Draw comment \"%.*s\" from %d\n", byteLength - (tokenFrom - text), firstNonSpace, tokenFrom);
      drawStringAt(ctx, line,
          UTF8Helper::GlyphOffsetAtCodePoint(text, tokenFrom),
          tokenFrom,
--- a/escher/src/text_area.cpp
+++ b/escher/src/text_area.cpp
@@ -57,10 +57,10 @@ bool TextArea::handleEvent(Ion::Events::Event event) {
    decoder.previousCodePoint();
    return setCursorLocation(decoder.stringPosition());
  } else if (event == Ion::Events::Right) {
-    if (*cursorLocation() == 0) {
+    if (UTF8Helper::CodePointIs(cursorLocation(), UCodePointNull)) {
      return false;
    }
-    UTF8Decoder decoder(text(), cursorLocation());
+    UTF8Decoder decoder(cursorLocation());
    decoder.nextCodePoint();
    return setCursorLocation(decoder.stringPosition());
  } else if (event == Ion::Events::Up) {
@@ -216,7 +216,9 @@ CodePoint TextArea::Text::removeCodePoint(char * * position) {

 size_t TextArea::Text::removeRemainingLine(const char * location, int direction) {
  assert(m_buffer != nullptr);
-  assert(location >= m_buffer && location < m_buffer + m_bufferSize);
+  assert(location >= m_buffer && location <= m_buffer + m_bufferSize);
+  assert(direction > 0 || location > m_buffer);
+  assert(direction < 0 || location < m_buffer + m_bufferSize);

  UTF8Decoder decoder(m_buffer, location);
  const char * codePointPosition = decoder.stringPosition();
--- a/escher/src/text_field.cpp
+++ b/escher/src/text_field.cpp
@@ -440,7 +440,7 @@ bool TextField::privateHandleMoveEvent(Ion::Events::Event event) {
  }
  if (event == Ion::Events::Right && isEditing() && cursorLocation() < m_contentView.draftTextBuffer() + draftTextLength()) {
    assert(isEditing());
-    UTF8Decoder decoder(m_contentView.draftTextBuffer(), cursorLocation());
+    UTF8Decoder decoder(cursorLocation());
    decoder.nextCodePoint();
    return setCursorLocation(decoder.stringPosition());
  }
--- a/ion/include/ion/unicode/utf8_helper.h
+++ b/ion/include/ion/unicode/utf8_helper.h
@@ -15,7 +15,7 @@ const char * CodePointSearch(const char * s, CodePoint c);

 /* Returns the first occurrence of a code point that is not c in a string,
 * stopping at the null-terminating char or the start of string. */
-const char * NotCodePointSearch(const char * s, CodePoint c, bool goingLeft, const char * initialPosition);
+const char * NotCodePointSearch(const char * s, CodePoint c, bool goingLeft = false, const char * initialPosition = nullptr);

 /* Copy src into dst while removing all code points c. Also update an index
 * that should be lower if code points were removed before it. Ensure null-
--- a/ion/src/shared/unicode/utf8_decoder.cpp
+++ b/ion/src/shared/unicode/utf8_decoder.cpp
@@ -4,6 +4,7 @@
 static inline int leading_ones(uint8_t value) {
  for (int i=0; i<8; i++) {
    if (!(value & 0x80)) {
+      assert(i <= 4);
      return i;
    }
    value = value << 1;
@@ -17,7 +18,7 @@ static inline uint8_t last_k_bits(uint8_t value, uint8_t bits) {
 }

 CodePoint UTF8Decoder::nextCodePoint() {
-  assert(m_stringPosition == m_stringPosition || *(m_stringPosition - 1) != 0);
+  assert(m_stringPosition == m_string || *(m_stringPosition - 1) != 0);
  int leadingOnes = leading_ones(*m_stringPosition);
  uint32_t result = last_k_bits(*m_stringPosition++, 8-leadingOnes-1);
  for (int i = 0; i < leadingOnes - 1; i++) {
--- a/ion/src/shared/unicode/utf8_helper.cpp
+++ b/ion/src/shared/unicode/utf8_helper.cpp
@@ -9,6 +9,7 @@ static inline int minInt(int x, int y) { return x < y ? x : y; }
 static inline size_t minSizeT(size_t x, size_t y) { return x < y ? x : y; }

 int CountOccurrences(const char * s, CodePoint c) {
+  assert(c != UCodePointNull);
  int count = 0;
  if (UTF8Decoder::CharSizeOfCodePoint(c) == 1) {
    /* The code point is one char long, so it is equal to its char translation.
@@ -55,6 +56,7 @@ const char * CodePointSearch(const char * s, CodePoint c) {
 }

 const char * NotCodePointSearch(const char * s, CodePoint c, bool goingLeft, const char * initialPosition) {
+  // TODO LEA: optimize for one byte long c?
  if (goingLeft) {
    assert(initialPosition != nullptr);
    if (initialPosition == s) {
--- a/poincare/src/parsing/tokenizer.cpp
+++ b/poincare/src/parsing/tokenizer.cpp
@@ -15,19 +15,13 @@ static inline bool isDigit(const CodePoint c) {

 const CodePoint Tokenizer::nextCodePoint(PopTest popTest, CodePoint context, bool * testResult) {
  UTF8Decoder decoder(m_text);
-  const char * currentPointer = m_text;
  CodePoint firstCodePoint = decoder.nextCodePoint();
-  const char * nextPointer = decoder.stringPosition();
-  size_t numberOfBytesForCodePoint = nextPointer - currentPointer;
+  const char * nextTextPosition = decoder.stringPosition();
  if (firstCodePoint != UCodePointNull) {
-    currentPointer = nextPointer;
    CodePoint codePoint = decoder.nextCodePoint();
-    nextPointer = decoder.stringPosition();
    while (codePoint.isCombining()) {
-      numberOfBytesForCodePoint+= nextPointer - currentPointer;
-      currentPointer = nextPointer;
+      nextTextPosition = decoder.stringPosition();
      codePoint = decoder.nextCodePoint();
-      nextPointer = decoder.stringPosition();
    }
  }
  // TODO handle combined code points? For now the combining codepoints get dropped.
@@ -36,7 +30,7 @@ const CodePoint Tokenizer::nextCodePoint(PopTest popTest, CodePoint context, boo
    *testResult = shouldPop;
  }
  if (shouldPop) {
-    m_text+= numberOfBytesForCodePoint;
+    m_text = nextTextPosition;
  }
  return firstCodePoint;
 }