[escher/ion] Standard methods in UTF8Helper used in text_area

String manipulations need to be done using an UTF8 decoder, but some manipulations can be optimized if the code points we manipulate are only one char long. These optimizations are done inside UTF8Helper methods.
2026-03-28 18:20:14 +01:00 · 2019-01-24 11:06:07 +01:00
parent e6051fae42
commit aac78ca32a
6 changed files with 153 additions and 52 deletions
--- a/ion/src/shared/unicode/utf8_helper.cpp
+++ b/ion/src/shared/unicode/utf8_helper.cpp
@@ -34,6 +34,13 @@ int CountOccurrences(const char * s, CodePoint c) {
 }

 const char * CodePointSearch(const char * s, CodePoint c) {
+  if (UTF8Decoder::CharSizeOfCodePoint(c) == 1) {
+    const char * result = s;
+    while (*result != 0 && *result != c) {
+      result++;
+    }
+    return result;
+  }
  UTF8Decoder decoder(s);
  const char * currentPointer = s;
  CodePoint codePoint = decoder.nextCodePoint();
@@ -43,10 +50,7 @@ const char * CodePointSearch(const char * s, CodePoint c) {
    codePoint = decoder.nextCodePoint();
    nextPointer = decoder.stringPosition();
  }
-  if (codePoint == c) {
-    return currentPointer;
-  }
-  return nullptr;
+  return currentPointer;
 }

 void CopyAndRemoveCodePoint(char * dst, size_t dstSize, const char * src, CodePoint c, const char * * pointerToUpdate) {
@@ -74,30 +78,95 @@ void CopyAndRemoveCodePoint(char * dst, size_t dstSize, const char * src, CodePo
  }
 }

-void PerformAtCodePoints(const char * s, CodePoint c, CodePointAction action, void * contextPointer, int contextInt) {
-  if (UTF8Decoder::CharSizeOfCodePoint(c) == 1) {
-    /* The code point is one char long, so it is equal to its char translation.
-     * We can do a classic char search. */
-    const char * i = s;
-    while (*i != 0) {
-      if (*i == c) {
-        action(const_cast<char *>(i), contextPointer, contextInt);
+const char * PerformAtCodePoints(const char * s, CodePoint c, CodePointAction actionCodePoint, CodePointAction actionOtherCodePoint, void * contextPointer, int contextInt, CodePoint stoppingCodePoint, bool goingRight, const char * initialPosition) {
+  /* If we are decoding towards the left, we must have a starting position. If
+   * we are decoding towards the right, the starting position is the start of
+   * string. */
+  assert((goingRight && initialPosition == nullptr)
+      || (!goingRight && initialPosition != nullptr));
+
+  if (UTF8Decoder::CharSizeOfCodePoint(c) == 1 && UTF8Decoder::CharSizeOfCodePoint(stoppingCodePoint) == 1) {
+    /* The code points are one char long, so they are equal to their char
+     * translations. We can do a classic char search. */
+    if (goingRight) {
+      const char * i = s;
+      while (*i != stoppingCodePoint && *i != 0) {
+        if (*i == c) {
+          actionCodePoint(const_cast<char *>(i), contextPointer, contextInt);
+        } else {
+          actionOtherCodePoint(const_cast<char *>(i), contextPointer, contextInt);
+        }
+        i++;
      }
-      i++;
+      return i;
    }
-  } else {
-    // The code point is more than one char long, we use a UTF8Decoder.
+    const char * i = initialPosition - 1;
+    while (i >= s && *i != stoppingCodePoint) {
+      if (*i == c) {
+        actionCodePoint(const_cast<char *>(i), contextPointer, contextInt);
+      } else {
+        actionOtherCodePoint(const_cast<char *>(i), contextPointer, contextInt);
+      }
+      i--;
+    }
+    return i;
+  }
+  // The code point is more than one char long, we use a UTF8Decoder.
+  if (goingRight) {
    UTF8Decoder decoder(s);
    const char * codePointPointer = decoder.stringPosition();
    CodePoint codePoint = decoder.nextCodePoint();
-    while (codePoint != UCodePointNull) {
+    while (codePoint != stoppingCodePoint && codePoint != UCodePointNull) {
      if (codePoint == c) {
-        action(const_cast<char *>(codePointPointer), contextPointer, contextInt);
+        actionCodePoint(const_cast<char *>(codePointPointer), contextPointer, contextInt);
+      } else {
+        actionOtherCodePoint(const_cast<char *>(codePointPointer), contextPointer, contextInt);
      }
      codePointPointer = decoder.stringPosition();
      codePoint = decoder.nextCodePoint();
    }
+    return codePointPointer;
  }
+  assert(!goingRight);
+  if (initialPosition <= s) {
+    return initialPosition;
+  }
+  UTF8Decoder decoder(s, initialPosition);
+  CodePoint codePoint = decoder.previousCodePoint();
+  const char * codePointPointer = decoder.stringPosition();
+  while (codePointPointer >= s && codePoint != stoppingCodePoint) {
+    if (codePoint == c) {
+      actionCodePoint(const_cast<char *>(codePointPointer), contextPointer, contextInt);
+    } else {
+      actionOtherCodePoint(const_cast<char *>(codePointPointer), contextPointer, contextInt);
+    }
+    if (codePointPointer > s) {
+      codePoint = decoder.previousCodePoint();
+      codePointPointer = decoder.stringPosition();
+    } else {
+      /* If the current pointer is s, we cannot continue decoding. Decreasing s
+       * will stop the while loop. */
+      codePointPointer = s-1;
+    }
+  }
+  return codePointPointer;
+}
+
+bool PreviousCodePointIs(const char * buffer, const char * location, CodePoint c) {
+  assert(location > buffer);
+  if (UTF8Decoder::CharSizeOfCodePoint(c) == 1) {
+    return *(location -1) == c;
+  }
+  UTF8Decoder decoder(buffer, location);
+  return decoder.previousCodePoint() == c;
+}
+
+bool CodePointIs(const char * location, CodePoint c) {
+  if (UTF8Decoder::CharSizeOfCodePoint(c) == 1) {
+    return *(location) == c;
+  }
+  UTF8Decoder decoder(location);
+  return decoder.nextCodePoint() == c;
 }

 };