[unicode] Use the UTF8Decoder to scan const char *

2026-01-18 16:27:34 +01:00 · 2019-01-16 17:03:30 +01:00
parent 65e5adafac
commit 41afa92f10
14 changed files with 189 additions and 65 deletions
--- a/kandinsky/Makefile
+++ b/kandinsky/Makefile
@@ -14,6 +14,7 @@ src += $(addprefix kandinsky/src/,\
  point.cpp \
  rect.cpp \
  unicode/utf8_decoder.cpp\
+  unicode/utf8_helper.cpp\
 )

 src += $(addprefix kandinsky/fonts/, \
--- a/kandinsky/include/kandinsky/unicode/utf8_decoder.h
+++ b/kandinsky/include/kandinsky/unicode/utf8_decoder.h
@@ -18,7 +18,10 @@
 class UTF8Decoder {
 public:
  UTF8Decoder(const char * string) : m_string(string) {}
+  /* TODO: Rename methods? nextCodePoint advances m_string but
+   * nextCodePointPointer does not */
  CodePoint nextCodePoint();
+  const char * nextCodePointPointer();
  static size_t CharSizeOfCodePoint(CodePoint c);
  static size_t CodePointToChars(CodePoint c, char * buffer, int bufferSize);
 private:
--- a/kandinsky/include/kandinsky/unicode/utf8_helper.h
+++ b/kandinsky/include/kandinsky/unicode/utf8_helper.h
@@ -0,0 +1,17 @@
+#ifndef KANDINSKY_UNICODE_UTF8_HELPER_H
+#define KANDINSKY_UNICODE_UTF8_HELPER_H
+
+#include "code_point.h"
+#include <stddef.h>
+
+namespace UTF8Helper {
+
+/* CodePointSearch scans s one code point at a time and returns a pointer to
+ * the first code point equal to c, or nullptr if the null code point is
+ * reached first. */
+const char * CodePointSearch(const char * s, CodePoint c);
+
+/* CopyAndRemoveCodePoint copies src into dst while removing all code points c.
+ * At most dstSize chars are written to dst. If indexToUpdate is not null, the
+ * index it points to is lowered by the char size of c when code points were
+ * removed before it. */
+void CopyAndRemoveCodePoint(char * dst, size_t dstSize, const char * src, CodePoint c, size_t * indexToUpdate = nullptr);
+
+}
+
+#endif
--- a/kandinsky/src/unicode/utf8_decoder.cpp
+++ b/kandinsky/src/unicode/utf8_decoder.cpp
@@ -25,6 +25,10 @@ CodePoint UTF8Decoder::nextCodePoint() {
  return CodePoint(result);
 }

+/* Returns a pointer to the start of the code point that follows the one
+ * m_string currently points to. Unlike nextCodePoint, m_string is not moved. */
+const char * UTF8Decoder::nextCodePointPointer() {
+  int leadingOnes = leading_ones(*m_string);
+  /* A single-char code point has no leading one in its first char, but it
+   * still spans one char: without this case the returned pointer would be
+   * m_string itself. */
+  return m_string + (leadingOnes == 0 ? 1 : leadingOnes);
+}
+
 size_t UTF8Decoder::CharSizeOfCodePoint(CodePoint c) {
  constexpr int bufferSize = CodePoint::MaxCodePointCharLength;
  char buffer[bufferSize];
@@ -32,21 +36,29 @@ size_t UTF8Decoder::CharSizeOfCodePoint(CodePoint c) {
 }

 size_t UTF8Decoder::CodePointToChars(CodePoint c, char * buffer, int bufferSize) {
-  assert(bufferSize >= CodePoint::MaxCodePointCharLength);
+  if (bufferSize <= 0) {
+    return 0;
+  }
  size_t i = 0;
  if (c <= 0x7F) {
    buffer[i++] = c;
  } else if (c <= 0x7FF) {
    buffer[i++] = 0b11000000 | (c >> 6);
+    if (bufferSize <= i) { return i; }
    buffer[i++] = 0b10000000 | (c & 0b111111);
  } else if (c <= 0xFFFF) {
    buffer[i++] = 0b11100000 | (c >> 12);
+    if (bufferSize <= i) { return i; }
    buffer[i++] = 0b10000000 | ((c >> 6) & 0b111111);
+    if (bufferSize <= i) { return i; }
    buffer[i++] = 0b10000000 | (c & 0b111111);
  } else {
    buffer[i++] = 0b11110000 | (c >> 18);
+    if (bufferSize <= i) { return i; }
    buffer[i++] = 0b10000000 | ((c >> 12) & 0b111111);
+    if (bufferSize <= i) { return i; }
    buffer[i++] = 0b10000000 | ((c >> 6) & 0b111111);
+    if (bufferSize <= i) { return i; }
    buffer[i++] = 0b10000000 | (c & 0b111111);
  }
  return i;
--- a/kandinsky/src/unicode/utf8_helper.cpp
+++ b/kandinsky/src/unicode/utf8_helper.cpp
@@ -0,0 +1,51 @@
+#include <kandinsky/unicode/utf8_helper.h>
+#include <kandinsky/unicode/utf8_decoder.h>
+#include <string.h>
+#include <assert.h>
+
+namespace UTF8Helper {
+
+// Returns the smaller of the two ints x and y
+static inline int min(int x, int y) {
+  if (y < x) {
+    return y;
+  }
+  return x;
+}
+
+/* Walks s one code point at a time and returns a pointer to the first code
+ * point equal to c. Returns nullptr when the null code point is met first. */
+const char * CodePointSearch(const char * s, CodePoint c) {
+  UTF8Decoder decoder(s);
+  const char * candidate = s;
+  for (;;) {
+    // Ask for the next position before nextCodePoint moves the decoder
+    const char * following = decoder.nextCodePointPointer();
+    CodePoint decoded = decoder.nextCodePoint();
+    if (decoded == c) {
+      return candidate;
+    }
+    if (decoded != KDCodePointNull) {
+      candidate = following;
+      continue;
+    }
+    return nullptr;
+  }
+}
+
+/* Copies the null-terminated string src into dst while leaving out every
+ * occurrence of the code point c.
+ * - At most dstSize chars are written to dst, null terminator included. When
+ *   dstSize > 0, dst is always null-terminated: a src that does not fit is
+ *   truncated (possibly in the middle of a multi-char code point).
+ * - If indexToUpdate is not null, *indexToUpdate is taken as a char index in
+ *   src and is decreased by the char size of c for every removed code point
+ *   that starts before it. */
+void CopyAndRemoveCodePoint(char * dst, size_t dstSize, const char * src, CodePoint c, size_t * indexToUpdate) {
+  if (dstSize == 0) {
+    // Not even the null terminator can be written
+    return;
+  }
+  UTF8Decoder decoder(src);
+  const size_t codePointCharSize = UTF8Decoder::CharSizeOfCodePoint(c);
+  /* Removed code points are compared to the index as the caller gave it:
+   * comparing src offsets to the index being decreased would skip removals
+   * that follow a previous one. */
+  const size_t initialIndex = indexToUpdate != nullptr ? *indexToUpdate : 0;
+  const char * currentPointer = src;
+  size_t bufferIndex = 0;
+
+  // The last char of dst is kept for the null terminator
+  while (bufferIndex < dstSize - 1) {
+    /* Decode at the top of the loop only, so that nothing is read from src
+     * once its null terminator has been decoded. */
+    const char * nextPointer = decoder.nextCodePointPointer();
+    if (nextPointer == currentPointer) {
+      /* Defensive: a code point spans at least one char, even when its first
+       * char has no leading one. */
+      nextPointer++;
+    }
+    CodePoint codePoint = decoder.nextCodePoint();
+    if (codePoint == KDCodePointNull) {
+      break;
+    }
+    if (codePoint != c) {
+      int copySize = min(static_cast<int>(nextPointer - currentPointer), static_cast<int>(dstSize - 1 - bufferIndex));
+      memcpy(dst + bufferIndex, currentPointer, copySize);
+      bufferIndex+= copySize;
+    } else if (indexToUpdate != nullptr && static_cast<size_t>(currentPointer - src) < initialIndex) {
+      assert(*indexToUpdate >= codePointCharSize);
+      *indexToUpdate-= codePointCharSize;
+    }
+    currentPointer = nextPointer;
+  }
+  dst[bufferIndex] = 0;
+}
+
+};