[kandinsky] UTF8Decoder::CodePointToChars

2026-03-19 22:00:28 +01:00 · 2019-01-11 11:52:20 +01:00
parent 02a5d6cd23
commit cf79b26ceb
2 changed files with 34 additions and 0 deletions
--- a/kandinsky/include/kandinsky/unicode/utf8decoder.h
+++ b/kandinsky/include/kandinsky/unicode/utf8decoder.h
@@ -1,12 +1,25 @@
 #ifndef KANDINSKY_UNICODE_UTF8DECODER_H
 #define KANDINSKY_UNICODE_UTF8DECODER_H

+#include <stddef.h>
 #include "code_point.h"

+/* UTF-8 encodes all valid code points using at most 4 bytes (= 32 bits, of
+ * which at most 21 carry the code point), the lowest codes being equal to ASCII
+ * codes. There are fewer than 2^21 valid Unicode code points.
+ *
+ * The encoding is the following:
+ * For code points between ...   ->  The corresponding bits are ...
+ * 0 and 7F         -> 0xxxxxxx
+ * 80 and 7FF       -> 110xxxxx 10xxxxxx
+ * 800 and FFFF     -> 1110xxxx 10xxxxxx 10xxxxxx
+ * 10000 and 10FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+
 class UTF8Decoder { // Holds a pointer to the text given at construction; also exposes a static code-point-to-UTF-8 helper
 public:
  UTF8Decoder(const char * string) : m_string(string) {} // Keeps the pointer only; the characters are not copied
  CodePoint nextCodePoint(); // Defined in kandinsky/src/unicode/utf8decoder.cpp
+  static size_t CodePointToChars(CodePoint c, char * buffer, int bufferSize); // Writes the UTF-8 bytes of c into buffer and returns how many bytes were written
 private:
  const char * m_string; // Pointer received by the constructor
 };
--- a/kandinsky/src/unicode/utf8decoder.cpp
+++ b/kandinsky/src/unicode/utf8decoder.cpp
@@ -24,3 +24,24 @@ CodePoint UTF8Decoder::nextCodePoint() {
  }
  return CodePoint(result);
 }
+
+size_t UTF8Decoder::CodePointToChars(CodePoint c, char * buffer, int bufferSize) { // Encodes c as UTF-8 into buffer; returns the number of bytes written
+  assert(bufferSize >= sizeof(CodePoint)/sizeof(char)); // NOTE(review): int is compared with size_t, so a negative bufferSize would pass; presumably sizeof(CodePoint) is 4, the longest encoding - confirm
+  size_t i = 0; // Count of bytes written so far, also the next write index
+  if (c <= 0x7F) { // 1 byte: 0xxxxxxx
+    buffer[i++] = c;
+  } else if (c <= 0x7FF) { // 2 bytes: 110xxxxx 10xxxxxx
+    buffer[i++] = 0b11000000 | (c >> 6); // Lead byte carries the bits above the low 6
+    buffer[i++] = 0b10000000 | (c & 0b111111); // Continuation byte carries the low 6 bits
+  } else if (c <= 0xFFFF) { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    buffer[i++] = 0b11100000 | (c >> 12); // Lead byte carries the bits above the low 12
+    buffer[i++] = 0b10000000 | ((c >> 6) & 0b111111); // Middle 6 bits
+    buffer[i++] = 0b10000000 | (c & 0b111111); // Low 6 bits
+  } else { // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx; NOTE(review): c > 0x10FFFF is not rejected here - confirm callers guarantee it
+    buffer[i++] = 0b11110000 | (c >> 18); // Lead byte carries the bits above the low 18
+    buffer[i++] = 0b10000000 | ((c >> 12) & 0b111111); // Bits 12 to 17
+    buffer[i++] = 0b10000000 | ((c >> 6) & 0b111111); // Bits 6 to 11
+    buffer[i++] = 0b10000000 | (c & 0b111111); // Low 6 bits
+  }
+  return i; // 1, 2, 3 or 4; no terminating null byte is written
+}