From cf79b26ceb3fbfb97d6014140ec0badd159e4eaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A9a=20Saviot?= <lea.saviot@numworks.com>
Date: Fri, 11 Jan 2019 11:52:20 +0100
Subject: [PATCH] [kandinsky] UTF8Decoder::CodePointToChars

---
 .../include/kandinsky/unicode/utf8decoder.h   | 13 ++++++++++++
 kandinsky/src/unicode/utf8decoder.cpp         | 21 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/kandinsky/include/kandinsky/unicode/utf8decoder.h b/kandinsky/include/kandinsky/unicode/utf8decoder.h
index 36aed2c48..fcabe859a 100644
--- a/kandinsky/include/kandinsky/unicode/utf8decoder.h
+++ b/kandinsky/include/kandinsky/unicode/utf8decoder.h
@@ -1,12 +1,25 @@
 #ifndef KANDINSKY_UNICODE_UTF8DECODER_H
 #define KANDINSKY_UNICODE_UTF8DECODER_H
 
+#include <stddef.h>
 #include "code_point.h"
 
+/* UTF-8 encodes all valid code points using at most 4 bytes (of which at most
+ * 21 bits carry the code point), the lowest codes being equal to ASCII codes.
+ * There are fewer than 2^21 different valid Unicode code points.
+ *
+ * The encoding is the following:
+ * For code points between ...   ->  The corresponding bits are ...
+ * 0 and 7F         -> 0xxxxxxx
+ * 80 and 7FF       -> 110xxxxx 10xxxxxx
+ * 800 and FFFF     -> 1110xxxx 10xxxxxx 10xxxxxx
+ * 10000 and 10FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+
 class UTF8Decoder {
 public:
   UTF8Decoder(const char * string) : m_string(string) {}
   CodePoint nextCodePoint();
+  static size_t CodePointToChars(CodePoint c, char * buffer, int bufferSize); // Returns the number of chars written
 private:
   const char * m_string;
 };
diff --git a/kandinsky/src/unicode/utf8decoder.cpp b/kandinsky/src/unicode/utf8decoder.cpp
index 7990c28bb..5648df4df 100644
--- a/kandinsky/src/unicode/utf8decoder.cpp
+++ b/kandinsky/src/unicode/utf8decoder.cpp
@@ -24,3 +24,24 @@ CodePoint UTF8Decoder::nextCodePoint() {
   }
   return CodePoint(result);
 }
+
+/* Writes the UTF-8 encoding of c into buffer, without any null terminator, and
+ * returns the number of chars written (1 to 4). bufferSize is the capacity of
+ * buffer in chars and must be at least 4, so that any code point fits. */
+size_t UTF8Decoder::CodePointToChars(CodePoint c, char * buffer, int bufferSize) {
+  constexpr int k_maxLength = 4; // Longest UTF-8 sequence, in chars
+  // Compare as ints: a negative bufferSize converted to size_t would pass
+  assert(buffer != nullptr && bufferSize >= k_maxLength);
+  assert(c <= 0x10FFFF); // Upper bound of the 4-byte range
+  // Number of 10xxxxxx continuation bytes that follow the lead byte
+  const int tail = c <= 0x7F ? 0 : (c <= 0x7FF ? 1 : (c <= 0xFFFF ? 2 : 3));
+  // Marker bits of the lead byte, indexed by the number of continuation bytes
+  const unsigned char leadMarks[k_maxLength] = {0, 0b11000000, 0b11100000, 0b11110000};
+  size_t i = 0;
+  buffer[i++] = leadMarks[tail] | (c >> (6 * tail));
+  for (int shift = 6 * (tail - 1); shift >= 0; shift -= 6) {
+    // Each continuation byte carries the next 6 bits, most significant first
+    buffer[i++] = 0b10000000 | ((c >> shift) & 0b111111);
+  }
+  return i;
+}