From cf79b26ceb3fbfb97d6014140ec0badd159e4eaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A9a=20Saviot?=
Date: Fri, 11 Jan 2019 11:52:20 +0100
Subject: [PATCH] [kandinsky] UTF8Decoder::CodePointToChars

---
 .../include/kandinsky/unicode/utf8decoder.h   | 13 +++++++++++++
 kandinsky/src/unicode/utf8decoder.cpp         | 29 +++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/kandinsky/include/kandinsky/unicode/utf8decoder.h b/kandinsky/include/kandinsky/unicode/utf8decoder.h
index 36aed2c48..fcabe859a 100644
--- a/kandinsky/include/kandinsky/unicode/utf8decoder.h
+++ b/kandinsky/include/kandinsky/unicode/utf8decoder.h
@@ -1,12 +1,25 @@
 #ifndef KANDINSKY_UNICODE_UTF8DECODER_H
 #define KANDINSKY_UNICODE_UTF8DECODER_H
 
+#include <stddef.h>
 #include "code_point.h"
 
+/* UTF-8 encodes every valid code point using at most 4 bytes (= 32 bits, of
+ * which at most 21 carry payload), the lowest codes being equal to ASCII
+ * codes. There are fewer than 2^21 valid code points.
+ *
+ * The encoding is the following:
+ * For code points between ... ->  The corresponding bits are ...
+ * 0 and 7F         -> 0xxxxxxx
+ * 80 and 7FF       -> 110xxxxx 10xxxxxx
+ * 800 and FFFF     -> 1110xxxx 10xxxxxx 10xxxxxx
+ * 10000 and 10FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+
 class UTF8Decoder {
 public:
   UTF8Decoder(const char * string) : m_string(string) {}
   CodePoint nextCodePoint();
+  static size_t CodePointToChars(CodePoint c, char * buffer, int bufferSize);
 private:
   const char * m_string;
 };
diff --git a/kandinsky/src/unicode/utf8decoder.cpp b/kandinsky/src/unicode/utf8decoder.cpp
index 7990c28bb..5648df4df 100644
--- a/kandinsky/src/unicode/utf8decoder.cpp
+++ b/kandinsky/src/unicode/utf8decoder.cpp
@@ -24,3 +24,32 @@ CodePoint UTF8Decoder::nextCodePoint() {
   }
   return CodePoint(result);
 }
+
+/* Writes the UTF-8 encoding of c at the start of buffer and returns the number
+ * of bytes written (1 to 4). No null terminator is appended. bufferSize is the
+ * capacity of buffer; it is only checked through the assertion below. */
+size_t UTF8Decoder::CodePointToChars(CodePoint c, char * buffer, int bufferSize) {
+  /* Check the sign before casting: comparing the int directly against the
+   * size_t would convert a negative bufferSize to a huge unsigned value and
+   * accept it. sizeof(CodePoint) stands in for the longest encoding (4 bytes),
+   * which assumes CodePoint is 4 bytes wide - TODO confirm in code_point.h. */
+  assert(bufferSize >= 0 && static_cast<size_t>(bufferSize) >= sizeof(CodePoint)/sizeof(char));
+  size_t i = 0;
+  if (c <= 0x7F) {
+    buffer[i++] = c;
+  } else if (c <= 0x7FF) {
+    buffer[i++] = 0b11000000 | (c >> 6);
+    buffer[i++] = 0b10000000 | (c & 0b111111);
+  } else if (c <= 0xFFFF) {
+    buffer[i++] = 0b11100000 | (c >> 12);
+    buffer[i++] = 0b10000000 | ((c >> 6) & 0b111111);
+    buffer[i++] = 0b10000000 | (c & 0b111111);
+  } else {
+    /* c is not range-checked: values above 0x10FFFF also take this branch. */
+    buffer[i++] = 0b11110000 | (c >> 18);
+    buffer[i++] = 0b10000000 | ((c >> 12) & 0b111111);
+    buffer[i++] = 0b10000000 | ((c >> 6) & 0b111111);
+    buffer[i++] = 0b10000000 | (c & 0b111111);
+  }
+  return i;
+}