[kandinsky] UTF8Decoder::CodePointToChars

This commit is contained in:
Léa Saviot
2019-01-11 11:52:20 +01:00
committed by Émilie Feral
parent 02a5d6cd23
commit cf79b26ceb
2 changed files with 34 additions and 0 deletions

View File

@@ -1,12 +1,25 @@
#ifndef KANDINSKY_UNICODE_UTF8DECODER_H
#define KANDINSKY_UNICODE_UTF8DECODER_H
#include <stddef.h>
#include "code_point.h"
/* UTF-8 encodes all valid code points using at most 4 bytes (= 32 bits), the
* lowest codes being equal to ASCII codes. There are fewer than 2^21 different
* valid code points, so at most 21 of those bits carry code point data.
*
* The encoding is the following:
* For code points between ... -> The corresponding bits are ...
* 0 and 7F -> 0xxxxxxx
* 80 and 7FF -> 110xxxxx 10xxxxxx
* 800 and FFFF -> 1110xxxx 10xxxxxx 10xxxxxx
* 10000 and 10FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
/* Walks a char string code point by code point, and offers a static helper
 * for the reverse operation (code point -> UTF-8 chars). */
class UTF8Decoder {
public:
  // Keeps the pointer it is given; the string itself is not copied.
  UTF8Decoder(const char * string) : m_string{string} {}

  // Returns the next code point (defined in the implementation file).
  CodePoint nextCodePoint();

  /* Writes the UTF-8 chars of c into buffer and returns how many chars were
   * written. bufferSize is the capacity of buffer, in chars. */
  static size_t CodePointToChars(CodePoint c, char * buffer, int bufferSize);

private:
  // The string handed to the constructor.
  const char * m_string;
};

View File

@@ -24,3 +24,24 @@ CodePoint UTF8Decoder::nextCodePoint() {
}
return CodePoint(result);
}
/* Encodes the code point c as UTF-8 into buffer and returns the number of
 * chars written (between 1 and 4). No null terminator is appended.
 *
 * bufferSize must be at least sizeof(CodePoint)/sizeof(char), whatever the
 * value of c. This precondition is only checked through an assertion.
 * Values above 0x10FFFF are not rejected: they take the 4-char branch. */
size_t UTF8Decoder::CodePointToChars(CodePoint c, char * buffer, int bufferSize) {
  /* bufferSize is signed whereas the sizeof expression is unsigned: check the
   * sign first, otherwise a negative size would be converted to a huge
   * unsigned value and wrongly satisfy the comparison. */
  assert(bufferSize >= 0 && static_cast<size_t>(bufferSize) >= sizeof(CodePoint)/sizeof(char));
  size_t i = 0;
  if (c <= 0x7F) {
    // 0xxxxxxx: the code is stored as is
    buffer[i++] = c;
  } else if (c <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    buffer[i++] = 0b11000000 | (c >> 6);
    buffer[i++] = 0b10000000 | (c & 0b111111);
  } else if (c <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    buffer[i++] = 0b11100000 | (c >> 12);
    buffer[i++] = 0b10000000 | ((c >> 6) & 0b111111);
    buffer[i++] = 0b10000000 | (c & 0b111111);
  } else {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    buffer[i++] = 0b11110000 | (c >> 18);
    buffer[i++] = 0b10000000 | ((c >> 12) & 0b111111);
    buffer[i++] = 0b10000000 | ((c >> 6) & 0b111111);
    buffer[i++] = 0b10000000 | (c & 0b111111);
  }
  return i;
}