Files
Upsilon/ion/include/ion/unicode/utf8_decoder.h
Léa Saviot b29d014695 [escher/text_field] Fix removing of \n in insertTextAtLocation
Scenario: Copy a text with \n (for instance from a script), then paste
it in a script name -> if \n are replaced with \0 instead of just being
removed, there are problems with the extension
2020-02-12 15:13:23 +01:00

54 lines
2.0 KiB
C++

#ifndef ION_UNICODE_UTF8_DECODER_H
#define ION_UNICODE_UTF8_DECODER_H
#include "code_point.h"
#include <stddef.h>
#include <assert.h>
/* UTF-8 encodes all valid code points using at most 4 bytes (= 32 bits, of
* which at most 21 carry code point data), the lowest codes being equal to
* ASCII codes. There are fewer than 2^21 different UTF-8 valid code points.
*
* The encoding is the following:
* For code points between ... -> The corresponding bits are ...
* 0 and 7F -> 0xxxxxxx
* 80 and 7FF -> 110xxxxx 10xxxxxx
* 800 and FFFF -> 1110xxxx 10xxxxxx 10xxxxxx
* 10000 and 10FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* WARNING: A single glyph can have several code point translations. For
* instance, 'ê' can be the code point 'e' followed by the code point '^', but
* it can also be the unique code point 'ê'. We assume NFKD normalization, where
* 'ê' will be 'e' followed by a combining code point '^'.
* The i18n literals are processed by i18n.py, where we enforce the NFKD
* normalization, but other string literals (for instance the script templates)
* will be encoded as the text editor chooses.
*
* /!\ All literals except the i18n ones should thus avoid containing ambiguous
* glyphs such as accented letters.
*/
class UTF8Decoder {
public:
  /* Builds a decoder over `string`, which must not be null (checked with an
   * assert). The current position starts at `initialPosition` when one is
   * given, and at the beginning of `string` otherwise. Only the pointers are
   * stored: the constructor does not copy the characters. */
  UTF8Decoder(const char * string, const char * initialPosition = nullptr) :
    m_string(string),
    m_stringPosition(initialPosition != nullptr ? initialPosition : string)
  {
    assert(m_string != nullptr);
  }

  /* Traversal by code point, defined out of line.
   * NOTE(review): presumably each call returns the adjacent code point and
   * moves the current position accordingly - confirm in the implementation. */
  CodePoint nextCodePoint();
  CodePoint previousCodePoint();

  /* Traversal by glyph, defined out of line.
   * NOTE(review): presumably a glyph is a code point together with the
   * combining code points attached to it - confirm in the implementation. */
  const char * nextGlyphPosition();
  const char * previousGlyphPosition();

  /* Read and write access to the current position inside the string. */
  const char * stringPosition() const { return m_stringPosition; }
  void setPosition(const char * position);

  /* Encoding helpers that do not depend on any decoder instance.
   * CodePointToChars does not write a null-terminating char. */
  static size_t CharSizeOfCodePoint(CodePoint c);
  static size_t CodePointToChars(CodePoint c, char * buffer, size_t bufferSize);

private:
  /* NOTE(review): presumably true when `value` is a continuation byte
   * (10xxxxxx) - confirm in the implementation. */
  static bool IsInTheMiddleOfACodePoint(uint8_t value);

  // Start of the string; the pointer itself cannot be reseated after construction.
  const char * const m_string;
  // Current position, initialized by the constructor.
  const char * m_stringPosition;
};
#endif