mirror of
https://github.com/UpsilonNumworks/Upsilon.git
synced 2026-03-21 23:00:45 +01:00
Scenario: Copy a text with \n (for instance from a script), then paste it in a script name -> if \n are replaced with \0 instead of just being removed, there are problems with the extension
54 lines
2.0 KiB
C++
54 lines
2.0 KiB
C++
#ifndef ION_UNICODE_UTF8_DECODER_H
|
|
#define ION_UNICODE_UTF8_DECODER_H
|
|
|
|
#include "code_point.h"
|
|
#include <stddef.h>
|
|
#include <assert.h>
|
|
|
|
/* UTF-8 encodes all valid code points using at most 4 bytes (= 32 bits, of
 * which at most 21 carry code point data), the lowest codes being equal to
 * ASCII codes. There are fewer than 2^21 different UTF-8 valid code points.
 *
 * The encoding is the following:
 * For code points between ... ->  The corresponding bits are ...
 * 0 and 7F         -> 0xxxxxxx
 * 80 and 7FF       -> 110xxxxx 10xxxxxx
 * 800 and FFFF     -> 1110xxxx 10xxxxxx 10xxxxxx
 * 10000 and 10FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * WARNING: A single glyph can have several code point translations. For
 * instance, 'ê' can be the code point 'e' followed by the code point '^', but
 * it can also be the unique code point 'ê'. We assume NFKD normalization, where
 * 'ê' will be 'e' followed by a combining code point '^'.
 * The i18n literals are processed by i18n.py, where we enforce the NFKD
 * normalization, but other string literals (for instance the script templates)
 * will be encoded as the text editor chooses.
 *
 * /!\ All literals except the i18n should thus avoid containing ambiguous
 * glyphs such as accented letters.
 */
|
|
|
|
class UTF8Decoder {
|
|
public:
|
|
UTF8Decoder(const char * string, const char * initialPosition = nullptr) :
|
|
m_string(string),
|
|
m_stringPosition(initialPosition == nullptr ? string : initialPosition)
|
|
{
|
|
assert(m_string != nullptr);
|
|
}
|
|
CodePoint nextCodePoint();
|
|
CodePoint previousCodePoint();
|
|
const char * nextGlyphPosition();
|
|
const char * previousGlyphPosition();
|
|
const char * stringPosition() const { return m_stringPosition; }
|
|
void setPosition(const char * position);
|
|
static size_t CharSizeOfCodePoint(CodePoint c);
|
|
static size_t CodePointToChars(CodePoint c, char * buffer, size_t bufferSize); // No null-terminating char
|
|
private:
|
|
static bool IsInTheMiddleOfACodePoint(uint8_t value);
|
|
const char * const m_string;
|
|
const char * m_stringPosition;
|
|
};
|
|
|
|
#endif
|