[i18n] Encode strings as NFKD-normalized UTF-8 strings

This commit is contained in:
Romain Goyet
2018-10-31 10:37:41 +01:00
committed by Émilie Feral
parent 39e01f1d4c
commit 6fac2120fe

View File

@@ -1,4 +1,6 @@
#coding=utf-8
# This script gathers all .i18n files and aggregates them into a pair of .h/.cpp files
# In practice, it enforces an NFKD normalization
# It works with Python 2 and Python 3
import sys
import re
@@ -6,42 +8,8 @@ import unicodedata
import argparse
import io
# Non-ASCII characters that have a dedicated code point in Ion's custom
# charset. These must map to their named constant instead of being run
# through Unicode normalization (which would strip them to nothing).
ion_special_characters = {
    u'Δ': "Ion::Charset::CapitalDelta",
    u'Σ': "Ion::Charset::CapitalSigma",
    u'λ': "Ion::Charset::SmallLambda",
    u'μ': "Ion::Charset::SmallMu",
    u'σ': "Ion::Charset::SmallSigma",
    u'≤': "Ion::Charset::LessEqual",    # glyph restored: was lost in extraction
    u'≈': "Ion::Charset::AlmostEqual",  # glyph restored: was lost in extraction
    u'ø': "Ion::Charset::Empty",
    u'·': "Ion::Charset::MiddleDot"     # glyph restored: was lost in extraction
}
def ion_char(i18n_letter):
    """Translate one message character into a C expression.

    Plain ASCII becomes a char literal, characters with a dedicated Ion
    charset constant become that constant, and any other character falls
    back to the ASCII base letter of its NFD decomposition.
    """
    # A single quote must be escaped inside a C char literal.
    if i18n_letter == "'":
        return "'\\\''"
    # Plain ASCII goes straight into a char literal.
    if ord(i18n_letter) < 128:
        return "'%c'" % i18n_letter
    # Characters with a dedicated constant in Ion's charset.
    special = ion_special_characters.get(i18n_letter)
    if special is not None:
        return special
    # Decompose (e.g. "é" -> "e" + combining acute) and drop everything
    # that is not ASCII, keeping only the base letter.
    stripped = unicodedata.normalize("NFD", i18n_letter).encode('ascii', 'ignore')
    return "'%s'" % stripped.decode()
def source_definition(i18n_string):
    """Return the C string literal for a message, as NFKD-normalized UTF-8 bytes.

    NOTE(review): the diff flattening had merged the removed pre-commit
    char-array implementation (dead code after its return) with the added
    one-line implementation; only the latter — matching the commit title —
    is kept here.

    The string is wrapped in double quotes and encoded to bytes, so it can
    be written verbatim into the generated file when opened in binary mode.
    """
    normalized = unicodedata.normalize("NFKD", i18n_string)
    return (u"\"" + normalized + u"\"").encode("utf-8")
def split_line(line):
match = re.match(r"^(\w+)\s*=\s*\"(.*)\"$", line)
@@ -78,7 +46,7 @@ def parse_files(files):
return {"messages": sorted(messages), "universal_messages": sorted(universal_messages), "data": data}
def print_header(data, path, locales):
f = open(path, 'w')
f = open(path, "w")
f.write("#ifndef APPS_I18N_H\n")
f.write("#define APPS_I18N_H\n\n")
f.write("// This file is auto-generated by i18n.py\n\n")
@@ -114,7 +82,7 @@ def print_header(data, path, locales):
f.close()
def print_implementation(data, path, locales):
f = open(path, 'w')
f = open(path, "w")
f.write("#include \"i18n.h\"\n")
f.write("#include <apps/global_preferences.h>\n")
f.write("#include <assert.h>\n\n");
@@ -141,7 +109,11 @@ def print_implementation(data, path, locales):
if not message in data["data"][locale]:
sys.stderr.write("Error: Undefined key \"" + message + "\" for locale \"" + locale + "\"\n")
sys.exit(-1)
f.write("constexpr static char " + locale + message + "[] = " + data["data"][locale][message] + ";\n")
f.write("constexpr static char " + locale + message + "[] = ")
f = open(path, "ab") # Re-open the file as binary to output raw UTF-8 bytes
f.write(data["data"][locale][message])
f = open(path, "a") # Re-open the file as text
f.write(";\n")
f.write("\n")
f.write("constexpr static const char * messages[%d][%d] = {\n" % (len(data["messages"]), len(locales)))
for message in data["messages"]:
@@ -152,6 +124,18 @@ def print_implementation(data, path, locales):
f.write("};\n\n")
# Write the translate method
for message in data["universal_messages"]:
f.write("constexpr static char universal" + message + "[] = ")
f = open(path, "ab") # Re-open the file as binary to output raw UTF-8 bytes
f.write(data["data"]["universal"][message])
f = open(path, "a") # Re-open the file as text
f.write(";\n")
f.write("\n")
f.write("constexpr static const char * universalMessages[%d] = {\n" % len(data["universal_messages"]))
for message in data["universal_messages"]:
f.write(" universal" + message + ",\n")
f.write("};\n")
f.write("\n")
f.write("const char * translate(Message m, Language l) {\n")
f.write(" assert(m != Message::LocalizedMessageMarker);\n")
f.write(" int localizedMessageOffset = (int)Message::LocalizedMessageMarker+1;\n")