diff options
author | Raghuram Subramani <raghus2247@gmail.com> | 2022-06-19 19:47:51 +0530 |
---|---|---|
committer | Raghuram Subramani <raghus2247@gmail.com> | 2022-06-19 19:47:51 +0530 |
commit | 4fd287655a72b9aea14cdac715ad5b90ed082ed2 (patch) | |
tree | 65d393bc0e699dd12d05b29ba568e04cea666207 /circuitpython/py/makeqstrdata.py | |
parent | 0150f70ce9c39e9e6dd878766c0620c85e47bed0 (diff) |
add circuitpython code
Diffstat (limited to 'circuitpython/py/makeqstrdata.py')
-rw-r--r-- | circuitpython/py/makeqstrdata.py | 786 |
1 files changed, 786 insertions, 0 deletions
diff --git a/circuitpython/py/makeqstrdata.py b/circuitpython/py/makeqstrdata.py new file mode 100644 index 0000000..74ad78c --- /dev/null +++ b/circuitpython/py/makeqstrdata.py @@ -0,0 +1,786 @@ +""" +Process raw qstr file and output qstr data with length, hash and data bytes. + +This script works with Python 2.7, 3.3 and 3.4. + +For documentation about the format of compressed translated strings, see +supervisor/shared/translate.h +""" + +from __future__ import print_function + +import bisect +import re +import sys + +import collections +import gettext +import os.path + +if hasattr(sys.stdout, "reconfigure"): + sys.stdout.reconfigure(encoding="utf-8") + sys.stderr.reconfigure(errors="backslashreplace") + +py = os.path.dirname(sys.argv[0]) +top = os.path.dirname(py) + +sys.path.append(os.path.join(top, "tools/huffman")) + +import huffman + +# Python 2/3 compatibility: +# - iterating through bytes is different +# - codepoint2name lives in a different module +import platform + +if platform.python_version_tuple()[0] == "2": + bytes_cons = lambda val, enc=None: bytearray(val) + from htmlentitydefs import codepoint2name +elif platform.python_version_tuple()[0] == "3": + bytes_cons = bytes + from html.entities import codepoint2name +# end compatibility code + +codepoint2name[ord("-")] = "hyphen" + +# add some custom names to map characters that aren't in HTML +codepoint2name[ord(" ")] = "space" +codepoint2name[ord("'")] = "squot" +codepoint2name[ord(",")] = "comma" +codepoint2name[ord(".")] = "dot" +codepoint2name[ord(":")] = "colon" +codepoint2name[ord(";")] = "semicolon" +codepoint2name[ord("/")] = "slash" +codepoint2name[ord("%")] = "percent" +codepoint2name[ord("#")] = "hash" +codepoint2name[ord("(")] = "paren_open" +codepoint2name[ord(")")] = "paren_close" +codepoint2name[ord("[")] = "bracket_open" +codepoint2name[ord("]")] = "bracket_close" +codepoint2name[ord("{")] = "brace_open" +codepoint2name[ord("}")] = "brace_close" +codepoint2name[ord("*")] = "star" +codepoint2name[ord("!")] = "bang" +codepoint2name[ord("\\")] = "backslash" +codepoint2name[ord("+")] = "plus" +codepoint2name[ord("$")] = "dollar" +codepoint2name[ord("=")] = "equals" +codepoint2name[ord("?")] = "question" +codepoint2name[ord("@")] = "at_sign" +codepoint2name[ord("^")] = "caret" +codepoint2name[ord("|")] = "pipe" +codepoint2name[ord("~")] = "tilde" + +C_ESCAPES = { + "\a": "\\a", + "\b": "\\b", + "\f": "\\f", + "\n": "\\n", + "\r": "\\r", + "\t": "\\t", + "\v": "\\v", + "'": "\\'", + '"': '\\"', +} + +# static qstrs, should be sorted +# These are qstrs that are always included and always have the same number. It allows mpy files to omit them. +static_qstr_list = [ + "", + "__dir__", # Put __dir__ after empty qstr for builtin dir() to work + "\n", + " ", + "*", + "/", + "<module>", + "_", + "__call__", + "__class__", + "__delitem__", + "__enter__", + "__exit__", + "__getattr__", + "__getitem__", + "__hash__", + "__init__", + "__int__", + "__iter__", + "__len__", + "__main__", + "__module__", + "__name__", + "__new__", + "__next__", + "__qualname__", + "__repr__", + "__setitem__", + "__str__", + "ArithmeticError", + "AssertionError", + "AttributeError", + "BaseException", + "EOFError", + "Ellipsis", + "Exception", + "GeneratorExit", + "ImportError", + "IndentationError", + "IndexError", + "KeyError", + "KeyboardInterrupt", + "LookupError", + "MemoryError", + "NameError", + "NoneType", + "NotImplementedError", + "OSError", + "OverflowError", + "RuntimeError", + "StopIteration", + "SyntaxError", + "SystemExit", + "TypeError", + "ValueError", + "ZeroDivisionError", + "abs", + "all", + "any", + "append", + "args", + "bool", + "builtins", + "bytearray", + "bytecode", + "bytes", + "callable", + "chr", + "classmethod", + "clear", + "close", + "const", + "copy", + "count", + "dict", + "dir", + "divmod", + "end", + "endswith", + "eval", + "exec", + "extend", + "find", + "format", + "from_bytes", + "get", + "getattr", + "globals", + "hasattr", + "hash", + "id", + "index", + "insert", + "int", + "isalpha", + "isdigit", + "isinstance", + "islower", + "isspace", + "issubclass", + "isupper", + "items", + "iter", + "join", + "key", + "keys", + "len", + "list", + "little", + "locals", + "lower", + "lstrip", + "main", + "map", + "micropython", + "next", + "object", + "open", + "ord", + "pop", + "popitem", + "pow", + "print", + "range", + "read", + "readinto", + "readline", + "remove", + "replace", + "repr", + "reverse", + "rfind", + "rindex", + "round", + "rsplit", + "rstrip", + "self", + "send", + "sep", + "set", + "setattr", + "setdefault", + "sort", + "sorted", + "split", + "start", + "startswith", + "staticmethod", + "step", + "stop", + "str", + "strip", + "sum", + "super", + "throw", + "to_bytes", + "tuple", + "type", + "update", + "upper", + "utf-8", + "value", + "values", + "write", + "zip", +] + +# this must match the equivalent function in qstr.c +def compute_hash(qstr, bytes_hash): + hash = 5381 + for b in qstr: + hash = (hash * 33) ^ b + # Make sure that valid hash is never zero, zero means "hash not computed" + return (hash & ((1 << (8 * bytes_hash)) - 1)) or 1 + + +def translate(translation_file, i18ns): + with open(translation_file, "rb") as f: + table = gettext.GNUTranslations(f) + + translations = [] + for original in i18ns: + unescaped = original + for s in C_ESCAPES: + unescaped = unescaped.replace(C_ESCAPES[s], s) + translation = table.gettext(unescaped) + # Add in carriage returns to work in terminals + translation = translation.replace("\n", "\r\n") + translations.append((original, translation)) + return translations + + +class TextSplitter: + def __init__(self, words): + words = sorted(words, key=lambda x: len(x), reverse=True) + self.words = set(words) + if words: + pat = "|".join(re.escape(w) for w in words) + "|." + else: + pat = "." + self.pat = re.compile(pat, flags=re.DOTALL) + + def iter_words(self, text): + s = [] + words = self.words + for m in self.pat.finditer(text): + t = m.group(0) + if t in words: + if s: + yield (False, "".join(s)) + s = [] + yield (True, t) + else: + s.append(t) + if s: + yield (False, "".join(s)) + + def iter(self, text): + for m in self.pat.finditer(text): + yield m.group(0) + + +def iter_substrings(s, minlen, maxlen): + len_s = len(s) + maxlen = min(len_s, maxlen) + for n in range(minlen, maxlen + 1): + for begin in range(0, len_s - n + 1): + yield s[begin : begin + n] + + +def compute_huffman_coding(translations, compression_filename): + texts = [t[1] for t in translations] + words = [] + + start_unused = 0x80 + end_unused = 0xFF + max_ord = 0 + for text in texts: + for c in text: + ord_c = ord(c) + max_ord = max(ord_c, max_ord) + if 0x80 <= ord_c < 0xFF: + end_unused = min(ord_c, end_unused) + max_words = end_unused - 0x80 + + bits_per_codepoint = 16 if max_ord > 255 else 8 + values_type = "uint16_t" if max_ord > 255 else "uint8_t" + while len(words) < max_words: + # Until the dictionary is filled to capacity, use a heuristic to find + # the best "word" (2- to 11-gram) to add to it. + # + # The TextSplitter allows us to avoid considering parts of the text + # that are already covered by a previously chosen word, for example + # if "the" is in words then not only will "the" not be considered + # again, neither will "there" or "wither", since they have "the" + # as substrings. + extractor = TextSplitter(words) + counter = collections.Counter() + for t in texts: + for atom in extractor.iter(t): + counter[atom] += 1 + cb = huffman.codebook(counter.items()) + lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items()) + + def bit_length(s): + return sum(len(cb[c]) for c in s) + + def est_len(occ): + idx = bisect.bisect_left(lengths, (occ, 0)) + return lengths[idx][1] + 1 + + # The cost of adding a dictionary word is just its storage size + # while its savings is close to the difference between the original + # huffman bit-length of the string and the estimated bit-length + # of the dictionary word, times the number of times the word appears. + # + # The savings is not strictly accurate because including a word into + # the Huffman tree bumps up the encoding lengths of all words in the + # same subtree. In the extreme case when the new word is so frequent + # that it gets a one-bit encoding, all other words will cost an extra + # bit each. This is empirically modeled by the constant factor added to + # cost, but the specific value used isn't "proven" to be correct. + # + # Another source of inaccuracy is that compressed strings end up + # on byte boundaries, not bit boundaries, so saving 1 bit somewhere + # might not save a byte. + # + # In fact, when this change was first made, some translations (luckily, + # ones on boards not at all close to full) wasted up to 40 bytes, + # while the most constrained boards typically gained 100 bytes or + # more. + # + # The difference between the two is the estimated net savings, in bits. + def est_net_savings(s, occ): + savings = occ * (bit_length(s) - est_len(occ)) + cost = len(s) * bits_per_codepoint + 24 + return savings - cost + + counter = collections.Counter() + for t in texts: + for (found, word) in extractor.iter_words(t): + if not found: + for substr in iter_substrings(word, minlen=2, maxlen=11): + counter[substr] += 1 + + # Score the candidates we found. This is a semi-empirical formula that + # attempts to model the number of bits saved as closely as possible. + # + # It attempts to compute the codeword lengths of the original word + # to the codeword length the dictionary entry would get, times + # the number of occurrences, less the ovehead of the entries in the + # words[] array. + + scores = sorted( + ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1] + ) + + # Pick the one with the highest score. The score must be negative. + if not scores or scores[0][-1] >= 0: + break + + word = scores[0][0] + words.append(word) + + words.sort(key=len) + extractor = TextSplitter(words) + counter = collections.Counter() + for t in texts: + for atom in extractor.iter(t): + counter[atom] += 1 + cb = huffman.codebook(counter.items()) + + word_start = start_unused + word_end = word_start + len(words) - 1 + print("// # words", len(words)) + print("// words", words) + + values = [] + length_count = {} + renumbered = 0 + last_length = None + canonical = {} + for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])): + values.append(atom) + length = len(code) + if length not in length_count: + length_count[length] = 0 + length_count[length] += 1 + if last_length: + renumbered <<= length - last_length + canonical[atom] = "{0:0{width}b}".format(renumbered, width=length) + # print(f"atom={repr(atom)} code={code}", file=sys.stderr) + if len(atom) > 1: + o = words.index(atom) + 0x80 + s = "".join(C_ESCAPES.get(ch1, ch1) for ch1 in atom) + else: + s = C_ESCAPES.get(atom, atom) + o = ord(atom) + print("//", o, s, counter[atom], canonical[atom], renumbered) + renumbered += 1 + last_length = length + lengths = bytearray() + print("// length count", length_count) + + for i in range(1, max(length_count) + 2): + lengths.append(length_count.get(i, 0)) + print("// values", values, "lengths", len(lengths), lengths) + + print("//", values, lengths) + values = [(atom if len(atom) == 1 else chr(0x80 + words.index(atom))) for atom in values] + max_translation_encoded_length = max( + len(translation.encode("utf-8")) for (original, translation) in translations + ) + + maxlen = len(words[-1]) + minlen = len(words[0]) + wlencount = [len([None for w in words if len(w) == l]) for l in range(minlen, maxlen + 1)] + + with open(compression_filename, "w") as f: + f.write("typedef {} mchar_t;".format(values_type)) + f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths)))) + f.write( + "const mchar_t values[] = {{ {} }};\n".format(", ".join(str(ord(u)) for u in values)) + ) + f.write( + "#define compress_max_length_bits ({})\n".format( + max_translation_encoded_length.bit_length() + ) + ) + f.write( + "const mchar_t words[] = {{ {} }};\n".format( + ", ".join(str(ord(c)) for w in words for c in w) + ) + ) + f.write( + "const uint8_t wlencount[] = {{ {} }};\n".format(", ".join(str(p) for p in wlencount)) + ) + f.write("#define word_start {}\n".format(word_start)) + f.write("#define word_end {}\n".format(word_end)) + f.write("#define minlen {}\n".format(minlen)) + f.write("#define maxlen {}\n".format(maxlen)) + + return (values, lengths, words, canonical, extractor) + + +def decompress(encoding_table, encoded, encoded_length_bits): + (values, lengths, words, _, _) = encoding_table + dec = [] + this_byte = 0 + this_bit = 7 + b = encoded[this_byte] + bits = 0 + for i in range(encoded_length_bits): + bits <<= 1 + if 0x80 & b: + bits |= 1 + + b <<= 1 + if this_bit == 0: + this_bit = 7 + this_byte += 1 + if this_byte < len(encoded): + b = encoded[this_byte] + else: + this_bit -= 1 + length = bits + + i = 0 + while i < length: + bits = 0 + bit_length = 0 + max_code = lengths[0] + searched_length = lengths[0] + while True: + bits <<= 1 + if 0x80 & b: + bits |= 1 + + b <<= 1 + bit_length += 1 + if this_bit == 0: + this_bit = 7 + this_byte += 1 + if this_byte < len(encoded): + b = encoded[this_byte] + else: + this_bit -= 1 + if max_code > 0 and bits < max_code: + # print('{0:0{width}b}'.format(bits, width=bit_length)) + break + max_code = (max_code << 1) + lengths[bit_length] + searched_length += lengths[bit_length] + + v = values[searched_length + bits - max_code] + if v >= chr(0x80) and v < chr(0x80 + len(words)): + v = words[ord(v) - 0x80] + i += len(v.encode("utf-8")) + dec.append(v) + return "".join(dec) + + +def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded): + if not isinstance(decompressed, str): + raise TypeError() + (_, _, _, canonical, extractor) = encoding_table + + enc = bytearray(len(decompressed) * 3) + current_bit = 7 + current_byte = 0 + + bits = encoded_length_bits + 1 + for i in range(bits - 1, 0, -1): + if len_translation_encoded & (1 << (i - 1)): + enc[current_byte] |= 1 << current_bit + if current_bit == 0: + current_bit = 7 + current_byte += 1 + else: + current_bit -= 1 + + for atom in extractor.iter(decompressed): + for b in canonical[atom]: + if b == "1": + enc[current_byte] |= 1 << current_bit + if current_bit == 0: + current_bit = 7 + current_byte += 1 + else: + current_bit -= 1 + + if current_bit != 7: + current_byte += 1 + return enc[:current_byte] + + +def qstr_escape(qst): + def esc_char(m): + c = ord(m.group(0)) + try: + name = codepoint2name[c] + except KeyError: + name = "0x%02x" % c + return "_" + name + "_" + + return re.sub(r"[^A-Za-z0-9_]", esc_char, qst) + + +def parse_input_headers(infiles): + qcfgs = {} + qstrs = {} + i18ns = set() + + # add static qstrs + for qstr in static_qstr_list: + # work out the corresponding qstr name + ident = qstr_escape(qstr) + + # don't add duplicates + assert ident not in qstrs + + # add the qstr to the list, with order number to retain original order in file + order = len(qstrs) - 300000 + qstrs[ident] = (order, ident, qstr) + + # read the qstrs in from the input files + for infile in infiles: + with open(infile, "rt") as f: + for line in f: + line = line.strip() + + # is this a config line? + match = re.match(r"^QCFG\((.+), (.+)\)", line) + if match: + value = match.group(2) + if value[0] == "(" and value[-1] == ")": + # strip parenthesis from config value + value = value[1:-1] + qcfgs[match.group(1)] = value + continue + + match = re.match(r'^TRANSLATE\("(.*)"\)$', line) + if match: + i18ns.add(match.group(1)) + continue + + # is this a QSTR line? + match = re.match(r"^Q\((.*)\)$", line) + if not match: + continue + + # get the qstr value + qstr = match.group(1) + + # special cases to specify control characters + if qstr == "\\n": + qstr = "\n" + elif qstr == "\\r\\n": + qstr = "\r\n" + + # work out the corresponding qstr name + ident = qstr_escape(qstr) + + # don't add duplicates + if ident in qstrs: + continue + + # add the qstr to the list, with order number to retain original order in file + order = len(qstrs) + # but put special method names like __add__ at the top of list, so + # that their id's fit into a byte + if ident == "": + # Sort empty qstr above all still + order = -200000 + elif ident == "__dir__": + # Put __dir__ after empty qstr for builtin dir() to work + order = -190000 + elif ident.startswith("__"): + order -= 100000 + qstrs[ident] = (order, ident, qstr) + + if not qcfgs and qstrs: + sys.stderr.write("ERROR: Empty preprocessor output - check for errors above\n") + sys.exit(1) + + return qcfgs, qstrs, i18ns + + +def escape_bytes(qstr): + if all(32 <= ord(c) <= 126 and c != "\\" and c != '"' for c in qstr): + # qstr is all printable ASCII so render it as-is (for easier debugging) + return qstr + else: + # qstr contains non-printable codes so render entire thing as hex pairs + qbytes = bytes_cons(qstr, "utf8") + return "".join(("\\x%02x" % b) for b in qbytes) + + +def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr): + qbytes = bytes_cons(qstr, "utf8") + qlen = len(qbytes) + qhash = compute_hash(qbytes, cfg_bytes_hash) + if qlen >= (1 << (8 * cfg_bytes_len)): + print("qstr is too long:", qstr) + assert False + qdata = escape_bytes(qstr) + return '%d, %d, "%s"' % (qhash, qlen, qdata) + + +def print_qstr_data(encoding_table, qcfgs, qstrs, i18ns): + # get config variables + cfg_bytes_len = int(qcfgs["BYTES_IN_LEN"]) + cfg_bytes_hash = int(qcfgs["BYTES_IN_HASH"]) + + # print out the starter of the generated C header file + print("// This file was automatically generated by makeqstrdata.py") + print("") + + # add NULL qstr with no hash or data + print('QDEF(MP_QSTRnull, 0, 0, "")') + + total_qstr_size = 0 + total_qstr_compressed_size = 0 + # go through each qstr and print it out + for order, ident, qstr in sorted(qstrs.values(), key=lambda x: x[0]): + qbytes = make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr) + print("QDEF(MP_QSTR_%s, %s)" % (ident, qbytes)) + + total_qstr_size += len(qstr) + + total_text_size = 0 + total_text_compressed_size = 0 + max_translation_encoded_length = max( + len(translation.encode("utf-8")) for original, translation in i18ns + ) + encoded_length_bits = max_translation_encoded_length.bit_length() + for original, translation in i18ns: + translation_encoded = translation.encode("utf-8") + compressed = compress( + encoding_table, translation, encoded_length_bits, len(translation_encoded) + ) + total_text_compressed_size += len(compressed) + decompressed = decompress(encoding_table, compressed, encoded_length_bits) + assert decompressed == translation + for c in C_ESCAPES: + decompressed = decompressed.replace(c, C_ESCAPES[c]) + print( + 'TRANSLATION("{}", {}) // {}'.format( + original, ", ".join(["{:d}".format(x) for x in compressed]), decompressed + ) + ) + total_text_size += len(translation.encode("utf-8")) + + print() + print("// {} bytes worth of qstr".format(total_qstr_size)) + print("// {} bytes worth of translations".format(total_text_size)) + print("// {} bytes worth of translations compressed".format(total_text_compressed_size)) + print("// {} bytes saved".format(total_text_size - total_text_compressed_size)) + + +def print_qstr_enums(qstrs): + # print out the starter of the generated C header file + print("// This file was automatically generated by makeqstrdata.py") + print("") + + # add NULL qstr with no hash or data + print("QENUM(MP_QSTRnull)") + + # go through each qstr and print it out + for order, ident, qstr in sorted(qstrs.values(), key=lambda x: x[0]): + print("QENUM(MP_QSTR_%s)" % (ident,)) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Process QSTR definitions into headers for compilation" + ) + parser.add_argument( + "infiles", metavar="N", type=str, nargs="+", help="an integer for the accumulator" + ) + parser.add_argument( + "--translation", default=None, type=str, help="translations for i18n() items" + ) + parser.add_argument( + "--compression_filename", default=None, type=str, help="header for compression info" + ) + + args = parser.parse_args() + + qcfgs, qstrs, i18ns = parse_input_headers(args.infiles) + if args.translation: + i18ns = sorted(i18ns) + translations = translate(args.translation, i18ns) + encoding_table = compute_huffman_coding(translations, args.compression_filename) + print_qstr_data(encoding_table, qcfgs, qstrs, translations) + else: + print_qstr_enums(qstrs) |