util/generate_unicode_database.py

#!/usr/bin/env python3
# Portions copyright (c) Facebook, Inc. and its affiliates. (http://www.facebook.com)
"""Generate Unicode name, property, and type databases.

This script converts Unicode 3.2 and 11.0 database files into a Pyro-specific
format. Run the script using Python version >= 3.7, from this directory,
without args.

The algorithm was taken from Fredrik Lundh's script for CPython.
"""
import os
import sys
import zipfile
from dataclasses import dataclass


SCRIPT = os.path.relpath(__file__)
PYRO_DIR = os.path.normpath(os.path.join(SCRIPT, "..", ".."))
RUNTIME_DIR = os.path.join(PYRO_DIR, "runtime")

HEADER = "unicode-db.h"
HEADER_PATH = os.path.join(RUNTIME_DIR, HEADER)
DB_PATH = os.path.join(RUNTIME_DIR, "unicode-db.cpp")

LINE_WIDTH = 80

NUM_CODE_POINTS = sys.maxunicode + 1
CODE_POINTS = range(NUM_CODE_POINTS)

UNIDATA_VERSION = "11.0.0"
OLD_VERSION = "3.2.0"

# UCD files
CASE_FOLDING = "CaseFolding.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps.txt"
EASTASIAN_WIDTH = "EastAsianWidth.txt"
LINE_BREAK = "LineBreak.txt"
NAME_ALIASES = "NameAliases.txt"
NAMED_SEQUENCES = "NamedSequences.txt"
SPECIAL_CASING = "SpecialCasing.txt"
UNICODE_DATA = "UnicodeData.txt"
UNIHAN = "Unihan.zip"

# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

# Longest decomposition in Unicode {UNIDATA_VERSION}: U+FDFA
MAX_DECOMPOSITION = 18

CATEGORY_NAMES = [
    "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", "Nl", "No", "Zs", "Zl",
    "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", "Lo", "Pc", "Pd", "Ps", "Pe",
    "Pi", "Pf", "Po", "Sm", "Sc", "Sk", "So",
]

BIDIRECTIONAL_NAMES = [
    "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", "PDF", "EN", "ES", "ET",
    "AN", "CS", "NSM", "BN", "B", "S", "WS", "ON", "LRI", "RLI", "FSI", "PDI",
]

EASTASIANWIDTH_NAMES = ["F", "H", "W", "Na", "A", "N"]

MANDATORY_LINE_BREAKS = ["BK", "CR", "LF", "NL"]

ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000


def open_data(filename, version):
    ucd_dir = os.path.join(PYRO_DIR, "third-party", f"ucd-{version}")
    if not os.path.exists(ucd_dir):
        os.mkdir(ucd_dir)
    path = os.path.join(ucd_dir, filename)
    if not os.path.exists(path):
        if version == OLD_VERSION:
            # irregular url structure
            root, path = os.path.splitext(filename)
            url = f"http://www.unicode.org/Public/3.2-Update/{root}-{version}{path}"
        else:
            url = f"http://www.unicode.org/Public/{version}/ucd/{filename}"
        raise OSError(
            f"""Cannot open {path}: file does not exist.
Find it online at {url}"""
        )
    if path.endswith(".txt"):
        return open(path, encoding="utf-8")
    else:
        # Unihan.zip
        return open(path, "rb")


########################################################################################
# CJK ranges


@dataclass(frozen=True)
class CJKRange:
    start: int
    end: int


CJK_RANGES = (
    CJKRange(0x3400, 0x4DB5),
    CJKRange(0x4E00, 0x9FEF),
    CJKRange(0x20000, 0x2A6D6),
    CJKRange(0x2A700, 0x2B734),
    CJKRange(0x2B740, 0x2B81D),
    CJKRange(0x2B820, 0x2CEA1),
    CJKRange(0x2CEB0, 0x2EBE0),
)


########################################################################################
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB


# load a unicode-data file from disk
class UnicodeData:
    """Loads a Unicode data file, checking disk first and then internet.

    Taken from the unidb utilities.
    Copyright (c) 1999-2000 by Secret Labs AB
    """

    # Record structure:
    # [ID, name, category, combining, bidi, decomp, (6)
    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)

    def __init__(self, version, check_cjk=True):  # noqa: C901
        self.version = version
        self.aliases = []
        self.case_folding = {}
        self.change_records = []
        self.exclusions = set()
        self.named_sequences = []
        self.normalization_changes = []
        self.special_casing = {}

        self.table = [None] * NUM_CODE_POINTS
        with open_data(UNICODE_DATA, version) as file:
            while True:
                s = file.readline()
                if not s:
                    break
                s = s.strip().split(";")
                char = int(s[0], 16)
                self.table[char] = s

        # expand first-last ranges
        field = None
        cjk_ranges = []
        for char in CODE_POINTS:
            s = self.table[char]
            if s is not None:
                if s[1][-6:] == "First>":
                    s[1] = ""
                    field = s
                elif s[1][-5:] == "Last>":
                    if check_cjk and s[1].startswith("<CJK Ideograph"):
                        start = int(field[0], 16)
                        end = int(s[0], 16)
                        cjk_ranges.append(CJKRange(start, end))
                    s[1] = ""
                    field = None
            elif field:
                f2 = field[:]
                f2[0] = f"{char:X}"
                self.table[char] = f2
        if check_cjk and set(cjk_ranges) != set(CJK_RANGES):
            raise ValueError(f"CJK ranges deviate: found {cjk_ranges!r}")

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != OLD_VERSION:
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            with open_data(NAME_ALIASES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith("#"):
                        continue
                    char, name, abbrev = s.split(";")
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in the PUA 1, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.
            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file:
                    s = s.strip()
                    if not s or s.startswith("#"):
                        continue
                    name, chars = s.split(";")
                    chars = tuple(int(char, 16) for char in chars.split())
                    # check that the structure defined in write_header is OK
                    assert len(chars) <= 4, "change the NamedSequence array size"
                    assert all(c < NUM_CODE_POINTS for c in chars)
                    self.named_sequences.append(NamedSequence(name, chars))
                    # also store these in the PUA 1
                    self.table[pua_index][1] = name
                    pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = set()
        with open_data(COMPOSITION_EXCLUSIONS, version) as file:
            for s in file:
                s = s.strip()
                if s and not s.startswith("#"):
                    char = int(s.split()[0], 16)
                    self.exclusions.add(char)

        widths = [None] * NUM_CODE_POINTS
        with open_data(EASTASIAN_WIDTH, version) as file:
            for s in file:
                s = s.strip()
                if not s:
                    continue
                if s[0] == "#":
                    continue
                s = s.split()[0].split(";")
                if ".." in s[0]:
                    first, last = [int(c, 16) for c in s[0].split("..")]
                    chars = list(range(first, last + 1))
                else:
                    chars = [int(s[0], 16)]
                for char in chars:
                    widths[char] = s[1]

        for char in CODE_POINTS:
            if self.table[char] is not None:
                self.table[char].append(widths[char])
                self.table[char].append(set())

        with open_data(DERIVED_CORE_PROPERTIES, version) as file:
            for s in file:
                s = s.split("#", 1)[0].strip()
                if not s:
                    continue
                r, p = s.split(";")
                r = r.strip()
                p = p.strip()
                if ".." in r:
                    first, last = [int(c, 16) for c in r.split("..")]
                    chars = list(range(first, last + 1))
                else:
                    chars = [int(r, 16)]
                for char in chars:
                    if self.table[char]:
                        # Some properties (e.g. Default_Ignorable_Code_Point)
                        # apply to unassigned code points; ignore them
                        self.table[char][-1].add(p)

        with open_data(LINE_BREAK, version) as file:
            for s in file:
                s = s.partition("#")[0]
                s = [i.strip() for i in s.split(";")]
                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
                    continue
                if ".." not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split("..")]
                for char in range(first, last + 1):
                    self.table[char][-1].add("Line_Break")

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur
        # In 3.2.0, the format was different (NF?_NO)
        # The parsing will incorrectly determine these as
        # "yes", however, unicodedata.c will not perform quickchecks
        # for older versions, and no delta records will be created.
        quickchecks = [0] * NUM_CODE_POINTS
        qc_order = "NFD_QC NFKD_QC NFC_QC NFKC_QC".split()
        with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
            for s in file:
                if "#" in s:
                    s = s[: s.index("#")]
                s = [i.strip() for i in s.split(";")]
                if len(s) < 2 or s[1] not in qc_order:
                    continue
                quickcheck = "MN".index(s[2]) + 1  # Maybe or No
                quickcheck_shift = qc_order.index(s[1]) * 2
                quickcheck <<= quickcheck_shift
                if ".." not in s[0]:
                    first = last = int(s[0], 16)
                else:
                    first, last = [int(c, 16) for c in s[0].split("..")]
                for char in range(first, last + 1):
                    assert not (quickchecks[char] >> quickcheck_shift) & 3
                    quickchecks[char] |= quickcheck

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == OLD_VERSION:
                data = zip.open(f"Unihan-{OLD_VERSION}.txt").read()
            else:
                data = zip.open("Unihan_NumericValues.txt").read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith("U+"):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ("kAccountingNumeric", "kPrimaryNumeric", "kOtherNumeric"):
                continue
            value = value.strip().replace(",", "")
            i = int(code[2:], 16)
            # Patch the numeric field
            if self.table[i] is not None:
                self.table[i][8] = value

        sc = self.special_casing = {}
        with open_data(SPECIAL_CASING, version) as file:
            for s in file:
                s = s[:-1].split("#", 1)[0]
                if not s:
                    continue
                data = s.split("; ")
                if data[4]:
                    # We ignore all conditionals (since they depend on
                    # languages) except for one, which is hardcoded. See
                    # handle_capital_sigma in unicodeobject.c.
                    continue
                c = int(data[0], 16)
                lower = [int(char, 16) for char in data[1].split()]
                title = [int(char, 16) for char in data[2].split()]
                upper = [int(char, 16) for char in data[3].split()]
                sc[c] = (lower, title, upper)

        cf = self.case_folding = {}
        if version != OLD_VERSION:
            with open_data(CASE_FOLDING, version) as file:
                for s in file:
                    s = s[:-1].split("#", 1)[0]
                    if not s:
                        continue
                    data = s.split("; ")
                    if data[1] in "CF":
                        c = int(data[0], 16)
                        cf[c] = [int(char, 16) for char in data[2].split()]

        for char in CODE_POINTS:
            if self.table[char] is not None:
                self.table[char].append(quickchecks[char])

    def merge(self, old):  # noqa: C901
        # Changes to exclusion file not implemented yet
        if old.exclusions != self.exclusions:
            raise NotImplementedError("exclusions differ")

        # In these change records, 0xFF means "no change"
        bidir_changes = [0xFF] * NUM_CODE_POINTS
        category_changes = [0xFF] * NUM_CODE_POINTS
        decimal_changes = [0xFF] * NUM_CODE_POINTS
        east_asian_width_changes = [0xFF] * NUM_CODE_POINTS
        mirrored_changes = [0xFF] * NUM_CODE_POINTS
        # In numeric data, 0 means "no change",
        # -1 means "did not have a numeric value"
        numeric_changes = [0.0] * NUM_CODE_POINTS
        # normalization_changes is a list of key-value pairs
        normalization_changes = []
        for char in CODE_POINTS:
            if self.table[char] is None:
                # Characters unassigned in the new version ought to
                # be unassigned in the old one
                assert old.table[char] is None
                continue
            # check characters unassigned in the old version
            if old.table[char] is None:
                # category 0 is "unassigned"
                category_changes[char] = 0
                continue
            # check characters that differ
            if old.table[char] != self.table[char]:
                for k in range(len(old.table[char])):
                    if old.table[char][k] != self.table[char][k]:
                        value = old.table[char][k]
                        if k == 1 and char in PUA_15:
                            # the name is not set in the old.table, but in the
                            # self.table we are using it for aliases and named seq
                            assert value == ""
                        elif k == 2:
                            category_changes[char] = CATEGORY_NAMES.index(value)
                        elif k == 4:
                            bidir_changes[char] = BIDIRECTIONAL_NAMES.index(value)
                        elif k == 5:
                            # We assume all normalization changes are in 1:1 mappings
                            assert " " not in value
                            normalization_changes.append((char, value))
                        elif k == 6:
                            # only support changes where the old value is a single digit
                            assert value in "0123456789"
                            decimal_changes[char] = int(value)
                        elif k == 8:
                            # Since 0 encodes "no change", the old value is better not 0
                            if not value:
                                numeric_changes[char] = -1.0
                            else:
                                numeric_changes[char] = float(value)
                                assert numeric_changes[char] not in (0, -1)
                        elif k == 9:
                            if value == "Y":
                                mirrored_changes[char] = 1
                            else:
                                mirrored_changes[char] = 0
                        elif k == 11:
                            # change to ISO comment, ignore
                            pass
                        elif k == 12:
                            # change to simple uppercase mapping; ignore
                            pass
                        elif k == 13:
                            # change to simple lowercase mapping; ignore
                            pass
                        elif k == 14:
                            # change to simple titlecase mapping; ignore
                            pass
                        elif k == 15:
                            # change to east asian width
                            east_asian_width_changes[char] = EASTASIANWIDTH_NAMES.index(
                                value
                            )
                        elif k == 16:
                            # derived property changes; not yet
                            pass
                        elif k == 17:
                            # normalization quickchecks are not performed
                            # for older versions
                            pass
                        else:

                            class Difference(Exception):
                                pass

                            raise Difference(
                                hex(char), k, old.table[char], self.table[char]
                            )
        self.change_records = zip(
            bidir_changes,
            category_changes,
            decimal_changes,
            east_asian_width_changes,
            mirrored_changes,
            numeric_changes,
        )
        self.normalization_changes = normalization_changes


########################################################################################
# Helper functions & classes


def getsize(data):
    """Return the smallest possible integer size for the given array."""
    maxdata = max(data)
    if maxdata < (1 << 8):
        return 1
    if maxdata < (1 << 16):
        return 2
    return 4


def splitbins(t, trace=False):  # noqa: C901
    """t, trace=False -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C code),
    and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
""" if trace: def dump(t1, t2, shift, bytes): sys.stderr.write( f"{len(t1)}+{len(t2)} bins at shift {shift}; {bytes} bytes\n" ) sys.stderr.write(f"Size of original table: {len(t) * getsize(t)} bytes\n") n = len(t) - 1 # last valid index maxshift = 0 # the most we can shift n and still have something left if n > 0: while n >> 1: n >>= 1 maxshift += 1 del n bytes = sys.maxsize # smallest total size so far t = tuple(t) # so slices can be dict keys for shift in range(maxshift + 1): t1 = [] t2 = [] size = 2 ** shift bincache = {} for i in range(0, len(t), size): bin = t[i : i + size] index = bincache.get(bin) if index is None: index = len(t2) bincache[bin] = index t2.extend(bin) t1.append(index >> shift) # determine memory size b = len(t1) * getsize(t1) + len(t2) * getsize(t2) if trace: dump(t1, t2, shift, b) if b < bytes: best = t1, t2, shift bytes = b t1, t2, shift = best if trace: sys.stderr.write(f"Best: ") dump(t1, t2, shift, bytes) if __debug__: # exhaustively verify that the decomposition is correct mask = ~((~0) << shift) # i.e., low-bit mask of shift bits for i in range(len(t)): assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)] return best @dataclass class WordData: word: str index: int frequency: int class Words: def __init__(self, names, trace=False): self.indices = {} self.frequencies = {} count = num_chars = num_words = 0 for char in CODE_POINTS: name = names[char] if name is None: continue words = name.split() num_chars += len(name) num_words += len(words) for word in words: if word in self.indices: self.frequencies[word] += 1 else: self.frequencies[word] = 1 self.indices[word] = count count += 1 if trace: print(f"{num_words} words in text; {num_chars} code points") def tolist(self): return sorted( ( WordData(word, index, self.frequencies[word]) for word, index in self.indices.items() ), key=lambda data: (-data.frequency, data.word), ) ######################################################################################## # Classes for database tables @dataclass(frozen=True) class ChangeRecord: bidirectional: int category: int decimal: int east_asian_width: int mirrored: int numeric: float def __str__(self): return f"\ {{{self.bidirectional:#04x}, {self.category:#04x}, {self.decimal:#04x}, \ {self.east_asian_width:#04x}, {self.mirrored:#04x}, {self.numeric}}}" @dataclass(frozen=True) class DatabaseRecord: bidirectional: int category: int combining: int east_asian_width: int mirrored: bool quick_check: int def __str__(self): mirrored = "true" if self.mirrored else "false" return f"{{{self.bidirectional:#04x}, {self.category:#04x}, \ {self.combining:#04x}, {self.east_asian_width:#04x}, {mirrored}, \ {self.quick_check:#04x}}}" @dataclass(frozen=True) class NamedSequence: name: int seq: int def __str__(self): seq_str = ", ".join(f"{elt:#04x}" for elt in self.seq) return f"{{{len(self.seq)}, {{{seq_str}}}}}" @dataclass(frozen=True) class Reindex: start: int count: int index: int def __str__(self): return f"{{{self.start:#010x}, {self.count}, {self.index}}}" @dataclass(frozen=True) class TypeRecord: decimal: int digit: int flags: int lower: int title: int upper: int def __str__(self): return f"{{{self.decimal}, {self.digit}, {self.flags:#06x}, \ {self.lower:#x}, {self.title:#x}, {self.upper:#x}}}" class CodePointArray: """Arrays of code points, printed as hex.""" def __init__(self, name, data): self.name = name self.data = data def dump(self, file, trace=False): if trace: sys.stderr.write(f"{self.name}: {4 * len(self.data)} bytes\n") file.write(f"\nstatic const int32_t 
{self.name}[] = {{") columns = (LINE_WIDTH - 3) // 12 for i in range(0, len(self.data), columns): line = "\n " for item in self.data[i : i + columns]: line = f"{line} {item:#010x}," file.write(line) file.write("\n};\n") class SmallStrArray: def __init__(self, name, data, comment=None): self.name = name self.data = data self.comment = comment def dump(self, file, trace=False): if self.comment is not None: file.write(f"\n// {self.comment}") file.write(f"\nconst RawSmallStr {self.name}[] = {{") if self.data: file.write("\n") for item in self.data: file.write(f' RawSmallStr::fromCStr("{item}"),\n') file.write("};\n") class StructArray: """Store an array for writing to a generated C/C++ file.""" def __init__(self, ctype, name, data, comment=None): self.type = ctype self.name = name self.data = data self.comment = comment def dump(self, file, trace=False): if self.comment is not None: file.write(f"\n// {self.comment}") file.write(f"\nstatic const {self.type} {self.name}[] = {{") if self.data: file.write("\n") for item in self.data: file.write(f" {item},\n") file.write("};\n") class UIntArray: """Unsigned integer arrays, printed as hex.""" def __init__(self, name, data): self.name = name self.data = data self.size = getsize(data) def dump(self, file, trace=False): if trace: sys.stderr.write(f"{self.name}: {self.size * len(self.data)} bytes\n") file.write(f"\nstatic const uint{8 * self.size}_t {self.name}[] = {{") columns = (LINE_WIDTH - 3) // (2 * self.size + 4) for i in range(0, len(self.data), columns): line = "\n " for item in self.data[i : i + columns]: if self.size == 1: line = f"{line} {item:#04x}," elif self.size == 2: line = f"{line} {item:#06x}," else: line = f"{line} {item:#010x}," file.write(line) file.write("\n};\n") ######################################################################################## # Reimplementation of dictionaries using static structures and a custom string hash # Number chosen to minimize the number of collisions. CPython uses 47. 
MAGIC = 43


def myhash(s):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * MAGIC) + c
        if h > 0x00FFFFFF:
            h = (h & 0x00FFFFFF) ^ ((h & 0xFF000000) >> 24)
    return h


SIZES = [
    (4, 3), (8, 3), (16, 3), (32, 5), (64, 3), (128, 3), (256, 29), (512, 17),
    (1024, 9), (2048, 5), (4096, 83), (8192, 27), (16384, 43), (32768, 3),
    (65536, 45), (131072, 9), (262144, 39), (524288, 39), (1048576, 9),
    (2097152, 5), (4194304, 3), (8388608, 33), (16777216, 27),
]


class Hash:
    def __init__(self, data):
        """Turn a (key, value) list into a static hash table structure."""
        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                self.poly = poly + size
                self.size = size
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(self.size, "slots in hash table")

        table = [None] * self.size
        mask = self.size - 1

        # initialize hash table
        collisions = 0
        for key, value in data:
            h = myhash(key)
            i = (~h) & mask
            if table[i] is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while True:
                collisions += 1
                i = (i + incr) & mask
                if table[i] is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ self.poly
        print(collisions, "collisions")

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = CodePointArray("kCodeHash", table)

    def dump(self, file, trace):
        self.data.dump(file, trace)
        file.write(
            f"""
static const int32_t kCodeMagic = {MAGIC};
static const int32_t kCodeMask = {self.size - 1};
static const int32_t kCodePoly = {self.poly};
"""
        )


########################################################################################
# Unicode database writers


def write_header(unicode, header):
    """Writes type declarations to the database header file."""
    header.write("// @")
    header.write(
        f"""\
generated by {os.path.basename(SCRIPT)}

#pragma once

#include <cstdint>

#include "globals.h"
#include "objects.h"
#include "unicode.h"

namespace py {{

static const int kMaxNameLength = 256;

// Longest decomposition in Unicode {UNIDATA_VERSION}: U+FDFA
static const int kMaxDecomposition = {MAX_DECOMPOSITION};

static_assert(Unicode::kAliasStart == {NAME_ALIASES_START:#x},
              "Unicode aliases start at unexpected code point");
static_assert(Unicode::kAliasCount == {len(unicode.aliases)},
              "Unexpected number of Unicode aliases");
static_assert(Unicode::kNamedSequenceStart == {NAMED_SEQUENCES_START:#x},
              "Unicode named sequences start at unexpected code point");
static_assert(Unicode::kNamedSequenceCount == {len(unicode.named_sequences)},
              "Unexpected number of Unicode named sequences");

enum NormalizationForm : byte {{
  kInvalid = 0,
  kNFD = 0x3,
  kNFKD = 0xc,
  kNFC = 0x30,
  kNFKC = 0xc0,
}};

enum : int32_t {{
  kAlphaMask = {ALPHA_MASK:#x},
  kDecimalMask = {DECIMAL_MASK:#x},
  kDigitMask = {DIGIT_MASK:#x},
  kLowerMask = {LOWER_MASK:#x},
  kLinebreakMask = {LINEBREAK_MASK:#x},
  kSpaceMask = {SPACE_MASK:#x},
  kTitleMask = {TITLE_MASK:#x},
  kUpperMask = {UPPER_MASK:#x},
  kXidStartMask = {XID_START_MASK:#x},
  kXidContinueMask = {XID_CONTINUE_MASK:#x},
  kPrintableMask = {PRINTABLE_MASK:#x},
  kNumericMask = {NUMERIC_MASK:#x},
  kCaseIgnorableMask = {CASE_IGNORABLE_MASK:#x},
  kCasedMask = {CASED_MASK:#x},
  kExtendedCaseMask = {EXTENDED_CASE_MASK:#x},
}};

struct UnicodeChangeRecord {{
  const byte bidirectional;
  const byte category;
  const byte decimal;
  const byte east_asian_width;
  const byte mirrored;
  const double numeric;
}};

struct UnicodeDatabaseRecord {{
  const byte bidirectional;
  const byte category;
  const byte combining;  // canonical combining class
  const byte east_asian_width;
  const bool mirrored;
  const byte quick_check;
}};

struct UnicodeDecomposition {{
  const char* prefix;
  const int count;
  const int32_t* code_points;
}};

struct UnicodeNamedSequence {{
  const byte length;
  const int32_t code_points[4];
}};

struct UnicodeTypeRecord {{
  // Note: if more flag space is needed, decimal and digit could be unified
  const int8_t decimal;
  const int8_t digit;
  const int16_t flags;
  // Deltas to the character or offsets in kExtendedCase
  const int32_t lower;
  const int32_t title;
  const int32_t upper;
}};

extern const RawSmallStr kBidirectionalNames[];
extern const RawSmallStr kCategoryNames[];
extern const RawSmallStr kEastAsianWidthNames[];

// Get a code point from its Unicode name.
// Returns the code point if the lookup succeeds, -1 if it fails.
int32_t codePointFromName(const byte* name, word size);
int32_t codePointFromNameOrNamedSequence(const byte* name, word size);

// Returns the NFC composition given the NFC first and last indices.
int32_t composeCodePoint(int32_t first, int32_t last);

// Returns the decomposition mapping of the code point.
UnicodeDecomposition decomposeCodePoint(int32_t code_point);

// Returns the case mapping for code points where offset is insufficient
int32_t extendedCaseMapping(int32_t index);

// Finds the first/last character of an NFC sequence containing the code point.
int32_t findNFCFirst(int32_t code_point);
int32_t findNFCLast(int32_t code_point);

// Write the Unicode name for the given code point into the buffer.
// Returns true if the name was written successfully, false otherwise.
bool nameFromCodePoint(int32_t code_point, byte* buffer, word size);

// Returns the normalization of the code point in Unicode {OLD_VERSION}, if it differs
// from the current version. If the normalization is unchanged, returns -1.
int32_t normalizeOld(int32_t code_point);

// Returns the numeric value of the code point, or -1.0 if not numeric.
double numericValue(int32_t code_point);

// Returns true if the code point has one of the line break properties "BK",
// "CR", "LF", or "NL" or the bidirectional type "B". Returns false otherwise.
bool unicodeIsLinebreak(int32_t code_point);

// Returns true if the code point has the bidirectional type "WS", "B", or "S"
// or the category "Zs". Returns false otherwise.
bool unicodeIsWhitespace(int32_t code_point);

const UnicodeChangeRecord* changeRecord(int32_t code_point);
const UnicodeDatabaseRecord* databaseRecord(int32_t code_point);
const UnicodeNamedSequence* namedSequence(int32_t code_point);
const UnicodeTypeRecord* typeRecord(int32_t code_point);

}}  // namespace py
"""
    )


def write_db_prelude(db):
    """Writes required include directives to the database file."""
    db.write("// @")
    db.write(
        f"""\
generated by {os.path.basename(SCRIPT)}

#include "{HEADER}"

#include <cinttypes>
#include <cstdio>
#include <cstring>

#include "asserts.h"
#include "globals.h"
#include "objects.h"
#include "unicode.h"

namespace py {{
"""
    )


def write_database_records(unicode, db, trace):  # noqa: C901
    dummy = DatabaseRecord(0, 0, 0, 0, False, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * NUM_CODE_POINTS

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * NUM_CODE_POINTS
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * NUM_CODE_POINTS
    comp_last = [None] * NUM_CODE_POINTS

    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            # database properties
            item = DatabaseRecord(
                BIDIRECTIONAL_NAMES.index(record[4]),
                CATEGORY_NAMES.index(record[2]),
                int(record[3]),
                EASTASIANWIDTH_NAMES.index(record[15]),
                record[9] == "Y",
                record[17],
            )
            idx = cache.get(item)
            if idx is None:
                cache[item] = idx = len(table)
                table.append(item)
            index[char] = idx

            # decomposition data
            decomp_idx = 0
            if record[5]:
                decomp = record[5].split()
                prefix = decomp.pop(0) if decomp[0][0] == "<" else ""
                if len(decomp) > MAX_DECOMPOSITION:
                    raise Exception(
                        f"decomposition of code point {char:#x} is too large"
                    )
                try:
                    idx = decomp_prefix.index(prefix)
                except ValueError:
                    idx = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                assert idx < 256
                decomp = [idx + (len(decomp) << 8)] + [int(s, 16) for s in decomp]
                if (
                    not idx
                    and len(decomp) == 3
                    and char not in unicode.exclusions
                    and unicode.table[decomp[1]][3] == "0"
                ):
                    _, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l, r, char))
                try:
                    decomp_idx = decomp_data.index(decomp)
                except ValueError:
                    decomp_idx = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            decomp_index[char] = decomp_idx

    first = last = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_first = prev_last = None
    for ch in CODE_POINTS:
        if comp_first[ch] is not None:
            comp_first[ch] = first
            first += 1
            if prev_first is None:
                prev_first = (ch, ch)
            elif prev_first[1] + 1 == ch:
                prev_first = prev_first[0], ch
            else:
                comp_first_ranges.append(prev_first)
                prev_first = (ch, ch)
        if comp_last[ch] is not None:
            comp_last[ch] = last
            last += 1
            if prev_last is None:
                prev_last = (ch, ch)
            elif prev_last[1] + 1 == ch:
                prev_last = prev_last[0], ch
            else:
                comp_last_ranges.append(prev_last)
                prev_last = (ch, ch)
    comp_first_ranges.append(prev_first)
    comp_last_ranges.append(prev_last)
    total_first = first
    total_last = last

    comp_data = [0] * (total_first * total_last)
    for first, last, char in comp_pairs:
        first = comp_first[first]
        last = comp_last[last]
        comp_data[first * total_last + last] = char

    if trace:
        print(len(table), "unique properties")
        print(len(decomp_prefix), "unique decomposition prefixes")
        print(len(decomp_data), "unique decomposition entries:", end=" ")
        print(decomp_size, "bytes")
        print(total_first, "first characters in NFC")
        print(total_last, "last characters in NFC")
        print(len(comp_pairs), "NFC pairs")

    StructArray(
        "UnicodeDatabaseRecord",
        "kDatabaseRecords",
        table,
        "a list of unique database records",
    ).dump(db, trace)

    # split record index table
    index1, index2, shift = splitbins(index, trace)
    db.write(
        f"""
// type indices
static const int kDatabaseIndexShift = {shift};
static const int32_t kDatabaseIndexMask = (int32_t{{1}} << kDatabaseIndexShift) - 1;
"""
    )
    UIntArray("kDatabaseIndex1", index1).dump(db, trace)
    UIntArray("kDatabaseIndex2", index2).dump(db, trace)

    db.write(
        f"""
// Reindexing of NFC first and last characters
struct Reindex {{
  const int32_t start;
  const short count;
  const short index;
}};

static const int kTotalLast = {total_last};
"""
    )
    nfc_first = [
        Reindex(start, end - start, comp_first[start])
        for start, end in comp_first_ranges
    ] + [Reindex(0, 0, 0)]
    nfc_last = [
        Reindex(start, end - start, comp_last[start])
        for start, end in comp_last_ranges
    ] + [Reindex(0, 0, 0)]
    StructArray("Reindex", "kNFCFirst", nfc_first).dump(db, trace)
    StructArray("Reindex", "kNFCLast", nfc_last).dump(db, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)
    db.write(
        f"""
// decomposition mappings
static const int kDecompShift = {shift};
static const int32_t kDecompMask = (int32_t{{1}} << kDecompShift) - 1;

const char* kDecompPrefix[] = {{
"""
    )
    for name in decomp_prefix:
        db.write(f' "{name}",\n')
    db.write("};\n")
    CodePointArray("kDecompData", decomp_data).dump(db, trace)
    UIntArray("kDecompIndex1", index1).dump(db, trace)
    UIntArray("kDecompIndex2", index2).dump(db, trace)

    index1, index2, shift = splitbins(comp_data, trace)
    db.write(
        f"""
// NFC pairs
static const int kCompShift = {shift};
static const int32_t kCompMask = (int32_t{{1}} << kCompShift) - 1;
"""
    )
    UIntArray("kCompIndex", index1).dump(db, trace)
    UIntArray("kCompData", index2).dump(db, trace)


def write_name_data(unicode, db, trace):  # noqa: C901
    # Collect names
    names = [None] * NUM_CODE_POINTS
    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = f"{name}\0"

    if trace:
        print(len([n for n in names if n is not None]), "distinct names")

    # Collect unique words from names. Note that we differ between
    # words ending a sentence, which include the trailing null byte,
    # and words inside a sentence, which do not.
    words = Words(names, trace)
    wordlist = words.tolist()

    # Figure out how many phrasebook escapes we need
    short = (65536 - len(wordlist)) // 256
    assert short > 0
    if trace:
        print(short, "short indexes in lexicon")
        n = sum(wordlist[i].frequency for i in range(short))
        print(n, "short indexes in phrasebook")

    # Pick the most commonly used words, and sort the rest by falling
    # length to maximize overlap.
    tail = wordlist[short:]
    tail.sort(key=lambda data: len(data.word), reverse=True)
    wordlist[short:] = tail

    # Build a lexicon string from words
    current_offset = 0
    lexicon_offset = [0]
    lexicon_string = ""
    word_offsets = {}
    for data in wordlist:
        # Encoding: bit 7 indicates last character in word
        # chr(128) indicates the last character in an entire string)
        last = ord(data.word[-1])
        encoded = f"{data.word[:-1]}{chr(last + 128)}"

        # reuse string tails, when possible
        offset = lexicon_string.find(encoded)
        if offset < 0:
            offset = current_offset
            lexicon_string = lexicon_string + encoded
            current_offset += len(encoded)

        word_offsets[data.word] = len(lexicon_offset)
        lexicon_offset.append(offset)

    lexicon = [ord(ch) for ch in lexicon_string]

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * NUM_CODE_POINTS
    for char in CODE_POINTS:
        name = names[char]
        if name is not None:
            words = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for word in words:
                offset = word_offsets[word]
                if offset < short:
                    phrasebook.append(offset)
                else:
                    # store as two bytes
                    phrasebook.append((offset >> 8) + short)
                    phrasebook.append(offset & 255)

    assert getsize(phrasebook) == 1

    db.write(f"\n// lexicon")
    UIntArray("kLexicon", lexicon).dump(db, trace)
    UIntArray("kLexiconOffset", lexicon_offset).dump(db, trace)

    # split phrasebook index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)
    db.write(
        f"""
// code => name phrasebook
static const int kPhrasebookShift = {shift};
static const int kPhrasebookShort = {short};
static const int kPhrasebookMask = (1 << kPhrasebookShift) - 1;
"""
    )
    UIntArray("kPhrasebook", phrasebook).dump(db, trace)
    UIntArray("kPhrasebookOffset1", offset1).dump(db, trace)
    UIntArray("kPhrasebookOffset2", offset2).dump(db, trace)

    # Extract names for name hash table
    hash_data = []
    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                hash_data.append((name, char))

    db.write("\n// name => code dictionary")
    Hash(hash_data).dump(db, trace)

    aliases = [codepoint for _, codepoint in unicode.aliases]
    UIntArray("kNameAliases", aliases).dump(db, trace)
    StructArray(
        "UnicodeNamedSequence", "kNamedSequences", unicode.named_sequences
    ).dump(db, trace)


def write_type_data(unicode, db, trace):  # noqa: C901
    """Writes Unicode character type tables to the database file."""
    # extract unicode types
    dummy = TypeRecord(0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * NUM_CODE_POINTS
    numeric = {}
    spaces = []
    linebreaks = []
    extended_cases = []

    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            # TODO(T55176519): delta = True
            if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if "Line_Break" in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (
                        abs(upper) <= 2147483647
                        and abs(lower) <= 2147483647
                        and abs(title) <= 2147483647
                    )
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extended_cases) | (len(sc[0]) << 24)
                extended_cases.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extended_cases.extend(cf)
                upper = len(extended_cases) | (len(sc[2]) << 24)
                extended_cases.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extended_cases) | (len(sc[1]) << 24)
                    extended_cases.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = TypeRecord(decimal, digit, flags, lower, title, upper)
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extended_cases), "extended case array")

    StructArray(
        "UnicodeTypeRecord",
        "kTypeRecords",
        table,
        "a list of unique character type descriptors",
    ).dump(db, trace)

    # split record index table
    index1, index2, shift = splitbins(index, trace)
    db.write(
        f"""
// type indices
static const int kTypeIndexShift = {shift};
static const int32_t kTypeIndexMask = (int32_t{{1}} << kTypeIndexShift) - 1;
"""
    )
    UIntArray("kTypeIndex1", index1).dump(db, trace)
    UIntArray("kTypeIndex2", index2).dump(db, trace)

    # extended case mappings
    CodePointArray("kExtendedCase", extended_cases).dump(db, trace)

    db.write(
        """
double numericValue(int32_t code_point) {
  switch (code_point) {"""
    )
    for value, codepoints in sorted(numeric.items()):
        parts = value.split("/")
        value = " / ".join(repr(float(part)) for part in parts)
        codepoints.sort()
        for codepoint in codepoints:
            db.write(f"\n case {codepoint:#08x}:")
        db.write(f"\n return {value};")
    db.write(
        """
  default:
    return -1.0;
  }
}

bool unicodeIsLinebreak(int32_t code_point) {
  switch (code_point) {"""
    )
    for codepoint in sorted(linebreaks):
        db.write(f"\n case {codepoint:#08x}:")
    db.write(
        """
    return true;
  default:
    return false;
  }
}

bool unicodeIsWhitespace(int32_t code_point) {
  switch (code_point) {"""
    )
    for codepoint in sorted(spaces):
        db.write(f"\n case {codepoint:#08x}:")
    db.write(
        """
    return true;
  default:
    return false;
  }
}
"""
    )


def write_change_data(unicode, db, trace):
    dummy = ChangeRecord(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0.0)
    records = [dummy]
    cache = {dummy: 0}
    index = [0] * NUM_CODE_POINTS

    # Generate delta tables for 3.2
    for char, record in enumerate(unicode.change_records):
        record = ChangeRecord(*record)
        if record in cache:
            index[char] = cache[record]
        else:
            index[char] = cache[record] = len(records)
            records.append(record)

    StructArray(
        "UnicodeChangeRecord",
        "kChangeRecords",
        records,
        f"deltas from Unicode {OLD_VERSION} to the current version",
    ).dump(db, trace)

    index1, index2, shift = splitbins(index, trace)
    db.write(
        f"""
static const int kChangeIndexShift = {shift};
static const int kChangeIndexMask = (1 << kChangeIndexShift) - 1;
"""
    )
    UIntArray("kChangeIndex1", index1).dump(db, trace)
    UIntArray("kChangeIndex2", index2).dump(db, trace)

    db.write(
        """
int32_t normalizeOld(int32_t code_point) {
  switch (code_point) {"""
    )
    for k, v in unicode.normalization_changes:
        db.write(
            f"""
  case {k:#x}:
    return 0x{v.lower()};"""
        )
    db.write(
        """
  default:
    return -1;
  }
}
"""
    )


def write_string_literals(db, trace):
    SmallStrArray("kBidirectionalNames", BIDIRECTIONAL_NAMES).dump(db, trace)
    SmallStrArray("kCategoryNames", CATEGORY_NAMES).dump(db, trace)
    SmallStrArray("kEastAsianWidthNames", EASTASIANWIDTH_NAMES).dump(db, trace)


def write_db_coda(db):
    db.write(
        """
int32_t composeCodePoint(int32_t first, int32_t last) {
  int32_t i = first * kTotalLast + last;
  int32_t j = kCompIndex[i >> kCompShift];
  return kCompData[(j << kCompShift) + (i & kCompMask)];
}

UnicodeDecomposition decomposeCodePoint(int32_t code_point) {
  DCHECK_BOUND(code_point, kMaxUnicode);
  uint16_t offset = kDecompIndex1[code_point >> kDecompShift];
  offset = kDecompIndex2[(offset << kDecompShift) + (code_point & kDecompMask)];
  int32_t decomp = kDecompData[offset];
  int32_t count = decomp >> kBitsPerByte;
  int8_t prefix = decomp & kMaxByte;
  return {kDecompPrefix[prefix], count, &kDecompData[offset + 1]};
}

static int32_t findNFC(const Reindex* nfc, int32_t code_point) {
  for (word index = 0; nfc[index].start != 0; index++) {
    int32_t delta = code_point - nfc[index].start;
    if (delta < 0) {
      return -1;
    }
    if (delta <= nfc[index].count) {
      return delta + nfc[index].index;
    }
  }
  return -1;
}

int32_t findNFCFirst(int32_t code_point) {
  return findNFC(kNFCFirst, code_point);
}

int32_t findNFCLast(int32_t code_point) {
  return findNFC(kNFCLast, code_point);
}

struct HangulJamo {
  const int length;
  const byte name[3];
};

static const HangulJamo kHangulLeads[] = {
    {1, {'G'}}, {2, {'G', 'G'}}, {1, {'N'}}, {1, {'D'}}, {2, {'D', 'D'}},
    {1, {'R'}}, {1, {'M'}}, {1, {'B'}}, {2, {'B', 'B'}}, {1, {'S'}},
    {2, {'S', 'S'}}, {0}, {1, {'J'}}, {2, {'J', 'J'}}, {1, {'C'}},
    {1, {'K'}}, {1, {'T'}}, {1, {'P'}}, {1, {'H'}},
};

static const HangulJamo kHangulVowels[] = {
    {1, {'A'}}, {2, {'A', 'E'}}, {2, {'Y', 'A'}}, {3, {'Y', 'A', 'E'}},
    {2, {'E', 'O'}}, {1, {'E'}}, {3, {'Y', 'E', 'O'}}, {2, {'Y', 'E'}},
    {1, {'O'}}, {2, {'W', 'A'}}, {3, {'W', 'A', 'E'}}, {2, {'O', 'E'}},
    {2, {'Y', 'O'}}, {1, {'U'}}, {3, {'W', 'E', 'O'}}, {2, {'W', 'E'}},
    {2, {'W', 'I'}}, {2, {'Y', 'U'}}, {2, {'E', 'U'}}, {2, {'Y', 'I'}},
    {1, {'I'}},
};

static const HangulJamo kHangulTrails[] = {
    {0}, {1, {'G'}}, {2, {'G', 'G'}}, {2, {'G', 'S'}}, {1, {'N'}},
    {2, {'N', 'J'}}, {2, {'N', 'H'}}, {1, {'D'}}, {1, {'L'}}, {2, {'L', 'G'}},
    {2, {'L', 'M'}}, {2, {'L', 'B'}}, {2, {'L', 'S'}}, {2, {'L', 'T'}},
    {2, {'L', 'P'}}, {2, {'L', 'H'}}, {1, {'M'}}, {1, {'B'}}, {2, {'B', 'S'}},
    {1, {'S'}}, {2, {'S', 'S'}}, {2, {'N', 'G'}}, {1, {'J'}}, {1, {'C'}},
    {1, {'K'}}, {1, {'T'}}, {1, {'P'}}, {1, {'H'}},
};

static_assert(ARRAYSIZE(kHangulLeads) == Unicode::kHangulLeadCount,
              "mismatch in number of Hangul lead jamo");
static_assert(ARRAYSIZE(kHangulVowels) == Unicode::kHangulVowelCount,
              "mismatch in number of Hangul vowel jamo");
static_assert(ARRAYSIZE(kHangulTrails) == Unicode::kHangulTrailCount,
              "mismatch in number of Hangul trail jamo");

static int32_t findHangulJamo(const byte* str, word size,
                              const HangulJamo array[], word count) {
  word result = -1;
  int length = -1;
  for (word i = 0; i < count; i++) {
    HangulJamo jamo = array[i];
    if (length < jamo.length && jamo.length <= size &&
        (std::memcmp(str, jamo.name, jamo.length) == 0)) {
      length = jamo.length;
      result = i;
    }
  }
  return result;
}

static int32_t findHangulSyllable(const byte* name, word size) {
  word pos = 16;
  word lead = findHangulJamo(name + pos, size - pos, kHangulLeads,
                             Unicode::kHangulLeadCount);
  if (lead == -1) {
    return -1;
  }
  pos += kHangulLeads[lead].length;
  word vowel = findHangulJamo(name + pos, size - pos, kHangulVowels,
                              Unicode::kHangulVowelCount);
  if (vowel == -1) {
    return -1;
  }
  pos += kHangulVowels[vowel].length;
  word trail = findHangulJamo(name + pos, size - pos, kHangulTrails,
                              Unicode::kHangulTrailCount);
  if (trail == -1 || pos + kHangulTrails[trail].length != size) {
    return -1;
  }
  return Unicode::kHangulSyllableStart +
         (lead * Unicode::kHangulVowelCount + vowel) * Unicode::kHangulTrailCount +
         trail;
}

static bool isUnifiedIdeograph(int32_t code_point) {
  return """
    )
    for i, cjk in enumerate(CJK_RANGES):
        if i > 0:
            db.write(" ||\n ")
        db.write(f"({cjk.start:#010x} <= code_point && code_point <= {cjk.end:#010x})")
    db.write(
        """;
}

static uint32_t myHash(const byte* name, word size) {
  uint32_t hash = 0;
  for (word i = 0; i < size; i++) {
    hash = (hash * kCodeMagic) + ASCII::toUpper(name[i]);
    if (hash > 0x00ffffff) {
      hash = (hash & 0x00ffffff) ^ ((hash & 0xff000000) >> 24);
    }
  }
  return hash;
}

static bool nameMatchesCodePoint(int32_t code_point, const byte* name,
                                 word size) {
  byte buffer[kMaxNameLength + 1];
  if (!nameFromCodePoint(code_point, buffer, kMaxNameLength)) {
    return false;
  }
  for (word i = 0; i < size; i++) {
    if (ASCII::toUpper(name[i]) != buffer[i]) {
      return false;
    }
  }
  return buffer[size] == '\\0';
}

static int32_t getCodePoint(const byte* name, word size,
                            int32_t (*check)(int32_t)) {
  if (size >= 16 && std::memcmp(name, "HANGUL SYLLABLE ", 16) == 0) {
    return findHangulSyllable(name, size);
  }
  if (size >= 22 && std::memcmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
    // Four or five uppercase hexdigits must follow
    name += 22;
    size -= 22;
    if (size != 4 && size != 5) {
      return -1;
    }
    int32_t result = 0;
    while (size--) {
      result *= 16;
      if ('0' <= *name && *name <= '9') {
        result += *name - '0';
      } else if ('A' <= *name && *name <= 'F') {
        result += *name - 'A' + 10;
      } else {
        return -1;
      }
      name++;
    }
    if (!isUnifiedIdeograph(result)) {
      return -1;
    }
    return result;
  }

  uint32_t hash = myHash(name, size);
  uint32_t step = (hash ^ (hash >> 3)) & kCodeMask;
  if (step == 0) {
    step = kCodeMask;
  }
  for (uint32_t idx = (~hash) & kCodeMask;; idx = (idx + step) & kCodeMask) {
    int32_t result = kCodeHash[idx];
    if (result == 0) {
      return -1;
    }
    if (nameMatchesCodePoint(result, name, size)) {
      return check(result);
    }
    step = step << 1;
    if (step > kCodeMask) {
      step = step ^ kCodePoly;
    }
  }
}

static int32_t checkAliasAndNamedSequence(int32_t code_point) {
  if (Unicode::isNamedSequence(code_point)) {
    return -1;
  }
  if (Unicode::isAlias(code_point)) {
    return kNameAliases[code_point - Unicode::kAliasStart];
  }
  return code_point;
}

int32_t codePointFromName(const byte* name, word size) {
  return getCodePoint(name, size, checkAliasAndNamedSequence);
}

static int32_t checkAlias(int32_t code_point) {
  if (Unicode::isAlias(code_point)) {
    return kNameAliases[code_point - Unicode::kAliasStart];
  }
  return code_point;
}

int32_t codePointFromNameOrNamedSequence(const byte* name, word size) {
  return getCodePoint(name, size, checkAlias);
}

int32_t extendedCaseMapping(int32_t index) { return kExtendedCase[index]; }

bool nameFromCodePoint(int32_t code_point, byte* buffer, word size) {
  DCHECK_BOUND(code_point, kMaxUnicode);
  if (Unicode::isHangulSyllable(code_point)) {
    if (size < 27) {
      return false;  // worst case: HANGUL SYLLABLE <10 chars>
    }
    int32_t offset = code_point - Unicode::kHangulSyllableStart;
    HangulJamo lead = kHangulLeads[offset / Unicode::kHangulCodaCount];
    HangulJamo vowel =
        kHangulVowels[(offset % Unicode::kHangulCodaCount) / Unicode::kHangulTrailCount];
    HangulJamo trail = kHangulTrails[offset % Unicode::kHangulTrailCount];
    std::memcpy(buffer, "HANGUL SYLLABLE ", 16);
    buffer += 16;
    std::memcpy(buffer, lead.name, lead.length);
    buffer += lead.length;
    std::memcpy(buffer, vowel.name, vowel.length);
    buffer += vowel.length;
    std::memcpy(buffer, trail.name, trail.length);
    buffer[trail.length] = '\\0';
    return true;
  }
  if (isUnifiedIdeograph(code_point)) {
    if (size < 28) {
      return false;  // worst case: CJK UNIFIED IDEOGRAPH-*****
    }
    std::snprintf(reinterpret_cast<char*>(buffer), size,
                  "CJK UNIFIED IDEOGRAPH-%" PRIX32, code_point);
    return true;
  }

  uint32_t offset = kPhrasebookOffset1[code_point >> kPhrasebookShift];
  offset = kPhrasebookOffset2[(offset << kPhrasebookShift) +
                              (code_point & kPhrasebookMask)];
  if (offset == 0) {
    return false;
  }

  for (word i = 0;;) {
    if (i > 0) {
      if (i >= size) {
        return false;  // buffer overflow
      }
      buffer[i++] = ' ';
    }

    // get word index
    word word_idx = kPhrasebook[offset] - kPhrasebookShort;
    if (word_idx >= 0) {
      word_idx = (word_idx << 8) + kPhrasebook[offset + 1];
      offset += 2;
    } else {
      word_idx = kPhrasebook[offset++];
    }

    // copy word string from lexicon
    const byte* word_ptr = kLexicon + kLexiconOffset[word_idx];
    while (*word_ptr < 128) {
      if (i >= size) {
        return false;  // buffer overflow
      }
      buffer[i++] = *word_ptr++;
    }
    if (i >= size) {
      return false;  // buffer overflow
    }
    buffer[i++] = *word_ptr & 0x7f;
    if (*word_ptr == 0x80) {
      return true;
    }
  }
}

const UnicodeChangeRecord* changeRecord(int32_t code_point) {
  if (code_point > kMaxUnicode) {
    return &kChangeRecords[0];
  }
  int index = kChangeIndex1[code_point >> kChangeIndexShift];
  index <<= kChangeIndexShift;
  index = kChangeIndex2[index + (code_point & kChangeIndexMask)];
  return &kChangeRecords[index];
}

const UnicodeDatabaseRecord* databaseRecord(int32_t code_point) {
  if (code_point > kMaxUnicode) {
    return &kDatabaseRecords[0];
  }
  int index = kDatabaseIndex1[code_point >> kDatabaseIndexShift];
  index <<= kDatabaseIndexShift;
  index = kDatabaseIndex2[index + (code_point & kDatabaseIndexMask)];
  return &kDatabaseRecords[index];
}

const UnicodeNamedSequence* namedSequence(int32_t code_point) {
  DCHECK(Unicode::isNamedSequence(code_point), "not a named sequence");
  return &kNamedSequences[code_point - Unicode::kNamedSequenceStart];
}

const UnicodeTypeRecord* typeRecord(int32_t code_point) {
  if (code_point > kMaxUnicode) {
    return &kTypeRecords[0];
  }
  int index = kTypeIndex1[code_point >> kTypeIndexShift];
  index <<= kTypeIndexShift;
  index = kTypeIndex2[index + (code_point & kTypeIndexMask)];
  return &kTypeRecords[index];
}

}  // namespace py
"""
    )


def write_db(trace=False):
    print(f"--- Reading {UNICODE_DATA} v{UNIDATA_VERSION} ...")
    unicode = UnicodeData(UNIDATA_VERSION)
    print(len(list(filter(None, unicode.table))), "characters")

    print(f"--- Reading {UNICODE_DATA} v{OLD_VERSION} ...")
    old_unicode = UnicodeData(OLD_VERSION, check_cjk=False)
    print(len(list(filter(None, old_unicode.table))), "characters")

    unicode.merge(old_unicode)

    with open(HEADER_PATH, "w") as header:
        write_header(unicode, header)

    with open(DB_PATH, "w") as db:
        write_db_prelude(db)
        write_database_records(unicode, db, trace)
        write_name_data(unicode, db, trace)
        write_type_data(unicode, db, trace)
        write_change_data(unicode, db, trace)
        write_string_literals(db, trace)
        write_db_coda(db)


if __name__ == "__main__":
    write_db(True)
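

# Illustrative note (added commentary; not used by the generator): the two-level
# tables emitted by splitbins() above are consumed by the generated C++ with the
# same shift/mask arithmetic, e.g. the databaseRecord() lookup written by
# write_db_coda() does roughly
#
#   index = kDatabaseIndex1[code_point >> kDatabaseIndexShift];
#   index <<= kDatabaseIndexShift;
#   index = kDatabaseIndex2[index + (code_point & kDatabaseIndexMask)];
#
# which mirrors the invariant splitbins() asserts on the Python side:
#
#   t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]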