in util/generate_unicode_database.py [0:0]
def write_name_data(unicode, db, trace):  # noqa: C901
    # Collect names
    names = [None] * NUM_CODE_POINTS
    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = f"{name}\0"
    if trace:
        print(len([n for n in names if n is not None]), "distinct names")
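    # Entries whose name field looks like "<control>" or
    # "<CJK Ideograph, First>" are range labels in UnicodeData.txt, not
    # real character names, which is why the "<" check above skips them.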
    # Collect the unique words used in the names. Note that we
    # distinguish between words that end a name, which keep the
    # trailing NUL byte, and words inside a name, which do not.
    words = Words(names, trace)
    wordlist = words.tolist()
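    # For example, a name like "LATIN SMALL LETTER A" (with the NUL
    # appended above) contributes the words "LATIN", "SMALL", "LETTER"
    # and "A\0": splitting on whitespace leaves the terminator attached
    # to the final word.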
    # Figure out how many phrasebook escapes we need
    short = (65536 - len(wordlist)) // 256
    assert short > 0
    if trace:
        print(short, "short indexes in lexicon")
        n = sum(wordlist[i].frequency for i in range(short))
        print(n, "short indexes in phrasebook")
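    # Word offsets below `short` fit in a single phrasebook byte; larger
    # offsets are escaped into two bytes whose first byte is >= `short`
    # (see the phrasebook loop below).  Hypothetical arithmetic: with
    # 40000 distinct words, short = (65536 - 40000) // 256 = 99, so
    # offsets 1..98 use the one-byte form and the two-byte form still
    # reaches offsets up to (255 - 99) * 256 + 255 = 40191.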
    # Pick the most commonly used words, and sort the rest by
    # decreasing length to maximize overlap.
    tail = wordlist[short:]
    tail.sort(key=lambda data: len(data.word), reverse=True)
    wordlist[short:] = tail
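    # Placing longer words first means a shorter word that is a suffix
    # of one already packed (hypothetically, "TER" after "LETTER") is
    # located by the find() below and shares those tail bytes instead
    # of growing the lexicon string.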
    # Build a lexicon string from words
    current_offset = 0
    lexicon_offset = [0]
    lexicon_string = ""
    word_offsets = {}
    for data in wordlist:
        # Encoding: bit 7 indicates the last character in a word
        # (chr(128) indicates the last character in an entire string)
        last = ord(data.word[-1])
        encoded = f"{data.word[:-1]}{chr(last + 128)}"
        # reuse string tails, when possible
        offset = lexicon_string.find(encoded)
        if offset < 0:
            offset = current_offset
            lexicon_string = lexicon_string + encoded
            current_offset += len(encoded)
        word_offsets[data.word] = len(lexicon_offset)
        lexicon_offset.append(offset)
    lexicon = [ord(ch) for ch in lexicon_string]
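    # Packing example (hypothetical word): "CAPITAL" is stored as
    # "CAPITA" followed by chr(ord("L") + 128), so a decoder knows the
    # word ends at the first byte with bit 7 set.  Names are plain
    # ASCII, so even with bit 7 set every lexicon value stays in 0..255.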
    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * NUM_CODE_POINTS
    for char in CODE_POINTS:
        name = names[char]
        if name is not None:
            words = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for word in words:
                offset = word_offsets[word]
                if offset < short:
                    phrasebook.append(offset)
                else:
                    # store as two bytes
                    phrasebook.append((offset >> 8) + short)
                    phrasebook.append(offset & 255)
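    # getsize() is assumed to return the smallest element width, in
    # bytes, needed to hold a list of values (as in CPython's
    # makeunicodedata.py); the assertion below therefore guarantees
    # that every phrasebook entry, including the (offset >> 8) + short
    # escape bytes, fits in a single byte.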
    assert getsize(phrasebook) == 1
    db.write("\n// lexicon")
    UIntArray("kLexicon", lexicon).dump(db, trace)
    UIntArray("kLexiconOffset", lexicon_offset).dump(db, trace)
    # split phrasebook index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)
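    # Assuming splitbins() follows the same convention as CPython's
    # makeunicodedata.py, the two halves satisfy
    #   phrasebook_offset[c] ==
    #       offset2[(offset1[c >> shift] << shift) + (c & ((1 << shift) - 1))]
    # which is exactly the lookup that the kPhrasebookShift and
    # kPhrasebookMask constants emitted below support.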
    db.write(
        f"""
// code => name phrasebook
static const int kPhrasebookShift = {shift};
static const int kPhrasebookShort = {short};
static const int kPhrasebookMask = (1 << kPhrasebookShift) - 1;
"""
    )
    UIntArray("kPhrasebook", phrasebook).dump(db, trace)
    UIntArray("kPhrasebookOffset1", offset1).dump(db, trace)
    UIntArray("kPhrasebookOffset2", offset2).dump(db, trace)
    # Extract names for name hash table
    hash_data = []
    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                hash_data.append((name, char))
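    # The (name, code point) pairs feed the Hash helper, which is
    # presumably responsible for building the static name => code
    # lookup table dumped below (the reverse of the phrasebook).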
    db.write("\n// name => code dictionary")
    Hash(hash_data).dump(db, trace)
    aliases = [codepoint for _, codepoint in unicode.aliases]
    UIntArray("kNameAliases", aliases).dump(db, trace)
    StructArray(
        "UnicodeNamedSequence", "kNamedSequences", unicode.named_sequences
    ).dump(db, trace)