def write_name_data()

in util/generate_unicode_database.py [0:0]


def write_name_data(unicode, db, trace):  # noqa: C901
    # Collect names
    names = [None] * NUM_CODE_POINTS
    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = f"{name}\0"

    if trace:
        print(len([n for n in names if n is not None]), "distinct names")

    # Collect unique words from names. Note that we distinguish between
    # words that end a name, which include the trailing null byte, and
    # words inside a name, which do not.
    words = Words(names, trace)
    wordlist = words.tolist()

    # Figure out how many phrasebook escapes we need
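    # Word indexes below `short` fit in a single phrasebook byte; the rest
    # are stored as a two-byte escape (see the phrasebook loop below).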
    short = (65536 - len(wordlist)) // 256
    assert short > 0

    if trace:
        print(short, "short indexes in lexicon")
        n = sum(wordlist[i].frequency for i in range(short))
        print(n, "short indexes in phrasebook")

    # Pick the most commonly used words, and sort the rest by decreasing
    # length to maximize overlap between string tails.
    tail = wordlist[short:]
    tail.sort(key=lambda data: len(data.word), reverse=True)
    wordlist[short:] = tail

    # Build a lexicon string from words
    current_offset = 0
    lexicon_offset = [0]
    lexicon_string = ""
    word_offsets = {}
    for data in wordlist:
        # Encoding: bit 7 indicates the last character in a word
        # (chr(128) indicates the last character in an entire name string)
        last = ord(data.word[-1])
        encoded = f"{data.word[:-1]}{chr(last + 128)}"

        # reuse string tails, when possible
        offset = lexicon_string.find(encoded)
        if offset < 0:
            offset = current_offset
            lexicon_string = lexicon_string + encoded
            current_offset += len(encoded)

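        # Record the word's 1-based index into lexicon_offset (entry 0 is
        # the initial placeholder and is never referenced by a word)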
        word_offsets[data.word] = len(lexicon_offset)
        lexicon_offset.append(offset)

    lexicon = [ord(ch) for ch in lexicon_string]

    # generate phrasebook from names and lexicon
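    # phrasebook[0] is a dummy entry, so a phrasebook_offset of 0 can
    # unambiguously mean "this code point has no name"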
    phrasebook = [0]
    phrasebook_offset = [0] * NUM_CODE_POINTS
    for char in CODE_POINTS:
        name = names[char]
        if name is not None:
            words = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for word in words:
                offset = word_offsets[word]
                if offset < short:
                    phrasebook.append(offset)
                else:
                    # store as two bytes
                    phrasebook.append((offset >> 8) + short)
                    phrasebook.append(offset & 255)

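    # Every phrasebook entry must fit in a single byte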
    assert getsize(phrasebook) == 1

    db.write("\n// lexicon")
    UIntArray("kLexicon", lexicon).dump(db, trace)
    UIntArray("kLexiconOffset", lexicon_offset).dump(db, trace)

    # split phrasebook index table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    db.write(
        f"""
// code => name phrasebook
static const int kPhrasebookShift = {shift};
static const int kPhrasebookShort = {short};
static const int kPhrasebookMask = (1 << kPhrasebookShift) - 1;
"""
    )
    UIntArray("kPhrasebook", phrasebook).dump(db, trace)
    UIntArray("kPhrasebookOffset1", offset1).dump(db, trace)
    UIntArray("kPhrasebookOffset2", offset2).dump(db, trace)

    # Extract names for name hash table
    hash_data = []
    for char in CODE_POINTS:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                hash_data.append((name, char))

    db.write("\n// name => code dictionary")
    Hash(hash_data).dump(db, trace)

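    # Code points referenced by formal name aliases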
    aliases = [codepoint for _, codepoint in unicode.aliases]
    UIntArray("kNameAliases", aliases).dump(db, trace)

    StructArray(
        "UnicodeNamedSequence", "kNamedSequences", unicode.named_sequences
    ).dump(db, trace)
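
For reference, here is a minimal sketch of how the emitted tables are meant to be walked to rebuild a name. It operates on the flat Python lists built above (phrasebook, phrasebook_offset, lexicon, lexicon_offset, short); the generated C code would first recover phrasebook_offset[char] from kPhrasebookOffset1/kPhrasebookOffset2 via kPhrasebookShift and kPhrasebookMask, assuming the usual splitbins() contract. The helper name decode_name is illustrative and not part of the script.

def decode_name(char, phrasebook, phrasebook_offset, lexicon, lexicon_offset, short):
    # Hypothetical lookup sketch; mirrors the encoding performed above.
    pos = phrasebook_offset[char]
    if pos == 0:
        return None  # offset 0 means "no name"
    chars = []
    while True:
        index = phrasebook[pos]
        pos += 1
        if index >= short:
            # Two-byte escape: the high byte was stored biased by `short`
            index = ((index - short) << 8) | phrasebook[pos]
            pos += 1
        offset = lexicon_offset[index]
        while True:
            ch = lexicon[offset]
            offset += 1
            if ch >= 128:
                ch -= 128  # bit 7 marks the last character of the word
                if ch == 0:
                    # bare chr(128) encodes the name's trailing NUL: done
                    return "".join(chars)
                chars.append(chr(ch))
                break
            chars.append(chr(ch))
        chars.append(" ")  # words within a name are space-separated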