in util/generate_unicode_database.py [0:0]
def _number_composition_slots(slots):
    """Number the marked entries of ``slots`` and collect their contiguous runs.

    ``slots`` is a list indexed by code point in which unmarked entries are
    ``None``.  Every marked entry is overwritten in place with its rank
    (0, 1, 2, ...) in ``CODE_POINTS`` iteration order.

    Returns ``(ranges, total)``: ``ranges`` is a list of inclusive
    ``(start, end)`` tuples, one per run of consecutive marked code points,
    and ``total`` is the number of marked entries.
    """
    ranges = []
    total = 0
    run = None
    for ch in CODE_POINTS:
        if slots[ch] is None:
            continue
        slots[ch] = total
        total += 1
        if run is None:
            run = (ch, ch)
        elif run[1] + 1 == ch:
            # Extends the current run by one code point.
            run = (run[0], ch)
        else:
            ranges.append(run)
            run = (ch, ch)
    # Flush the final run; with no marked entries there is nothing to flush
    # (appending None here would break tuple unpacking in the caller).
    if run is not None:
        ranges.append(run)
    return ranges, total


def write_database_records(unicode, db, trace):  # noqa: C901
    """Build the property, decomposition and NFC tables and write them to ``db``.

    For every code point in ``CODE_POINTS`` that has a record in
    ``unicode.table`` this collects:

    * a deduplicated ``DatabaseRecord`` plus a per-code-point index into the
      list of unique records,
    * the decomposition (``record[5]``) packed as a header word
      ``prefix_index | (length << 8)`` followed by the decomposed code points,
    * candidate composition pairs for two-code-point decompositions without a
      ``<...>`` prefix.

    The resulting tables are then emitted as C++ definitions through
    ``db.write`` and the ``dump`` methods of the array helpers.

    Args:
        unicode: object exposing ``table`` (indexable by code point, yielding
            a record sequence or a falsy value) and ``exclusions`` (supports
            ``in`` tests on code points).
        db: output object; must provide ``write(str)`` and is also handed to
            the array helpers' ``dump`` methods.
        trace: when truthy, print table statistics; also forwarded to
            ``splitbins`` and ``dump``.

    Raises:
        Exception: if a decomposition has more than ``MAX_DECOMPOSITION``
            code points.
        AssertionError: if the prefix index no longer fits in the low 8 bits
            of the header word.
    """
    dummy = DatabaseRecord(0, 0, 0, 0, False, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * NUM_CODE_POINTS

    # Slot 0 of each decomposition table is reserved so that index 0 means
    # "no decomposition".
    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * NUM_CODE_POINTS
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * NUM_CODE_POINTS
    comp_last = [None] * NUM_CODE_POINTS

    for char in CODE_POINTS:
        record = unicode.table[char]
        if not record:
            continue

        # database properties
        item = DatabaseRecord(
            BIDIRECTIONAL_NAMES.index(record[4]),
            CATEGORY_NAMES.index(record[2]),
            int(record[3]),
            EASTASIANWIDTH_NAMES.index(record[15]),
            record[9] == "Y",
            record[17],
        )
        record_idx = cache.get(item)
        if record_idx is None:
            cache[item] = record_idx = len(table)
            table.append(item)
        index[char] = record_idx

        # decomposition data; code points without one keep index 0
        if not record[5]:
            continue
        fields = record[5].split()
        # A leading "<tag>" field is the decomposition prefix.
        prefix = fields.pop(0) if fields[0][0] == "<" else ""
        if len(fields) > MAX_DECOMPOSITION:
            raise Exception(f"decomposition of code point {char:#x} is too large")

        try:
            prefix_idx = decomp_prefix.index(prefix)
        except ValueError:
            prefix_idx = len(decomp_prefix)
            decomp_prefix.append(prefix)
        # The prefix index shares the header word with the length (<< 8).
        assert prefix_idx < 256

        decomp = [prefix_idx + (len(fields) << 8)] + [int(s, 16) for s in fields]
        # Unprefixed two-code-point decompositions that are not excluded and
        # whose first code point has "0" in field 3 (presumably the canonical
        # combining class - confirm against the record layout) become
        # composition pairs.
        if (
            not prefix_idx
            and len(decomp) == 3
            and char not in unicode.exclusions
            and unicode.table[decomp[1]][3] == "0"
        ):
            _, lead, trail = decomp
            comp_first[lead] = 1
            comp_last[trail] = 1
            comp_pairs.append((lead, trail, char))

        # Every decomposition is appended as a new entry.  Searching
        # decomp_data (a flat list of ints) for the list `decomp` can never
        # succeed, so no lookup is attempted.
        decomp_index[char] = len(decomp_data)
        decomp_data.extend(decomp)
        decomp_size += len(decomp) * 2

    # Replace the markers with dense indices and gather contiguous ranges.
    comp_first_ranges, total_first = _number_composition_slots(comp_first)
    comp_last_ranges, total_last = _number_composition_slots(comp_last)

    # Row-major (first, last) -> composed code point matrix; 0 means no pair.
    comp_data = [0] * (total_first * total_last)
    for lead, trail, composed in comp_pairs:
        comp_data[comp_first[lead] * total_last + comp_last[trail]] = composed

    if trace:
        print(len(table), "unique properties")
        print(len(decomp_prefix), "unique decomposition prefixes")
        print(len(decomp_data), "unique decomposition entries:", end=" ")
        print(decomp_size, "bytes")
        print(total_first, "first characters in NFC")
        print(total_last, "last characters in NFC")
        print(len(comp_pairs), "NFC pairs")

    StructArray(
        "UnicodeDatabaseRecord",
        "kDatabaseRecords",
        table,
        "a list of unique database records",
    ).dump(db, trace)

    # split record index table
    index1, index2, shift = splitbins(index, trace)
    db.write(
        f"""
// type indices
static const int kDatabaseIndexShift = {shift};
static const int32_t kDatabaseIndexMask =
(int32_t{{1}} << kDatabaseIndexShift) - 1;
"""
    )
    UIntArray("kDatabaseIndex1", index1).dump(db, trace)
    UIntArray("kDatabaseIndex2", index2).dump(db, trace)

    db.write(
        f"""
// Reindexing of NFC first and last characters
struct Reindex {{
const int32_t start;
const short count;
const short index;
}};
static const int kTotalLast = {total_last};
"""
    )
    # Each list ends with an all-zero Reindex entry.
    nfc_first = [
        Reindex(start, end - start, comp_first[start])
        for start, end in comp_first_ranges
    ] + [Reindex(0, 0, 0)]
    nfc_last = [
        Reindex(start, end - start, comp_last[start])
        for start, end in comp_last_ranges
    ] + [Reindex(0, 0, 0)]
    StructArray("Reindex", "kNFCFirst", nfc_first).dump(db, trace)
    StructArray("Reindex", "kNFCLast", nfc_last).dump(db, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)
    db.write(
        f"""
// decomposition mappings
static const int kDecompShift = {shift};
static const int32_t kDecompMask = (int32_t{{1}} << kDecompShift) - 1;
const char* kDecompPrefix[] = {{
"""
    )
    for name in decomp_prefix:
        db.write(f' "{name}",\n')
    db.write("};\n")
    CodePointArray("kDecompData", decomp_data).dump(db, trace)
    UIntArray("kDecompIndex1", index1).dump(db, trace)
    UIntArray("kDecompIndex2", index2).dump(db, trace)

    # split composition table
    index1, index2, shift = splitbins(comp_data, trace)
    db.write(
        f"""
// NFC pairs
static const int kCompShift = {shift};
static const int32_t kCompMask = (int32_t{{1}} << kCompShift) - 1;
"""
    )
    UIntArray("kCompIndex", index1).dump(db, trace)
    UIntArray("kCompData", index2).dump(db, trace)