in util/generate_unicode_database.py [0:0]
def write_type_data(unicode, db, trace):  # noqa: C901
    """Write the Unicode character type tables to the database file.

    For every code point in ``CODE_POINTS`` that has a record in
    ``unicode.table`` this builds a ``TypeRecord`` (decimal value, digit
    value, flag bits and lower/title/upper case data), de-duplicates the
    records, and emits:

    * ``kTypeRecords``: the list of unique records,
    * ``kTypeIndexShift`` / ``kTypeIndexMask`` / ``kTypeIndex1`` /
      ``kTypeIndex2``: the per-code-point record index as split by
      ``splitbins``,
    * ``kExtendedCase``: the code points of case mappings that cannot be
      expressed as a single delta,
    * the C++ functions ``numericValue``, ``unicodeIsLinebreak`` and
      ``unicodeIsWhitespace`` as ``switch`` statements.

    A short summary of table sizes is printed to standard output.

    Args:
        unicode: Parsed Unicode data. This function reads ``unicode.table``
            (indexed by code point) and calls ``.get(code_point)`` on
            ``unicode.special_casing`` and ``unicode.case_folding``.
        db: Output file object; it is written with ``db.write`` and handed to
            the ``dump`` method of the array helpers.
        trace: Passed through unchanged to ``splitbins`` and every ``dump``.
    """
    # Entry 0 is an all-zero placeholder. ``index`` starts out pointing every
    # code point at it, so code points without a record keep that entry.
    dummy = TypeRecord(0, 0, 0, 0, 0, 0)
    table = [dummy]
    # Maps each distinct record to its position in ``table``. The placeholder
    # is registered too, so a real all-zero record reuses entry 0 instead of
    # being appended to the table a second time.
    cache = {dummy: 0}
    index = [0] * NUM_CODE_POINTS
    numeric = {}
    spaces = []
    linebreaks = []
    extended_cases = []

    for char in CODE_POINTS:
        record = unicode.table[char]
        if not record:
            continue

        # extract database properties
        category = record[2]
        bidirectional = record[4]
        properties = record[16]
        flags = 0
        # TODO(T55176519): delta = True
        if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
            flags |= ALPHA_MASK
        if "Lowercase" in properties:
            flags |= LOWER_MASK
        if "Line_Break" in properties or bidirectional == "B":
            flags |= LINEBREAK_MASK
            linebreaks.append(char)
        if category == "Zs" or bidirectional in ("WS", "B", "S"):
            flags |= SPACE_MASK
            spaces.append(char)
        if category == "Lt":
            flags |= TITLE_MASK
        if "Uppercase" in properties:
            flags |= UPPER_MASK
        # Categories starting with "C" or "Z" are not printable, with the
        # ASCII space as the single exception.
        if char == ord(" ") or category[0] not in ("C", "Z"):
            flags |= PRINTABLE_MASK
        if "XID_Start" in properties:
            flags |= XID_START_MASK
        if "XID_Continue" in properties:
            flags |= XID_CONTINUE_MASK
        if "Cased" in properties:
            flags |= CASED_MASK
        if "Case_Ignorable" in properties:
            flags |= CASE_IGNORABLE_MASK

        sc = unicode.special_casing.get(char)
        cf = unicode.case_folding.get(char, [char])
        # Simple mappings are hexadecimal strings; an empty field means the
        # character maps to itself (or, for titlecase, to its uppercase).
        upper = int(record[12], 16) if record[12] else char
        lower = int(record[13], 16) if record[13] else char
        title = int(record[14], 16) if record[14] else upper
        if sc is None and cf != [lower]:
            # The case folding differs from the simple lowercase, so route
            # this character through the extended path as (lower, title,
            # upper) sequences.
            sc = ([lower], [title], [upper])
        if sc is None:
            if upper == lower == title:
                upper = lower = title = 0
            else:
                # Store the mappings as deltas relative to the code point.
                upper = upper - char
                lower = lower - char
                title = title - char
                assert (
                    abs(upper) <= 2147483647
                    and abs(lower) <= 2147483647
                    and abs(title) <= 2147483647
                )
        else:
            # This happens either when some character maps to more than one
            # character in uppercase, lowercase, or titlecase or the
            # casefolded version of the character is different from the
            # lowercase. The extra characters are stored in a different
            # array.
            #
            # Each field packs the start offset into ``extended_cases`` in
            # its low bits and the mapping length shifted left by 24; for
            # ``lower`` the case-fold length is additionally stored shifted
            # left by 20 when the fold differs from the lowercase mapping.
            # The order of the ``extend`` calls below defines those offsets.
            flags |= EXTENDED_CASE_MASK
            lower = len(extended_cases) | (len(sc[0]) << 24)
            extended_cases.extend(sc[0])
            if cf != sc[0]:
                lower |= len(cf) << 20
                extended_cases.extend(cf)
            upper = len(extended_cases) | (len(sc[2]) << 24)
            extended_cases.extend(sc[2])
            # Title is probably equal to upper.
            if sc[1] == sc[2]:
                title = upper
            else:
                title = len(extended_cases) | (len(sc[1]) << 24)
                extended_cases.extend(sc[1])

        # decimal digit, integer digit
        decimal = 0
        if record[6]:
            flags |= DECIMAL_MASK
            decimal = int(record[6])
        digit = 0
        if record[7]:
            flags |= DIGIT_MASK
            digit = int(record[7])
        if record[8]:
            flags |= NUMERIC_MASK
            # Group code points by their numeric value string for the
            # ``numericValue`` switch emitted below.
            numeric.setdefault(record[8], []).append(char)

        item = TypeRecord(decimal, digit, flags, lower, title, upper)
        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extended_cases), "extended case array")

    StructArray(
        "UnicodeTypeRecord",
        "kTypeRecords",
        table,
        "a list of unique character type descriptors",
    ).dump(db, trace)

    # split record index table
    index1, index2, shift = splitbins(index, trace)
    db.write(
        f"""
// type indices
static const int kTypeIndexShift = {shift};
static const int32_t kTypeIndexMask = (int32_t{{1}} << kTypeIndexShift) - 1;
"""
    )
    UIntArray("kTypeIndex1", index1).dump(db, trace)
    UIntArray("kTypeIndex2", index2).dump(db, trace)

    # extended case mappings
    CodePointArray("kExtendedCase", extended_cases).dump(db, trace)

    db.write(
        """
double numericValue(int32_t code_point) {
switch (code_point) {"""
    )
    for value, codepoints in sorted(numeric.items()):
        # A value such as "1/2" is emitted as the expression "1.0 / 2.0".
        expression = " / ".join(repr(float(part)) for part in value.split("/"))
        # All code points sharing a value fall through to a single return.
        for codepoint in sorted(codepoints):
            db.write(f"\n case {codepoint:#08x}:")
        db.write(f"\n return {expression};")

    db.write(
        """
default:
return -1.0;
}
}
bool unicodeIsLinebreak(int32_t code_point) {
switch (code_point) {"""
    )
    for codepoint in sorted(linebreaks):
        db.write(f"\n case {codepoint:#08x}:")

    db.write(
        """
return true;
default:
return false;
}
}
bool unicodeIsWhitespace(int32_t code_point) {
switch (code_point) {"""
    )
    for codepoint in sorted(spaces):
        db.write(f"\n case {codepoint:#08x}:")

    db.write(
        """
return true;
default:
return false;
}
}
"""
    )