in util/generate_unicode_database.py [0:0]
def __init__(self, version, check_cjk=True):
    """Load and cross-check the Unicode data files for *version*.

    Populates ``self.table`` (one raw UnicodeData row per code point,
    later extended with east-asian width, a derived-property set, and a
    packed quickcheck value), plus the alias/named-sequence lists, the
    composition exclusions, the special-casing and case-folding maps.

    Raises ValueError if *check_cjk* is true and the CJK ideograph
    ranges found in the data deviate from the expected CJK_RANGES.
    """
    self.version = version
    self.aliases = []
    self.case_folding = {}
    self.change_records = []
    self.exclusions = set()
    self.named_sequences = []
    self.normalization_changes = []
    self.special_casing = {}
    self.table = [None] * NUM_CODE_POINTS

    self._load_unicodedata(version)
    self._expand_first_last_ranges(check_cjk)
    # Name aliases and named sequences are not in 3.2.0; see #12753.
    if version != OLD_VERSION:
        pua_index = self._load_name_aliases(version)
        # The aliases must not overflow into the named-sequence PUA range.
        assert pua_index < NAMED_SEQUENCES_START
        self._load_named_sequences(version)
    self._load_composition_exclusions(version)
    self._load_east_asian_widths(version)
    self._load_derived_core_properties(version)
    self._load_line_breaks(version)
    quickchecks = self._load_quickchecks(version)
    self._load_unihan_numeric(version)
    self._load_special_casing(version)
    if version != OLD_VERSION:
        self._load_case_folding(version)
    # Appended last so the quickcheck value is the final element of each
    # row (the derived-property set stays at the position the earlier
    # passes mutated via table[char][-1]).
    for char in CODE_POINTS:
        if self.table[char] is not None:
            self.table[char].append(quickchecks[char])

@staticmethod
def _parse_code_range(text):
    """Return the code points covered by 'XXXX' or 'XXXX..YYYY'."""
    if ".." in text:
        first, last = [int(c, 16) for c in text.split("..")]
        return range(first, last + 1)
    char = int(text, 16)
    return range(char, char + 1)

def _load_unicodedata(self, version):
    """Populate self.table with the raw semicolon-split UnicodeData rows."""
    with open_data(UNICODE_DATA, version) as file:
        for line in file:
            if not line:
                break
            fields = line.strip().split(";")
            self.table[int(fields[0], 16)] = fields

def _expand_first_last_ranges(self, check_cjk):
    """Expand <..., First>/<..., Last> row pairs to every code point."""
    first_row = None
    cjk_ranges = []
    for char in CODE_POINTS:
        row = self.table[char]
        if row is not None:
            if row[1].endswith("First>"):
                row[1] = ""
                first_row = row
            elif row[1].endswith("Last>"):
                if check_cjk and row[1].startswith("<CJK Ideograph"):
                    start = int(first_row[0], 16)
                    end = int(row[0], 16)
                    cjk_ranges.append(CJKRange(start, end))
                row[1] = ""
                first_row = None
        elif first_row:
            # Inside a First..Last range: clone the First row for this
            # code point, patching in its own hex code.
            expanded = first_row[:]
            expanded[0] = f"{char:X}"
            self.table[char] = expanded
    if check_cjk and set(cjk_ranges) != set(CJK_RANGES):
        raise ValueError(f"CJK ranges deviate: found {cjk_ranges!r}")

def _load_name_aliases(self, version):
    """Load name aliases; return the next free PUA index.

    Aliases are stored in Private Use Area 15 (U+F0000..U+F00FF) so they
    benefit from the same compression and lookup machinery as the other
    characters.
    """
    pua_index = NAME_ALIASES_START
    with open_data(NAME_ALIASES, version) as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            char, name, abbrev = line.split(";")
            self.aliases.append((name, int(char, 16)))
            # Also store the name in PUA 15.
            self.table[pua_index][1] = name
            pua_index += 1
    assert pua_index - NAME_ALIASES_START == len(self.aliases)
    return pua_index

def _load_named_sequences(self, version):
    """Load named sequences; they are stored in the PUA from U+F0100."""
    pua_index = NAMED_SEQUENCES_START
    with open_data(NAMED_SEQUENCES, version) as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            name, chars = line.split(";")
            chars = tuple(int(char, 16) for char in chars.split())
            # Check that the structure defined in write_header is OK.
            assert len(chars) <= 4, "change the NamedSequence array size"
            assert all(c < NUM_CODE_POINTS for c in chars)
            self.named_sequences.append(NamedSequence(name, chars))
            # Also store the sequence name in the PUA.
            self.table[pua_index][1] = name
            pua_index += 1
    assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

def _load_composition_exclusions(self, version):
    """Load the set of code points excluded from composition."""
    with open_data(COMPOSITION_EXCLUSIONS, version) as file:
        for line in file:
            line = line.strip()
            if line and not line.startswith("#"):
                self.exclusions.add(int(line.split()[0], 16))

def _load_east_asian_widths(self, version):
    """Append the east-asian width and an empty property set to each row."""
    widths = [None] * NUM_CODE_POINTS
    with open_data(EASTASIAN_WIDTH, version) as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split()[0].split(";")
            for char in self._parse_code_range(fields[0]):
                widths[char] = fields[1]
    for char in CODE_POINTS:
        if self.table[char] is not None:
            self.table[char].append(widths[char])
            # Property set, filled in by the derived-core-properties and
            # line-break passes via table[char][-1].
            self.table[char].append(set())

def _load_derived_core_properties(self, version):
    """Add each derived core property to its code points' property set."""
    with open_data(DERIVED_CORE_PROPERTIES, version) as file:
        for line in file:
            line = line.split("#", 1)[0].strip()
            if not line:
                continue
            code_range, prop = (part.strip() for part in line.split(";"))
            for char in self._parse_code_range(code_range):
                if self.table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them.
                    self.table[char][-1].add(prop)

def _load_line_breaks(self, version):
    """Tag code points with mandatory line-break classes."""
    with open_data(LINE_BREAK, version) as file:
        for line in file:
            fields = [f.strip() for f in line.partition("#")[0].split(";")]
            if len(fields) < 2 or fields[1] not in MANDATORY_LINE_BREAKS:
                continue
            for char in self._parse_code_range(fields[0]):
                self.table[char][-1].add("Line_Break")

def _load_quickchecks(self, version):
    """Return a per-code-point packed normalization quickcheck value.

    We only want the quickcheck properties.
    Format: NF?_QC; Y(es)/N(o)/M(aybe)
    Yes is the default, hence only N and M occur.
    In 3.2.0, the format was different (NF?_NO); the parsing will
    incorrectly determine these as "yes", however, unicodedata.c will not
    perform quickchecks for older versions, and no delta records will be
    created.
    """
    quickchecks = [0] * NUM_CODE_POINTS
    qc_order = "NFD_QC NFKD_QC NFC_QC NFKC_QC".split()
    with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
        for line in file:
            fields = [f.strip() for f in line.partition("#")[0].split(";")]
            if len(fields) < 2 or fields[1] not in qc_order:
                continue
            quickcheck = "MN".index(fields[2]) + 1  # Maybe -> 1, No -> 2
            quickcheck_shift = qc_order.index(fields[1]) * 2
            quickcheck <<= quickcheck_shift
            for char in self._parse_code_range(fields[0]):
                # Each property may only be set once per code point.
                assert not (quickchecks[char] >> quickcheck_shift) & 3
                quickchecks[char] |= quickcheck
    return quickchecks

def _load_unihan_numeric(self, version):
    """Patch the numeric field of CJK code points from the Unihan data."""
    # Use context managers so the archive is closed, and avoid shadowing
    # the zip() builtin (the original bound the archive to 'zip').
    with open_data(UNIHAN, version) as file, zipfile.ZipFile(file) as archive:
        if version == OLD_VERSION:
            member = f"Unihan-{OLD_VERSION}.txt"
        else:
            member = "Unihan_NumericValues.txt"
        data = archive.read(member)
    for line in data.decode("utf-8").splitlines():
        if not line.startswith("U+"):
            continue
        code, tag, value = line.split(None, 3)[:3]
        if tag not in ("kAccountingNumeric", "kPrimaryNumeric", "kOtherNumeric"):
            continue
        value = value.strip().replace(",", "")
        i = int(code[2:], 16)
        # Patch the numeric field (only for assigned code points).
        if self.table[i] is not None:
            self.table[i][8] = value

def _load_special_casing(self, version):
    """Load the unconditional special lower/title/upper case mappings."""
    with open_data(SPECIAL_CASING, version) as file:
        for line in file:
            line = line[:-1].split("#", 1)[0]
            if not line:
                continue
            data = line.split("; ")
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            self.special_casing[c] = (lower, title, upper)

def _load_case_folding(self, version):
    """Load the common (C) and full (F) case foldings."""
    with open_data(CASE_FOLDING, version) as file:
        for line in file:
            line = line[:-1].split("#", 1)[0]
            if not line:
                continue
            data = line.split("; ")
            if data[1] in "CF":
                c = int(data[0], 16)
                self.case_folding[c] = [
                    int(char, 16) for char in data[2].split()
                ]