in util/generate_unicode_database.py [0:0]
def __init__(self, version, check_cjk=True):
    """Load and cross-check the Unicode data files for *version*.

    Populates ``self.table`` (one raw UnicodeData row per code point,
    later extended with east-asian width, a derived-property set, and a
    packed quickcheck value), plus the alias/named-sequence lists, the
    composition exclusions, the special-casing and case-folding maps.

    Raises ValueError if *check_cjk* is true and the CJK ideograph
    ranges found in the data deviate from the expected CJK_RANGES.
    """
    self.version = version
    self.aliases = []
    self.case_folding = {}
    self.change_records = []
    self.exclusions = set()
    self.named_sequences = []
    self.normalization_changes = []
    self.special_casing = {}
    self.table = [None] * NUM_CODE_POINTS

    self._load_unicodedata(version)
    self._expand_first_last_ranges(check_cjk)
    # Name aliases and named sequences are not in 3.2.0; see #12753.
    if version != OLD_VERSION:
        pua_index = self._load_name_aliases(version)
        # The aliases must not overflow into the named-sequence PUA range.
        assert pua_index < NAMED_SEQUENCES_START
        self._load_named_sequences(version)
    self._load_composition_exclusions(version)
    self._load_east_asian_widths(version)
    self._load_derived_core_properties(version)
    self._load_line_breaks(version)
    quickchecks = self._load_quickchecks(version)
    self._load_unihan_numeric(version)
    self._load_special_casing(version)
    if version != OLD_VERSION:
        self._load_case_folding(version)
    # Appended last so the quickcheck value is the final element of each
    # row (the derived-property set stays at the position the earlier
    # passes mutated via table[char][-1]).
    for char in CODE_POINTS:
        if self.table[char] is not None:
            self.table[char].append(quickchecks[char])

@staticmethod
def _parse_code_range(text):
    """Return the code points covered by 'XXXX' or 'XXXX..YYYY'."""
    if ".." in text:
        first, last = [int(c, 16) for c in text.split("..")]
        return range(first, last + 1)
    char = int(text, 16)
    return range(char, char + 1)

def _load_unicodedata(self, version):
    """Populate self.table with the raw semicolon-split UnicodeData rows."""
    with open_data(UNICODE_DATA, version) as file:
        for line in file:
            if not line:
                break
            fields = line.strip().split(";")
            self.table[int(fields[0], 16)] = fields

def _expand_first_last_ranges(self, check_cjk):
    """Expand <..., First>/<..., Last> row pairs to every code point."""
    first_row = None
    cjk_ranges = []
    for char in CODE_POINTS:
        row = self.table[char]
        if row is not None:
            if row[1].endswith("First>"):
                row[1] = ""
                first_row = row
            elif row[1].endswith("Last>"):
                if check_cjk and row[1].startswith("<CJK Ideograph"):
                    start = int(first_row[0], 16)
                    end = int(row[0], 16)
                    cjk_ranges.append(CJKRange(start, end))
                row[1] = ""
                first_row = None
        elif first_row:
            # Inside a First..Last range: clone the First row for this
            # code point, patching in its own hex code.
            expanded = first_row[:]
            expanded[0] = f"{char:X}"
            self.table[char] = expanded
    if check_cjk and set(cjk_ranges) != set(CJK_RANGES):
        raise ValueError(f"CJK ranges deviate: found {cjk_ranges!r}")

def _load_name_aliases(self, version):
    """Load name aliases; return the next free PUA index.

    Aliases are stored in Private Use Area 15 (U+F0000..U+F00FF) so they
    benefit from the same compression and lookup machinery as the other
    characters.
    """
    pua_index = NAME_ALIASES_START
    with open_data(NAME_ALIASES, version) as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            char, name, abbrev = line.split(";")
            self.aliases.append((name, int(char, 16)))
            # Also store the name in PUA 15.
            self.table[pua_index][1] = name
            pua_index += 1
    assert pua_index - NAME_ALIASES_START == len(self.aliases)
    return pua_index

def _load_named_sequences(self, version):
    """Load named sequences; they are stored in the PUA from U+F0100."""
    pua_index = NAMED_SEQUENCES_START
    with open_data(NAMED_SEQUENCES, version) as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            name, chars = line.split(";")
            chars = tuple(int(char, 16) for char in chars.split())
            # Check that the structure defined in write_header is OK.
            assert len(chars) <= 4, "change the NamedSequence array size"
            assert all(c < NUM_CODE_POINTS for c in chars)
            self.named_sequences.append(NamedSequence(name, chars))
            # Also store the sequence name in the PUA.
            self.table[pua_index][1] = name
            pua_index += 1
    assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

def _load_composition_exclusions(self, version):
    """Load the set of code points excluded from composition."""
    with open_data(COMPOSITION_EXCLUSIONS, version) as file:
        for line in file:
            line = line.strip()
            if line and not line.startswith("#"):
                self.exclusions.add(int(line.split()[0], 16))

def _load_east_asian_widths(self, version):
    """Append the east-asian width and an empty property set to each row."""
    widths = [None] * NUM_CODE_POINTS
    with open_data(EASTASIAN_WIDTH, version) as file:
        for line in file:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split()[0].split(";")
            for char in self._parse_code_range(fields[0]):
                widths[char] = fields[1]
    for char in CODE_POINTS:
        if self.table[char] is not None:
            self.table[char].append(widths[char])
            # Property set, filled in by the derived-core-properties and
            # line-break passes via table[char][-1].
            self.table[char].append(set())

def _load_derived_core_properties(self, version):
    """Add each derived core property to its code points' property set."""
    with open_data(DERIVED_CORE_PROPERTIES, version) as file:
        for line in file:
            line = line.split("#", 1)[0].strip()
            if not line:
                continue
            code_range, prop = (part.strip() for part in line.split(";"))
            for char in self._parse_code_range(code_range):
                if self.table[char]:
                    # Some properties (e.g. Default_Ignorable_Code_Point)
                    # apply to unassigned code points; ignore them.
                    self.table[char][-1].add(prop)

def _load_line_breaks(self, version):
    """Tag code points with mandatory line-break classes."""
    with open_data(LINE_BREAK, version) as file:
        for line in file:
            fields = [f.strip() for f in line.partition("#")[0].split(";")]
            if len(fields) < 2 or fields[1] not in MANDATORY_LINE_BREAKS:
                continue
            for char in self._parse_code_range(fields[0]):
                self.table[char][-1].add("Line_Break")

def _load_quickchecks(self, version):
    """Return a per-code-point packed normalization quickcheck value.

    We only want the quickcheck properties.
    Format: NF?_QC; Y(es)/N(o)/M(aybe)
    Yes is the default, hence only N and M occur.
    In 3.2.0, the format was different (NF?_NO); the parsing will
    incorrectly determine these as "yes", however, unicodedata.c will not
    perform quickchecks for older versions, and no delta records will be
    created.
    """
    quickchecks = [0] * NUM_CODE_POINTS
    qc_order = "NFD_QC NFKD_QC NFC_QC NFKC_QC".split()
    with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
        for line in file:
            fields = [f.strip() for f in line.partition("#")[0].split(";")]
            if len(fields) < 2 or fields[1] not in qc_order:
                continue
            quickcheck = "MN".index(fields[2]) + 1  # Maybe -> 1, No -> 2
            quickcheck_shift = qc_order.index(fields[1]) * 2
            quickcheck <<= quickcheck_shift
            for char in self._parse_code_range(fields[0]):
                # Each property may only be set once per code point.
                assert not (quickchecks[char] >> quickcheck_shift) & 3
                quickchecks[char] |= quickcheck
    return quickchecks

def _load_unihan_numeric(self, version):
    """Patch the numeric field of CJK code points from the Unihan data."""
    # Use context managers so the archive is closed, and avoid shadowing
    # the zip() builtin (the original bound the archive to 'zip').
    with open_data(UNIHAN, version) as file, zipfile.ZipFile(file) as archive:
        if version == OLD_VERSION:
            member = f"Unihan-{OLD_VERSION}.txt"
        else:
            member = "Unihan_NumericValues.txt"
        data = archive.read(member)
    for line in data.decode("utf-8").splitlines():
        if not line.startswith("U+"):
            continue
        code, tag, value = line.split(None, 3)[:3]
        if tag not in ("kAccountingNumeric", "kPrimaryNumeric", "kOtherNumeric"):
            continue
        value = value.strip().replace(",", "")
        i = int(code[2:], 16)
        # Patch the numeric field (only for assigned code points).
        if self.table[i] is not None:
            self.table[i][8] = value

def _load_special_casing(self, version):
    """Load the unconditional special lower/title/upper case mappings."""
    with open_data(SPECIAL_CASING, version) as file:
        for line in file:
            line = line[:-1].split("#", 1)[0]
            if not line:
                continue
            data = line.split("; ")
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            self.special_casing[c] = (lower, title, upper)

def _load_case_folding(self, version):
    """Load the common (C) and full (F) case foldings."""
    with open_data(CASE_FOLDING, version) as file:
        for line in file:
            line = line[:-1].split("#", 1)[0]
            if not line:
                continue
            data = line.split("; ")
            if data[1] in "CF":
                c = int(data[0], 16)
                self.case_folding[c] = [
                    int(char, 16) for char in data[2].split()
                ]