def makeunicodetype()

in Tools/unicode/makeunicodedata.py [0:0]


def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            properties = record[16]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record[12]:
                upper = int(record[12], 16)
            else:
                upper = char
            if record[13]:
                lower = int(record[13], 16)
            else:
                lower = char
            if record[14]:
                title = int(record[14], 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than one
                # character in uppercase, lowercase, or titlecase or the
                # casefolded version of the character is different from the
                # lowercase. The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[8]:
                flags |= NUMERIC_MASK
                numeric.setdefault(record[8], []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split decomposition index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
        fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()