def parse_node_in_tree()

in src/ppxgboost/BoosterParser.py [0:0]


def parse_node_in_tree(s, lvl, feature_set, min_max):
    # The index of the regular expression is defined based on the xgboost
    # output formatting.

    # split the string into chunks where each represents a single tree_node
    # the first item in the list is the root for this tree
    current_node = re.split(r"\n", s)[0]

    # try to parse this tree_node as a leaf
    # a leaf is identified by the pattern <int>:leaf=<float>
    # where the float is possibly negative and possibly an integer
    # # similar to '\w.-'. The '-' is to capture negative value, and '.' for floating point number.
    # e.g. 10:leaf=0.009 will be parsed to ['10', '0.009']
    leaf_strs = re.findall(r"[\d.-]+", current_node)

    if len(leaf_strs) == 2:  # if this is a leaf
        return Leaf(int(leaf_strs[0]), float(leaf_strs[1]))
    else:
        # the parsing for scientific notations.
        # if the leaf value is a scientific notation, then the parsing above does not work
        # the value contains 'e-', e.g. '2.3e-05', therefore, we need to parsing the scientific value
        if len(leaf_strs) == 3 and 'e-' in current_node:
            pos = current_node.find('=')
            value = float(current_node[pos + 1:])

            # As Client's query is unpredictible, it's impossible
            # this is only to avoid the comparision between two extreme tiny
            # values when encountered in the model. The Affine transform from
            # PPBoost.py already takes care of the precision up to 1.0e-7.
            #
            # In case we encountered a comparision
            # between ~1.0e-14 and 0. OPE cannot support this,
            # so manually set the tiny number (1.0e-14) to a bigger
            # number (1.0e-7) in order to make the comparison go thru.
            # As a result, the current methodology cannot support more than 7 digits of
            # floating number precision.
            if abs(value) <= PRECISION_BOUND_COMP_ZERO:
                value = SETUP_BOUND_COMP_ZERO * int(np.sign(value))
            return Leaf(int(leaf_strs[0]), value)

    # An interior tree_node is identified by the pattern
    # '\w' means find all word characters - e.g. matches a "word" character: a letter or digit
    #   or underbar [a-zA-Z0-9_] (Note that \w contains underscore)
    # '.' and '-' are literal '.' and '-' --> add to to capture if some feature name contains '-' or '.'
    # The square bracket is used to indicate a set of characters. '+' indicates the repetitions,
    # and () indicates a match group.
    # The regex below splits the statement (i.e. tree tree_node) into
    # a list of strings. We match the column name with `[\w\s.-]+`, which allow for alpha-numeric
    # characters, whitespace, `.`, and `-`.
    #
    # e.g. current_node = '0:[XYZ ABC<3] yes=1,no=2,missing=1'
    # leaf_strs = ['0', 'XYZ ABC', '3', 'yes', '1', 'no', '2', 'missing', '1']
    pattern = re.compile(r"(\w+):\[([\w\s.-]+)[<>=]+(.+)\] (\w+)=(.+),(\w+)=(.+),(\w+)=(.+)")
    match = pattern.match(current_node)
    if match is None:
        raise Exception("Invalid tree:\n" + current_node + "\nNote that column names can only contain alpha-numeric characters, whitespace, '.', or '-'.")
    leaf_strs = match.groups()

    # we've parsed the root, now find and parse the subtrees
    split_str = r"\n"
    for i in range(0, lvl + 1):
        split_str = split_str + r"\t"

    # split the input on \n\t...\t[0-9]
    # This splits the string into 5 pieces:
    # index 0 is current_node,
    # index 1 is the id of the left subtree
    # index 2 is the rest of the string for the left subtree
    # index 3 is the id of the right subtree
    # index 4 is the rest of the string for the right subtree
    subtrees = re.split(split_str + r"(\d+)", s)

    # recurse to the next level.
    left = parse_node_in_tree(subtrees[1] + subtrees[2], lvl + 1, feature_set, min_max)
    right = parse_node_in_tree(subtrees[3] + subtrees[4], lvl + 1, feature_set, min_max)

    # create a dictionary that maps the subtree Id to the subtree object
    child_dict = {left.id: left, right.id: right}

    # Check if the comparison is a floating point number.
    #   if it is then convert it to float
    #   else we convert it to an int.
    if '.' in leaf_strs[2]:
        node_value = float(leaf_strs[2])

        # Similar to above (precision issue)
        if abs(node_value) <= PRECISION_BOUND_COMP_ZERO:
            node_value = SETUP_BOUND_COMP_ZERO * int(np.sign(node_value))
    else:
        node_value = int(leaf_strs[2])

    if node_value < min_max['min']:
        min_max['min'] = node_value

    if node_value > min_max['max']:
        min_max['max'] = node_value

    feature_set.add(str(leaf_strs[1]))

    return Interior(int(leaf_strs[0]), leaf_strs[1], node_value, child_dict[int(leaf_strs[4])],
                    child_dict[int(leaf_strs[6])], child_dict[int(leaf_strs[8])])