in src/ppxgboost/BoosterParser.py [0:0]
def parse_node_in_tree(s, lvl, feature_set, min_max):
# The index of the regular expression is defined based on the xgboost
# output formatting.
# split the string into chunks where each represents a single tree_node
# the first item in the list is the root for this tree
current_node = re.split(r"\n", s)[0]
# try to parse this tree_node as a leaf
# a leaf is identified by the pattern <int>:leaf=<float>
# where the float is possibly negative and possibly an integer
# # similar to '\w.-'. The '-' is to capture negative value, and '.' for floating point number.
# e.g. 10:leaf=0.009 will be parsed to ['10', '0.009']
leaf_strs = re.findall(r"[\d.-]+", current_node)
if len(leaf_strs) == 2: # if this is a leaf
return Leaf(int(leaf_strs[0]), float(leaf_strs[1]))
else:
# the parsing for scientific notations.
# if the leaf value is a scientific notation, then the parsing above does not work
# the value contains 'e-', e.g. '2.3e-05', therefore, we need to parsing the scientific value
if len(leaf_strs) == 3 and 'e-' in current_node:
pos = current_node.find('=')
value = float(current_node[pos + 1:])
# As Client's query is unpredictible, it's impossible
# this is only to avoid the comparision between two extreme tiny
# values when encountered in the model. The Affine transform from
# PPBoost.py already takes care of the precision up to 1.0e-7.
#
# In case we encountered a comparision
# between ~1.0e-14 and 0. OPE cannot support this,
# so manually set the tiny number (1.0e-14) to a bigger
# number (1.0e-7) in order to make the comparison go thru.
# As a result, the current methodology cannot support more than 7 digits of
# floating number precision.
if abs(value) <= PRECISION_BOUND_COMP_ZERO:
value = SETUP_BOUND_COMP_ZERO * int(np.sign(value))
return Leaf(int(leaf_strs[0]), value)
# An interior tree_node is identified by the pattern
# '\w' means find all word characters - e.g. matches a "word" character: a letter or digit
# or underbar [a-zA-Z0-9_] (Note that \w contains underscore)
# '.' and '-' are literal '.' and '-' --> add to to capture if some feature name contains '-' or '.'
# The square bracket is used to indicate a set of characters. '+' indicates the repetitions,
# and () indicates a match group.
# The regex below splits the statement (i.e. tree tree_node) into
# a list of strings. We match the column name with `[\w\s.-]+`, which allow for alpha-numeric
# characters, whitespace, `.`, and `-`.
#
# e.g. current_node = '0:[XYZ ABC<3] yes=1,no=2,missing=1'
# leaf_strs = ['0', 'XYZ ABC', '3', 'yes', '1', 'no', '2', 'missing', '1']
pattern = re.compile(r"(\w+):\[([\w\s.-]+)[<>=]+(.+)\] (\w+)=(.+),(\w+)=(.+),(\w+)=(.+)")
match = pattern.match(current_node)
if match is None:
raise Exception("Invalid tree:\n" + current_node + "\nNote that column names can only contain alpha-numeric characters, whitespace, '.', or '-'.")
leaf_strs = match.groups()
# we've parsed the root, now find and parse the subtrees
split_str = r"\n"
for i in range(0, lvl + 1):
split_str = split_str + r"\t"
# split the input on \n\t...\t[0-9]
# This splits the string into 5 pieces:
# index 0 is current_node,
# index 1 is the id of the left subtree
# index 2 is the rest of the string for the left subtree
# index 3 is the id of the right subtree
# index 4 is the rest of the string for the right subtree
subtrees = re.split(split_str + r"(\d+)", s)
# recurse to the next level.
left = parse_node_in_tree(subtrees[1] + subtrees[2], lvl + 1, feature_set, min_max)
right = parse_node_in_tree(subtrees[3] + subtrees[4], lvl + 1, feature_set, min_max)
# create a dictionary that maps the subtree Id to the subtree object
child_dict = {left.id: left, right.id: right}
# Check if the comparison is a floating point number.
# if it is then convert it to float
# else we convert it to an int.
if '.' in leaf_strs[2]:
node_value = float(leaf_strs[2])
# Similar to above (precision issue)
if abs(node_value) <= PRECISION_BOUND_COMP_ZERO:
node_value = SETUP_BOUND_COMP_ZERO * int(np.sign(node_value))
else:
node_value = int(leaf_strs[2])
if node_value < min_max['min']:
min_max['min'] = node_value
if node_value > min_max['max']:
min_max['max'] = node_value
feature_set.add(str(leaf_strs[1]))
return Interior(int(leaf_strs[0]), leaf_strs[1], node_value, child_dict[int(leaf_strs[4])],
child_dict[int(leaf_strs[6])], child_dict[int(leaf_strs[8])])