in causalml/inference/tree/models.py [0:0]
def growDecisionTreeFrom(self, X, treatment, y, evaluationFunction, max_depth=10,
min_samples_leaf=100, depth=1,
min_samples_treatment=10, n_reg=100,
parentNodeSummary=None):
'''
Train the uplift decision tree.
Args
----
X : ndarray, shape = [num_samples, num_features]
An ndarray of the covariates used to train the uplift model.
treatment : array-like, shape = [num_samples]
An array containing the treatment group for each unit.
y : array-like, shape = [num_samples]
An array containing the outcome of interest for each unit.
evaluationFunction : function
The split evaluation function, resolved from one of the model names
'KL', 'ED', 'Chi', 'CTS' (e.g. self.evaluate_KL or self.evaluate_CTS).
max_depth: int, optional (default=10)
The maximum depth of the tree.
min_samples_leaf: int, optional (default=100)
The minimum number of samples required in each child node for a split
to be attempted.
depth : int, optional (default=1)
The current depth.
min_samples_treatment: int, optional (default=10)
The minimum number of samples per treatment group required in each child
node for a split to be attempted.
n_reg: int, optional (default=100)
The regularization parameter defined in Rzepakowski and Jaroszewicz (2012):
the weight (in terms of sample size) of the parent node influence
on the child node, used by the 'KL', 'ED', 'Chi' and 'CTS' methods.
parentNodeSummary : dictionary, optional (default = None)
Node summary statistics of the parent tree node.
Returns
-------
object of DecisionTree class
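
Example
-------
Illustrative sketch of how this method is reached through the public
UpliftTreeClassifier wrapper (the data arrays X, treatment, y are assumed):

>>> from causalml.inference.tree import UpliftTreeClassifier
>>> clf = UpliftTreeClassifier(control_name='control', evaluationFunction='KL')
>>> clf.fit(X, treatment=treatment, y=y)  # builds the tree via growDecisionTreeFrom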
'''
if len(X) == 0:
return DecisionTree()
# Current Node Info and Summary
currentNodeSummary = self.tree_node_summary(treatment, y,
min_samples_treatment=min_samples_treatment,
n_reg=n_reg,
parentNodeSummary=parentNodeSummary)
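# currentNodeSummary maps each treatment group to [p, n]: the
# (regularized) mean outcome and sample size within this node, e.g.
# {'control': [0.10, 520], 'treatment_A': [0.15, 480]}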
if evaluationFunction == self.evaluate_CTS:
currentScore = evaluationFunction(currentNodeSummary)
else:
currentScore = evaluationFunction(currentNodeSummary, control_name=self.control_name)
# Prune Stats
maxAbsDiff = 0
maxDiff = -1.
bestTreatment = self.control_name
suboptTreatment = self.control_name
maxDiffTreatment = self.control_name
maxDiffSign = 0
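# scan the non-control groups for (a) the largest absolute mean
# difference vs. control and (b) the best positive difference; these
# drive the node's recommended treatment and its uplift score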
for treatment_group in currentNodeSummary:
if treatment_group != self.control_name:
diff = (currentNodeSummary[treatment_group][0]
- currentNodeSummary[self.control_name][0])
if abs(diff) >= maxAbsDiff:
maxDiffTreatment = treatment_group
maxDiffSign = np.sign(diff)
maxAbsDiff = abs(diff)
if diff >= maxDiff:
maxDiff = diff
suboptTreatment = treatment_group
if diff > 0:
bestTreatment = treatment_group
if maxDiff > 0:
pt = currentNodeSummary[bestTreatment][0]
nt = currentNodeSummary[bestTreatment][1]
pc = currentNodeSummary[self.control_name][0]
nc = currentNodeSummary[self.control_name][1]
p_value = (1. - stats.norm.cdf((pt - pc) / np.sqrt(pt * (1 - pt) / nt + pc * (1 - pc) / nc))) * 2
else:
pt = currentNodeSummary[suboptTreatment][0]
nt = currentNodeSummary[suboptTreatment][1]
pc = currentNodeSummary[self.control_name][0]
nc = currentNodeSummary[self.control_name][1]
p_value = (1. - stats.norm.cdf((pc - pt) / np.sqrt(pt * (1 - pt) / nt + pc * (1 - pc) / nc))) * 2
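# p_value is a two-sided two-sample z-test on the outcome proportions:
#   z = (pt - pc) / sqrt(pt*(1-pt)/nt + pc*(1-pc)/nc)
# (undefined when both proportions are exactly 0 or 1)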
upliftScore = [maxDiff, p_value]
bestGain = 0.0
bestAttribute = None
# X holds covariates only; treatment and outcome are passed as separate arrays
columnCount = X.shape[1]
if (self.max_features and self.max_features > 0 and self.max_features <= columnCount):
max_features = self.max_features
else:
max_features = columnCount
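# search over a random subset of at most max_features columns
# (random feature subsampling, as in random forests)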
for col in list(np.random.choice(a=range(columnCount), size=max_features, replace=False)):
columnValues = X[:, col]
# unique values
lsUnique = np.unique(columnValues)
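# candidate split values: for numeric columns with many distinct
# values, use a fixed percentile grid instead of every unique value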
# treat any numeric dtype as continuous (covers numpy integer and
# float scalars, which are not instances of the built-in int)
if np.issubdtype(lsUnique.dtype, np.number):
if len(lsUnique) > 10:
lspercentile = np.percentile(columnValues, [3, 5, 10, 20, 30, 50, 70, 80, 90, 95, 97])
else:
lspercentile = np.percentile(lsUnique, [10, 50, 90])
lsUnique = np.unique(lspercentile)
for value in lsUnique:
X_l, X_r, w_l, w_r, y_l, y_r = self.divideSet(X, treatment, y, col, value)
# check the split validity on min_samples_leaf
if (len(X_l) < min_samples_leaf or len(X_r) < min_samples_leaf):
continue
# summarize the candidate child nodes
# p is the share of samples sent to the left child, used to weight the gain
p = float(len(X_l)) / len(X)
leftNodeSummary = self.tree_node_summary(w_l, y_l,
min_samples_treatment=min_samples_treatment,
n_reg=n_reg,
parentNodeSummary=currentNodeSummary)
rightNodeSummary = self.tree_node_summary(w_r, y_r,
min_samples_treatment=min_samples_treatment,
n_reg=n_reg,
parentNodeSummary=currentNodeSummary)
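# both children are summarized against the current node so that n_reg
# shrinks sparse group estimates toward this node's means (see
# tree_node_summary); the right child previously used parentNodeSummary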
# check the split validity on min_samples_treatment
if set(leftNodeSummary.keys()) != set(rightNodeSummary.keys()):
continue
# smallest treatment-group size across both child nodes
node_mst = min(
min(leftNodeSummary[ti][1], rightNodeSummary[ti][1])
for ti in leftNodeSummary
)
if node_mst < min_samples_treatment:
continue
# evaluate the split
if evaluationFunction == self.evaluate_CTS:
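# CTS is a loss to be minimized, so the gain is the parent score
# minus the size-weighted child scores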
leftScore1 = evaluationFunction(leftNodeSummary)
rightScore2 = evaluationFunction(rightNodeSummary)
gain = (currentScore - p * leftScore1 - (1 - p) * rightScore2)
gain_for_imp = (len(X) * currentScore - len(X_l) * leftScore1 - len(X_r) * rightScore2)
else:
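# for the divergence-based criteria (KL, ED, Chi) the gain is the
# size-weighted child divergence minus the parent's, and the control
# group must be present in both children for the split to be scored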
if (self.control_name in leftNodeSummary and
self.control_name in rightNodeSummary):
leftScore1 = evaluationFunction(leftNodeSummary, control_name=self.control_name)
rightScore2 = evaluationFunction(rightNodeSummary, control_name=self.control_name)
gain = (p * leftScore1 + (1 - p) * rightScore2 - currentScore)
gain_for_imp = (len(X_l) * leftScore1 + len(X_r) * rightScore2 - len(X) * currentScore)
if self.normalization:
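# normI discounts splits that make the treatment-vs-control sample
# split very uneven across children (cf. Rzepakowski and
# Jaroszewicz, 2012)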
norm_factor = self.normI(currentNodeSummary,
leftNodeSummary,
rightNodeSummary,
self.control_name,
alpha=0.9)
else:
norm_factor = 1
gain = gain / norm_factor
else:
gain = 0
if (gain > bestGain and len(X_l) >= min_samples_leaf and len(X_r) >= min_samples_leaf):  # >= matches the min_samples_leaf check above
bestGain = gain
bestAttribute = (col, value)
best_set_left = [X_l, w_l, y_l]
best_set_right = [X_r, w_r, y_r]
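# note: importance is accumulated each time a new best split is found
# during the search, not only for the final chosen split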
self.feature_imp_dict[bestAttribute[0]] += gain_for_imp
dcY = {'impurity': '%.3f' % currentScore, 'samples': '%d' % len(X)}
# Add treatment size
dcY['group_size'] = ''
for treatment_group in currentNodeSummary:
dcY['group_size'] += ' ' + treatment_group + ': ' + str(currentNodeSummary[treatment_group][1])
dcY['upliftScore'] = [round(upliftScore[0], 4), round(upliftScore[1], 4)]
dcY['matchScore'] = round(upliftScore[0], 4)
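# recurse while the best split improves the criterion and the depth
# budget allows; otherwise emit a leaf carrying the per-group outcome
# estimates used at prediction time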
if bestGain > 0 and depth < max_depth:
trueBranch = self.growDecisionTreeFrom(
*best_set_left, evaluationFunction, max_depth, min_samples_leaf,
depth + 1, min_samples_treatment=min_samples_treatment,
n_reg=n_reg, parentNodeSummary=currentNodeSummary
)
falseBranch = self.growDecisionTreeFrom(
*best_set_right, evaluationFunction, max_depth, min_samples_leaf,
depth + 1, min_samples_treatment=min_samples_treatment,
n_reg=n_reg, parentNodeSummary=currentNodeSummary
)
return DecisionTree(
col=bestAttribute[0], value=bestAttribute[1],
trueBranch=trueBranch, falseBranch=falseBranch, summary=dcY,
maxDiffTreatment=maxDiffTreatment, maxDiffSign=maxDiffSign,
nodeSummary=currentNodeSummary,
backupResults=self.uplift_classification_results(treatment, y),
bestTreatment=bestTreatment, upliftScore=upliftScore
)
else:
if evaluationFunction == self.evaluate_CTS:
return DecisionTree(
results=self.uplift_classification_results(treatment, y),
summary=dcY, nodeSummary=currentNodeSummary,
bestTreatment=bestTreatment, upliftScore=upliftScore
)
else:
return DecisionTree(
results=self.uplift_classification_results(treatment, y),
summary=dcY, maxDiffTreatment=maxDiffTreatment,
maxDiffSign=maxDiffSign, nodeSummary=currentNodeSummary,
bestTreatment=bestTreatment, upliftScore=upliftScore
)