in causalPartition.py [0:0]
def _split_exposure_hajek_eht(self, node_id, df, probabilities, feature_set, max_attempt, eps, delta, outcome, rules, N, current_mse, criteria):
"""
the actual splitting implementation for non-separate tree;
recursion
"""
b_feature = ''
b_threshold = 0
b_left = None
b_right = None
b_average_left_hajek = 0
b_average_right_hajek = 0
b_mse = 10000000000.0
ranges = {}
for feature in feature_set:
gc.collect()
# find the tightest valid threshold range for this feature given the existing rules
upper = 1.
lower = 0.
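# each rule is a (feature, side, threshold) triple: side 0 keeps feature <= threshold
# (left child), side 1 keeps feature > threshold (right child)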
for rule in rules:
if rule[0] == feature:
if rule[1] == 0:
lower = np.maximum(rule[2], lower)
else:
upper = np.minimum(rule[2], upper)
if lower > upper:
continue
for k in range(max_attempt):
threshold = np.random.uniform(lower, upper)
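# screen the candidate split: _contain_zero (by its name and usage here) flags units
# whose exposure probability may be (near) zero in a child cell; the split is rejected
# if the flagged fraction exceeds delta for any child x ego-treatment combination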
cz_l_1 = self._contain_zero(probabilities, rules+[(feature, 0, threshold)], eps, delta, treated=1)
cz_r_1 = self._contain_zero(probabilities, rules+[(feature, 1, threshold)], eps, delta, treated=1)
cz_l_0 = self._contain_zero(probabilities, rules+[(feature, 0, threshold)], eps, delta, treated=0)
cz_r_0 = self._contain_zero(probabilities, rules+[(feature, 1, threshold)], eps, delta, treated=0)
if np.mean(cz_l_1) > delta or np.mean(cz_r_1) > delta or np.mean(cz_l_0) > delta or np.mean(cz_r_0) > delta:
continue
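# boolean masks for units satisfying all existing rules plus the candidate split
# (left: feature <= threshold, right: feature > threshold)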
idxs_left = np.product([df[key] <= th for key, sign, th in rules if sign == 0] + \
[df[key] > th for key, sign, th in rules if sign == 1] + \
[df[feature] <= threshold],
axis=0) > 0
idxs_right = np.product([df[key] <= th for key, sign, th in rules if sign == 0] + \
[df[key] > th for key, sign, th in rules if sign == 1] + \
[df[feature] > threshold],
axis=0) > 0
left = df[idxs_left]
right = df[idxs_right]
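# propensity of each unit for a (partition, ego-treatment) cell: product of the
# cell's conditions (axis 0) averaged over the columns of probabilities[...]
# (presumably Monte Carlo draws of the treatment assignment), i.e. the fraction
# of draws under which the unit lands in that cell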
# propensity score for left partition + ego treated
propensities_left_1 = np.mean(np.product([probabilities[key][idxs_left] <= th for key, sign, th in rules if sign == 0] + \
[probabilities[key][idxs_left] > th for key, sign, th in rules if sign == 1] + \
[probabilities[feature][idxs_left] <= threshold] + \
[probabilities[self.treatment][idxs_left] == 1],
axis=0), axis=1)
# propensity score for left partition + ego non treated
propensities_left_0 = np.mean(np.product([probabilities[key][idxs_left] <= th for key, sign, th in rules if sign == 0] + \
[probabilities[key][idxs_left] > th for key, sign, th in rules if sign == 1] + \
[probabilities[feature][idxs_left] <= threshold] + \
[probabilities[self.treatment][idxs_left] == 0],
axis=0), axis=1)
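# propensity scores for the right partition (ego treated / ego not treated)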
propensities_right_1 = np.mean(np.product([probabilities[key][idxs_right] <= th for key, sign, th in rules if sign == 0] + \
[probabilities[key][idxs_right] > th for key, sign, th in rules if sign == 1] + \
[probabilities[feature][idxs_right] > threshold] + \
[probabilities[self.treatment][idxs_right] == 1],
axis=0), axis=1)
propensities_right_0 = np.mean(np.product([probabilities[key][idxs_right] <= th for key, sign, th in rules if sign == 0] + \
[probabilities[key][idxs_right] > th for key, sign, th in rules if sign == 1] + \
[probabilities[feature][idxs_right] > threshold] + \
[probabilities[self.treatment][idxs_right] == 0],
axis=0), axis=1)
# drop units whose propensity scores are very small (this may discard observations)
idxs_left_filter = np.logical_and(propensities_left_1 > eps, propensities_left_0 > eps)
left = left[idxs_left_filter]
propensities_left_1 = propensities_left_1[idxs_left_filter]
propensities_left_0 = propensities_left_0[idxs_left_filter]
# same filter for the right partition
idxs_right_filter = np.logical_and(propensities_right_1 > eps, propensities_right_0 > eps)
right = right[idxs_right_filter]
propensities_right_1 = propensities_right_1[idxs_right_filter]
propensities_right_0 = propensities_right_0[idxs_right_filter]
if len(left) == 0 or len(right) == 0:
continue
# each child must contain both treated and untreated egos
if np.mean(left[self.treatment]) == 0 or np.mean(left[self.treatment]) == 1 or \
np.mean(right[self.treatment]) == 0 or np.mean(right[self.treatment]) == 1:
continue
# The covariate implementation does not work as expected; should always be None
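# Hajek estimate per child: weighted least squares of the outcome on the ego
# treatment with inverse-propensity weights; the coefficient on the treatment
# indicator is the estimated effect, and its standard error comes from the same fit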
mod_left = sm.WLS(left[outcome], sm.add_constant(left[[self.treatment]]), \
weights=1.0 / propensities_left_1 * left[self.treatment] + 1.0 / propensities_left_0 * (1-left[self.treatment]))
res_left = mod_left.fit()
mod_right = sm.WLS(right[outcome], sm.add_constant(right[[self.treatment]]), \
weights=1.0 / propensities_right_1 * right[self.treatment] + 1.0 / propensities_right_0 * (1-right[self.treatment]))
res_right = mod_right.fit()
average_left_hajek = res_left.params[1]
average_right_hajek = res_right.params[1]
average_left_hajek_se = res_left.bse[1]
average_right_hajek_se = res_right.bse[1]
# in-node loss: inverse-propensity-weighted sum of squared WLS residuals (still needs further improvement)
mse_left = np.sum((1.0 / propensities_left_1 * left[self.treatment] + 1.0 / propensities_left_0 * (1-left[self.treatment])) *
((res_left.resid) ** 2))
mse_right = np.sum((1.0 / propensities_right_1 * right[self.treatment] + 1.0 / propensities_right_0 * (1-right[self.treatment])) *
((res_right.resid) ** 2))
mse = mse_left * 1.0 * len(left)/(len(left)+len(right)) + mse_right * 1.0 * len(right)/(len(left)+len(right))
if mse < b_mse:
flag = True
assert len(criteria) > 0
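# a candidate split must also satisfy every user-requested criterion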
if 'non_trivial_reduction' in criteria:
if not (mse < current_mse - criteria['non_trivial_reduction']):
flag = False
if 'reasonable_propensity' in criteria:
if not (np.abs(np.sum(1.0 / propensities_left_1 * left[self.treatment])/len(df) - 1.0) <= criteria['reasonable_propensity'] \
and \
np.abs(np.sum(1.0 / propensities_right_1 * right[self.treatment])/len(df) - 1.0) <= criteria['reasonable_propensity'] \
and \
np.abs(np.sum(1.0 / propensities_left_0 * (1 - left[self.treatment]))/len(df) - 1.0) <= criteria['reasonable_propensity'] \
and \
np.abs(np.sum(1.0 / propensities_right_0 * (1 - right[self.treatment]))/len(df) - 1.0) <= criteria['reasonable_propensity']
):
flag = False
if 'separate_reduction' in criteria:
if not (mse_left < current_mse and mse_right < current_mse):
flag = False
if 'min_leaf_size' in criteria:
if not (len(left) >= criteria['min_leaf_size'] and len(right) >= criteria['min_leaf_size']):
flag = False
if flag:
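# record this split as the new best and store the rules that define its children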
b_feature = feature
b_mse = mse
b_mse_left = mse_left
b_mse_right = mse_right
b_threshold = threshold
b_average_left_hajek = average_left_hajek
b_average_right_hajek = average_right_hajek
b_average_left_hajek_se = average_left_hajek_se
b_average_right_hajek_se = average_right_hajek_se
b_left = left
b_right = right
b_left_rules = rules + [(feature, 0, threshold)]
b_right_rules = rules + [(feature, 1, threshold)]
result = {}
if b_feature != '':
# a valid partition was found; recurse into both children
result_left = self._split_exposure_hajek_eht(node_id*2, df, probabilities, feature_set, max_attempt, eps, delta, outcome, b_left_rules, N, b_mse_left, criteria)
result_right = self._split_exposure_hajek_eht(node_id*2+1, df, probabilities, feature_set, max_attempt, eps, delta, outcome, b_right_rules, N, b_mse_right, criteria)
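# weight each child's MSE by its share of observations in this node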
result['mse'] = result_left['mse'] * 1.0 * len(b_left)/(len(b_left)+len(b_right)) + \
result_right['mse'] * 1.0 * len(b_right)/(len(b_left)+len(b_right))
result['feature'] = b_feature
result['threshold'] = b_threshold
result_left['hajek'] = b_average_left_hajek
result_right['hajek'] = b_average_right_hajek
result_left['hajek_se'] = b_average_left_hajek_se
result_right['hajek_se'] = b_average_right_hajek_se
result_left['N'] = len(b_left)
result_right['N'] = len(b_right)
result['left_result'] = result_left
result['right_result'] = result_right
return result
else:
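# no admissible split was found: this node becomes a leaf and keeps the current MSE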
result['mse'] = current_mse
return result
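# A minimal, hypothetical sketch of a root-level call to this splitter (the
# instance name, feature names, and criteria values below are assumptions for
# illustration; only the parameter names come from the signature above):
#
#   tree = partition._split_exposure_hajek_eht(
#       node_id=1, df=df, probabilities=probabilities,
#       feature_set=exposure_features, max_attempt=20,
#       eps=0.01, delta=0.05, outcome='y', rules=[], N=len(df),
#       current_mse=np.inf, criteria={'min_leaf_size': 50})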