def _split_exposure_hajek

def _split_exposure_hajek_eht()

in causalPartition.py [0:0]
143 lines of code
60 McCabe index (conditional complexity)

    def _split_exposure_hajek_eht(self, node_id, df, probabilities, feature_set, max_attempt, eps, delta, outcome, rules, N, current_mse, criteria):
        """
        the actual splitting implementation for non-separate tree; 
        recursion
        """
        b_feature = ''
        b_threshold = 0
        b_left = None
        b_right = None
        b_average_left_hajek = 0
        b_average_right_hajek = 0
        b_mse = 10000000000.0

        ranges = {}
        
        for feature in feature_set:
            gc.collect()
            # find the more compact valid region
            upper = 1.
            lower = 0.
            for rule in rules:
                if rule[0] == feature:
                    if rule[1] == 0:
                        lower = np.maximum(rule[2], lower)
                    else:
                        upper = np.minimum(rule[2], upper)
            
            if lower > upper:
                continue
                
            for k in range(max_attempt):
                threshold = np.random.uniform(lower, upper)
                
                cz_l_1 = self._contain_zero(probabilities, rules+[(feature, 0, threshold)], eps, delta, treated=1)
                cz_r_1 = self._contain_zero(probabilities, rules+[(feature, 1, threshold)], eps, delta, treated=1)
                cz_l_0 = self._contain_zero(probabilities, rules+[(feature, 0, threshold)], eps, delta, treated=0)
                cz_r_0 = self._contain_zero(probabilities, rules+[(feature, 1, threshold)], eps, delta, treated=0)
                
                if np.mean(cz_l_1) > delta or np.mean(cz_r_1) > delta or np.mean(cz_r_0) > delta or np.mean(cz_r_0) > delta:
                    continue

                idxs_left = np.product([df[key] <= th for key, sign, th in rules if sign == 0] + \
                       [df[key] > th for key, sign, th in rules if sign == 1] + \
                        [df[feature] <= threshold],
                       axis=0) > 0

                idxs_right = np.product([df[key] <= th for key, sign, th in rules if sign == 0] + \
                       [df[key] > th for key, sign, th in rules if sign == 1] + \
                        [df[feature] > threshold],
                       axis=0) > 0

                left = df[idxs_left]
                right = df[idxs_right]
                
                # propensity score for left partition + ego treated
                propensities_left_1 = np.mean(np.product([probabilities[key][idxs_left] <= th for key, sign, th in rules if sign == 0] + \
                            [probabilities[key][idxs_left] > th for key, sign, th in rules if sign == 1] + \
                            [probabilities[feature][idxs_left] <= threshold] + \
                            [probabilities[self.treatment][idxs_left] == 1],
                           axis=0), axis=1)
                
                # propensity score for left partition + ego non treated
                propensities_left_0 = np.mean(np.product([probabilities[key][idxs_left] <= th for key, sign, th in rules if sign == 0] + \
                            [probabilities[key][idxs_left] > th for key, sign, th in rules if sign == 1] + \
                            [probabilities[feature][idxs_left] <= threshold] + \
                            [probabilities[self.treatment][idxs_left] == 0],
                           axis=0), axis=1)
                
                propensities_right_1 = np.mean(np.product([probabilities[key][idxs_right] <= th for key, sign, th in rules if sign == 0] + \
                            [probabilities[key][idxs_right] > th for key, sign, th in rules if sign == 1] + \
                            [probabilities[feature][idxs_right] > threshold] + \
                            [probabilities[self.treatment][idxs_right] == 1],
                           axis=0), axis=1)

                propensities_right_0 = np.mean(np.product([probabilities[key][idxs_right] <= th for key, sign, th in rules if sign == 0] + \
                            [probabilities[key][idxs_right] > th for key, sign, th in rules if sign == 1] + \
                            [probabilities[feature][idxs_right] > threshold] + \
                            [probabilities[self.treatment][idxs_right] == 0],
                           axis=0), axis=1)
                
                # filter those whose propensities scores are very small (This may lead to lose observations)
                idxs_left_filter = np.logical_and(propensities_left_1 > eps, propensities_left_0 > eps)
                left = left[idxs_left_filter]
                propensities_left_1 = propensities_left_1[idxs_left_filter]
                propensities_left_0 = propensities_left_0[idxs_left_filter]
                
                # filter those whose propensities scores are very small (This may lead to lose observations)
                idxs_right_filter = np.logical_and(propensities_right_1 > eps, propensities_right_0 > eps)
                right = right[idxs_right_filter]
                propensities_right_1 = propensities_right_1[idxs_right_filter]                
                propensities_right_0 = propensities_right_0[idxs_right_filter]                
                
                if np.mean(left[self.treatment]) == 0 or np.mean(left[self.treatment]) == 1 or \
                    np.mean(right[self.treatment]) == 0 or np.mean(right[self.treatment]) == 1:
                    continue
                
                if len(left) == 0 or len(right) == 0:
                    continue
                
                # The covariate implementation does not work as expected; should always be None

                mod_left = sm.WLS(left[outcome], sm.add_constant(left[[self.treatment]]), \
                    weights=1.0 / propensities_left_1 * left[self.treatment] + 1.0 / propensities_left_0 * (1-left[self.treatment]))
                res_left = mod_left.fit()

                mod_right = sm.WLS(right[outcome], sm.add_constant(right[self.treatment]), \
                    weights=1.0 / propensities_right_1 * right[self.treatment] + 1.0 / propensities_right_0 * (1-right[self.treatment]))
                res_right = mod_right.fit()
                
                average_left_hajek = res_left.params[1] 
                average_right_hajek = res_right.params[1]

                average_left_hajek_se = res_left.bse[1]  
                average_right_hajek_se = res_right.bse[1]
                # need further improvement
                
                mse_left = np.sum((1.0 / propensities_left_1 * left[self.treatment] + 1.0 / propensities_left_0 * (1-left[self.treatment])) * 
                                  ((res_left.resid) ** 2))
                mse_right = np.sum((1.0 / propensities_right_1 * right[self.treatment] + 1.0 / propensities_right_0 * (1-right[self.treatment])) * 
                                   ((res_right.resid) ** 2))
                mse = mse_left * 1.0 * len(left)/(len(left)+len(right)) + mse_right * 1.0 * len(right)/(len(left)+len(right))

                if mse < b_mse:
                    flag = True
                    assert len(criteria) > 0
                    if 'non_trivial_reduction' in criteria:
                        if not (mse < current_mse - criteria['non_trivial_reduction']):
                            flag = False
                    if 'reasonable_propensity' in criteria:
                        if not (np.abs(np.sum(1.0 / propensities_left_1 * left[self.treatment])/len(df) - 1.0) <= criteria['reasonable_propensity'] \
                                and \
                                np.abs(np.sum(1.0 / propensities_right_1 * right[self.treatment])/len(df) - 1.0) <= criteria['reasonable_propensity'] \
                                and \
                                np.abs(np.sum(1.0 / propensities_left_0 * (1 - left[self.treatment]))/len(df) - 1.0) <= criteria['reasonable_propensity'] \
                                and \
                                np.abs(np.sum(1.0 / propensities_right_0 * (1 - right[self.treatment]))/len(df) - 1.0) <= criteria['reasonable_propensity']
                               ):
                            flag = False
                    if 'separate_reduction' in criteria:
                        if not (mse_left < current_mse and mse_right < current_mse):
                            flag = False
                    if 'min_leaf_size' in criteria:
                        if not (len(left) >= criteria['min_leaf_size'] and len(right) >= criteria['min_leaf_size']):
                            flag = False
                    if flag:
                        b_feature = feature
                        b_mse = mse
                        b_mse_left = mse_left
                        b_mse_right = mse_right
                        b_threshold = threshold
                        b_average_left_hajek = average_left_hajek
                        b_average_right_hajek = average_right_hajek
                        b_average_left_hajek_se = average_left_hajek_se
                        b_average_right_hajek_se = average_right_hajek_se
                        b_left = left
                        b_right = right
                        b_left_rules = rules + [(feature, 0, threshold)]
                        b_right_rules = rules + [(feature, 1, threshold)]
        
        result = {}
        if b_feature != '':
            # if find a valid partition
            result_left = self._split_exposure_hajek_eht(node_id*2, df, probabilities, feature_set, max_attempt, eps, delta, outcome, b_left_rules, N, b_mse_left, criteria)       
            result_right = self._split_exposure_hajek_eht(node_id*2+1, df, probabilities, feature_set, max_attempt, eps, delta, outcome, b_right_rules, N, b_mse_right, criteria)

            result['mse'] = result_left['mse'] * 1.0 * len(b_left)/(len(b_left)+len(b_right)) + \
                        result_right['mse'] * 1.0 * len(b_right)/(len(b_left)+len(b_right))
            result['feature'] = b_feature
            result['threshold'] = b_threshold
            result_left['hajek'] = b_average_left_hajek
            result_right['hajek'] = b_average_right_hajek
            result_left['hajek_se'] = b_average_left_hajek_se
            result_right['hajek_se'] = b_average_right_hajek_se
            result_left['N'] = len(b_left)
            result_right['N'] = len(b_right)
            result['left_result'] = result_left
            result['right_result'] = result_right
            return result
        else:
            result['mse'] = current_mse
            return result