def _estimate_effect()

in dowhy/causal_estimators/propensity_score_stratification_estimator.py [0:0]
66 lines of code
19 McCabe index (conditional complexity)

    def _estimate_effect(self):
        if self.recalculate_propensity_score is True:
            if self.propensity_score_model is None:
                self.propensity_score_model = linear_model.LogisticRegression()
            self.propensity_score_model.fit(self._observed_common_causes, self._treatment)
            self._data[self.propensity_score_column] = self.propensity_score_model.predict_proba(self._observed_common_causes)[:, 1]
        else:
            # check if the user provides the propensity score column
            if self.propensity_score_column not in self._data.columns:
                raise ValueError(f"Propensity score column {self.propensity_score_column} does not exist. Please specify the column name that has your pre-computed propensity score.")
            else:
                self.logger.info(f"Using pre-computed propensity score incolumn {self.propensity_score_column}")
        clipped = None
        # Infer the right strata based on clipping threshold
        if self.num_strata == "auto":
            # 0.5 because there are two values for the treatment
            clipping_t = self.clipping_threshold
            num_strata = 0.5 * self._data.shape[0] / clipping_t
            # To be conservative and allow most strata to be included in the
            # analysis
            strata_found = False
            while not strata_found:
                self.logger.info("'num_strata' selected as {}".format(num_strata))
                try:
                    clipped = self._get_strata(num_strata, self.clipping_threshold)
                    num_ret_strata = clipped.groupby(['strata']).count().reset_index()
                    # At least 90% of the strata should be included in analysis
                    if num_ret_strata.shape[0] >= 0.5 * num_strata:
                        strata_found = True
                    else:
                        num_strata = int(num_strata / 2)
                        self.logger.info(f"Less than half the strata have at least {self.clipping_threshold} data points. Selecting fewer number of strata.")
                        if num_strata < 2:
                            raise ValueError("Not enough data to generate at least two strata. This error may be due to a high value of 'clipping_threshold'.")
                except ValueError:
                    self.logger.info("No strata found with at least {} data points. Selecting fewer number of strata".format(self.clipping_threshold))
                    num_strata = int(num_strata / 2)
                    if num_strata < 2:
                        raise ValueError("Not enough data to generate at least two strata. This error may be due to a high value of 'clipping_threshold'.")
        else:
            clipped = self._get_strata(self.num_strata, self.clipping_threshold)

        # sum weighted outcomes over all strata  (weight by treated population)
        weighted_outcomes = clipped.groupby('strata').agg({
            self._treatment_name[0]: ['sum'],
            'dbar': ['sum'],
            'd_y': ['sum'],
            'dbar_y': ['sum']
        })
        weighted_outcomes.columns = ["_".join(x) for x in weighted_outcomes.columns.to_numpy().ravel()]
        treatment_sum_name = self._treatment_name[0] + "_sum"
        control_sum_name = "dbar_sum"

        weighted_outcomes['d_y_mean'] = weighted_outcomes['d_y_sum'] / weighted_outcomes[treatment_sum_name]
        weighted_outcomes['dbar_y_mean'] = weighted_outcomes['dbar_y_sum'] / weighted_outcomes['dbar_sum']
        weighted_outcomes['effect'] = weighted_outcomes['d_y_mean'] - weighted_outcomes['dbar_y_mean']
        total_treatment_population = weighted_outcomes[treatment_sum_name].sum()
        total_control_population = weighted_outcomes[control_sum_name].sum()
        total_population = total_treatment_population + total_control_population
        self.logger.debug("Total number of data points is {0}, including {1} from treatment and {2} from control.". format(total_population, total_treatment_population, total_control_population))

        if self._target_units=="att":
            est = (weighted_outcomes['effect'] * weighted_outcomes[treatment_sum_name]).sum() / total_treatment_population
        elif self._target_units=="atc":
            est = (weighted_outcomes['effect'] * weighted_outcomes[control_sum_name]).sum() / total_control_population
        elif self._target_units == "ate":
            est = (weighted_outcomes['effect'] * (weighted_outcomes[control_sum_name]+weighted_outcomes[treatment_sum_name])).sum() / total_population
        else:
            raise ValueError("Target units string value not supported")

        # TODO - how can we add additional information into the returned estimate?
        #        such as how much clipping was done, or per-strata info for debugging?
        estimate = CausalEstimate(estimate=est,
                                  control_value=self._control_value,
                                  treatment_value=self._treatment_value,
                                  target_estimand=self._target_estimand,
                                  realized_estimand_expr=self.symbolic_estimator,
                                  propensity_scores = self._data[self.propensity_score_column])
        return estimate