in dowhy/causal_refuters/dummy_outcome_refuter.py
def refute_estimate(self):
# We need to change the outcome variable of the identified estimand.
# We therefore make a deep copy so that we do not modify the original estimand.
identified_estimand = copy.deepcopy(self._target_estimand)
identified_estimand.outcome_variable = ["dummy_outcome"]
self.logger.info("Refutation over {} simulated datasets".format(self._num_simulations) )
self.logger.info("The transformation passed: {}".format(self._transformation_list) )
simulation_results = []
refute_list = []
# We use collections.OrderedDict to maintain the order in which the data is stored
causal_effect_map = OrderedDict()
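# Maps each base treatment category (or None when no estimator is used) to its true causal effect h(t)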
# Check if we are using an estimator in the transformation list
estimator_present = self._has_estimator()
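# An estimator in the transformation list requires a train/validation split; without one, the whole dataset is used for validation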
# The loops are ordered this way because randomness is induced every time the train
# and validation datasets are created. By running the simulation loop outermost,
# followed by the training and validation steps, we obtain a different estimator
# (and hence a different estimate) in every simulation.
for _ in range( self._num_simulations ):
estimates = []
if not estimator_present:
# Warn the user that the specified parameter is not applicable when no estimator is present in the transformation list
if self._test_fraction != DummyOutcomeRefuter.DEFAULT_TEST_FRACTION:
self.logger.warning("'test_fraction' is not applicable as there is no base treatment value.")
# Adding an unobserved confounder if provided by the user
if self._unobserved_confounder_values is not None:
self._data['simulated'] = self._unobserved_confounder_values
self._chosen_variables.append('simulated')
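# The simulated confounder is appended to the chosen variables, so it feeds into the transformations applied below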
# With no estimator there is no training step, so X_train and outcome_train are None and the full dataset serves as the validation set
validation_df = self._data
X_train = None
outcome_train = None
X_validation_df = validation_df[self._chosen_variables]
X_validation = X_validation_df.values
outcome_validation = validation_df[self._outcome_name_str].values
# Get the final outcome, after running through all the values in the transformation list
outcome_validation = self.process_data(X_train, outcome_train, X_validation, outcome_validation, self._transformation_list)
# Check if the value of true effect has been already stored
# We use None as the key as we have no base category for this refutation
if None not in causal_effect_map:
# As we currently support only one treatment
causal_effect_map[None] = self._true_causal_effect( validation_df[ self._treatment_name[0] ] )
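# Add h(t) to f(W) to get the dummy outcome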
outcome_validation += causal_effect_map[None]
new_data = validation_df.assign(dummy_outcome=outcome_validation)
new_estimator = CausalEstimator.get_estimator_object(new_data, identified_estimand, self._estimate)
new_effect = new_estimator.estimate_effect()
estimates.append(new_effect.value)
else:
groups = self.preprocess_data_by_treatment()
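# The data is grouped by treatment value; each category serves, in turn, as the base (training) category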
group_count = 0
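# If a single test fraction is supplied, reuse it for every treatment group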
if len(self._test_fraction) == 1:
self._test_fraction = len(groups) * self._test_fraction
for key_train, _ in groups:
base_train = groups.get_group(key_train).sample(frac=self._test_fraction[group_count].base)
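# Rows of the base group that were not sampled into base_train form its validation split; the set difference keeps the two splits disjoint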
train_set = set( [ tuple(line) for line in base_train.values ] )
total_set = set( [ tuple(line) for line in groups.get_group(key_train).values ] )
base_validation = pd.DataFrame( list( total_set.difference(train_set) ), columns=base_train.columns )
X_train_df = base_train[self._chosen_variables]
X_train = X_train_df.values
outcome_train = base_train[self._outcome_name_str].values
validation_df = []
transformation_list = self._transformation_list
validation_df.append(base_validation)
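# Pool the base group's held-out rows with samples drawn from every other treatment group to build the validation data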
for key_validation, _ in groups:
if key_validation != key_train:
validation_df.append(groups.get_group(key_validation).sample(frac=self._test_fraction[group_count].other))
validation_df = pd.concat(validation_df)
X_validation_df = validation_df[self._chosen_variables]
X_validation = X_validation_df.values
outcome_validation = validation_df[self._outcome_name_str].values
# If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
if X_train.shape[0] <= self._min_data_point_threshold:
transformation_list = DummyOutcomeRefuter.DEFAULT_TRANSFORMATION
self.logger.warning("The number of data points in X_train:{} for category:{} is less than threshold:{}".format(X_train.shape[0], key_train, self._min_data_point_threshold))
self.logger.warning("Therefore, defaulting to the minimal set of transformations:{}".format(transformation_list))
outcome_validation = self.process_data(X_train, outcome_train, X_validation, outcome_validation, transformation_list)
# Check if the value of true effect has been already stored
# This ensures that we calculate the causal effect only once.
# We use key_train as we map data with respect to the base category of the data
if key_train not in causal_effect_map:
# As we currently support only one treatment
causal_effect_map[key_train] = self._true_causal_effect( validation_df[ self._treatment_name[0] ] )
# Add h(t) to f(W) to get the dummy outcome
outcome_validation += causal_effect_map[key_train]
new_data = validation_df.assign(dummy_outcome=outcome_validation)
new_estimator = CausalEstimator.get_estimator_object(new_data, identified_estimand, self._estimate)
new_effect = new_estimator.estimate_effect()
estimates.append(new_effect.value)
group_count += 1
simulation_results.append(estimates)
# We convert to an ndarray for ease of indexing
# The data is of the form
# sim1: cat1 cat2 ... catn
# sim2: cat1 cat2 ... catn
simulation_results = np.array(simulation_results)
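# simulation_results now has shape (num_simulations, number of base categories); with no estimator there is a single column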
# Note: We would like the causal estimator to recover the true causal effect that we
# have injected through this refuter. Let the value of the true causal effect be h(t).
# In the following section we check whether h(t) falls within the distribution of
# estimates produced by the refuter.
if not estimator_present:
dummy_estimate = CausalEstimate(
estimate=causal_effect_map[None],
control_value=self._estimate.control_value,
treatment_value=self._estimate.treatment_value,
target_estimand=self._estimate.target_estimand,
realized_estimand_expr=self._estimate.realized_estimand_expr)
refute = CausalRefutation(
dummy_estimate.value,
np.mean(simulation_results),
refutation_type="Refute: Use a Dummy Outcome"
)
refute.add_significance_test_results(
self.test_significance(dummy_estimate, np.ravel(simulation_results))
)
refute.add_refuter(self)
refute_list.append(refute)
else:
# True Causal Effect list
causal_effect_list = list( causal_effect_map.values() )
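# The OrderedDict preserves insertion order, so the i-th entry of causal_effect_list corresponds to the i-th column of simulation_results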
# Iterating through the refutation for each category
for train_category in range(simulation_results.shape[1]):
dummy_estimate = CausalEstimate(
estimate=causal_effect_list[train_category],
control_value=self._estimate.control_value,
treatment_value=self._estimate.treatment_value,
target_estimand=self._estimate.target_estimand,
realized_estimand_expr=self._estimate.realized_estimand_expr)
refute = CausalRefutation(
dummy_estimate.value,
np.mean(simulation_results[:, train_category]),
refutation_type="Refute: Use a Dummy Outcome"
)
refute.add_significance_test_results(
self.test_significance(dummy_estimate, simulation_results[:, train_category])
)
refute.add_refuter(self)
refute_list.append(refute)
return refute_list
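# Usage sketch (illustrative only; the variable names below are assumed, not part of this module):
# this refuter is typically invoked through CausalModel.refute_estimate with
# method_name="dummy_outcome_refuter", which returns the list of CausalRefutation
# objects built above, e.g.
#
#   refutations = model.refute_estimate(
#       identified_estimand,
#       estimate,
#       method_name="dummy_outcome_refuter",
#   )
#   for refutation in refutations:
#       print(refutation)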