in dowhy/causal_refuters/dummy_outcome_refuter.py
def refute_estimate(self):
# We need to change the outcome variable of the identified estimand.
# We therefore make a deep copy so that we do not modify the original estimand.
identified_estimand = copy.deepcopy(self._target_estimand)
identified_estimand.outcome_variable = ["dummy_outcome"]
self.logger.info("Refutation over {} simulated datasets".format(self._num_simulations) )
self.logger.info("The transformation passed: {}".format(self._transformation_list) )
simulation_results = []
refute_list = []
# We use collections.OrderedDict to maintain the order in which the data is stored
causal_effect_map = OrderedDict()
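# Maps each base treatment category (or None when no estimator is used) to its true causal effect h(t)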
# Check if we are using an estimator in the transformation list
estimator_present = self._has_estimator()
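# An estimator in the transformation list requires a train/validation split; without one, the whole dataset is used for validation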
# The loops are ordered this way because randomness is induced every time the train
# and validation datasets are created. By running the simulation loop outermost,
# followed by the training and validation steps, we obtain a different estimator
# (and hence a different estimate) in every simulation.
for _ in range( self._num_simulations ):
estimates = []
if not estimator_present:
# Warn the user that the specified parameter is not applicable when no estimator is present in the transformation list
if self._test_fraction != DummyOutcomeRefuter.DEFAULT_TEST_FRACTION:
self.logger.warning("'test_fraction' is not applicable as there is no base treatment value.")
# Adding an unobserved confounder if provided by the user
if self._unobserved_confounder_values is not None:
self._data['simulated'] = self._unobserved_confounder_values
self._chosen_variables.append('simulated')
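# The simulated confounder is appended to the chosen variables, so it feeds into the transformations applied below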
# With no estimator there is no training step, so X_train and outcome_train are None and the full dataset serves as the validation set
validation_df = self._data
X_train = None
outcome_train = None
X_validation_df = validation_df[self._chosen_variables]
X_validation = X_validation_df.values
outcome_validation = validation_df[self._outcome_name_str].values
# Get the final outcome, after running through all the values in the transformation list
outcome_validation = self.process_data(X_train, outcome_train, X_validation, outcome_validation, self._transformation_list)
# Check if the value of true effect has been already stored
# We use None as the key as we have no base category for this refutation
if None not in causal_effect_map:
# As we currently support only one treatment
causal_effect_map[None] = self._true_causal_effect( validation_df[ self._treatment_name[0] ] )
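# Add h(t) to f(W) to get the dummy outcome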
outcome_validation += causal_effect_map[None]
new_data = validation_df.assign(dummy_outcome=outcome_validation)
new_estimator = CausalEstimator.get_estimator_object(new_data, identified_estimand, self._estimate)
new_effect = new_estimator.estimate_effect()
estimates.append(new_effect.value)
else:
groups = self.preprocess_data_by_treatment()
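# The data is grouped by treatment value; each category serves, in turn, as the base (training) category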
group_count = 0
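# If a single test fraction is supplied, reuse it for every treatment group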
if len(self._test_fraction) == 1:
self._test_fraction = len(groups) * self._test_fraction
for key_train, _ in groups:
base_train = groups.get_group(key_train).sample(frac=self._test_fraction[group_count].base)
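# Rows of the base group that were not sampled into base_train form its validation split; the set difference keeps the two splits disjoint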
train_set = set( [ tuple(line) for line in base_train.values ] )
total_set = set( [ tuple(line) for line in groups.get_group(key_train).values ] )
base_validation = pd.DataFrame( list( total_set.difference(train_set) ), columns=base_train.columns )
X_train_df = base_train[self._chosen_variables]
X_train = X_train_df.values
outcome_train = base_train[self._outcome_name_str].values
validation_df = []
transformation_list = self._transformation_list
validation_df.append(base_validation)
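# Pool the base group's held-out rows with samples drawn from every other treatment group to build the validation data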
for key_validation, _ in groups:
if key_validation != key_train:
validation_df.append(groups.get_group(key_validation).sample(frac=self._test_fraction[group_count].other))
validation_df = pd.concat(validation_df)
X_validation_df = validation_df[self._chosen_variables]
X_validation = X_validation_df.values
outcome_validation = validation_df[self._outcome_name_str].values
# If the number of data points is too few, run the default transformation: [("zero",""),("noise", {'std_dev':1} )]
if X_train.shape[0] <= self._min_data_point_threshold:
transformation_list = DummyOutcomeRefuter.DEFAULT_TRANSFORMATION
self.logger.warning("The number of data points in X_train:{} for category:{} is less than threshold:{}".format(X_train.shape[0], key_train, self._min_data_point_threshold))
self.logger.warning("Therefore, defaulting to the minimal set of transformations:{}".format(transformation_list))
outcome_validation = self.process_data(X_train, outcome_train, X_validation, outcome_validation, transformation_list)
# Check if the value of true effect has been already stored
# This ensures that we calculate the causal effect only once.
# We use key_train as we map data with respect to the base category of the data
if key_train not in causal_effect_map:
# As we currently support only one treatment
causal_effect_map[key_train] = self._true_causal_effect( validation_df[ self._treatment_name[0] ] )
# Add h(t) to f(W) to get the dummy outcome
outcome_validation += causal_effect_map[key_train]
new_data = validation_df.assign(dummy_outcome=outcome_validation)
new_estimator = CausalEstimator.get_estimator_object(new_data, identified_estimand, self._estimate)
new_effect = new_estimator.estimate_effect()
estimates.append(new_effect.value)
group_count += 1
simulation_results.append(estimates)
# We convert to an ndarray for ease of indexing
# The data is of the form
# sim1: cat1 cat2 ... catn
# sim2: cat1 cat2 ... catn
simulation_results = np.array(simulation_results)
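# simulation_results now has shape (num_simulations, number of base categories); with no estimator there is a single column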
# Note: We would like the causal estimator to recover the true causal effect that we
# have injected through this refuter. Let the value of the true causal effect be h(t).
# In the following section we check whether h(t) falls within the distribution of
# estimates produced by the refuter.
if not estimator_present:
dummy_estimate = CausalEstimate(
estimate=causal_effect_map[None],
control_value=self._estimate.control_value,
treatment_value=self._estimate.treatment_value,
target_estimand=self._estimate.target_estimand,
realized_estimand_expr=self._estimate.realized_estimand_expr)
refute = CausalRefutation(
dummy_estimate.value,
np.mean(simulation_results),
refutation_type="Refute: Use a Dummy Outcome"
)
refute.add_significance_test_results(
self.test_significance(dummy_estimate, np.ravel(simulation_results))
)
refute.add_refuter(self)
refute_list.append(refute)
else:
# True Causal Effect list
causal_effect_list = list( causal_effect_map.values() )
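# The OrderedDict preserves insertion order, so the i-th entry of causal_effect_list corresponds to the i-th column of simulation_results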
# Iterating through the refutation for each category
for train_category in range(simulation_results.shape[1]):
dummy_estimate = CausalEstimate(
estimate=causal_effect_list[train_category],
control_value=self._estimate.control_value,
treatment_value=self._estimate.treatment_value,
target_estimand=self._estimate.target_estimand,
realized_estimand_expr=self._estimate.realized_estimand_expr)
refute = CausalRefutation(
dummy_estimate.value,
np.mean(simulation_results[:, train_category]),
refutation_type="Refute: Use a Dummy Outcome"
)
refute.add_significance_test_results(
self.test_significance(dummy_estimate, simulation_results[:, train_category])
)
refute.add_refuter(self)
refute_list.append(refute)
return refute_list
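# Usage sketch (illustrative only; the variable names below are assumed, not part of this module):
# this refuter is typically invoked through CausalModel.refute_estimate with
# method_name="dummy_outcome_refuter", which returns the list of CausalRefutation
# objects built above, e.g.
#
#   refutations = model.refute_estimate(
#       identified_estimand,
#       estimate,
#       method_name="dummy_outcome_refuter",
#   )
#   for refutation in refutations:
#       print(refutation)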