in causalml/dataset/synthetic.py [0:0]
def get_synthetic_summary_holdout(synthetic_data_func, n=1000, valid_size=0.2, k=1):
    """Generate a summary for predictions on synthetic data for train and holdout using specified function

    Args:
        synthetic_data_func (function): synthetic data generation function
        n (int, optional): number of samples per simulation
        valid_size (float, optional): validation/holdout data size (fraction)
        k (int, optional): number of simulations

    Returns:
        (tuple): summary evaluation metrics of predictions for train and validation:

          - summary_train (pandas.DataFrame): training data evaluation summary
          - summary_validation (pandas.DataFrame): validation data evaluation summary
    """
    summaries_train = []
    summaries_validation = []

    for _ in range(k):
        preds_dict_train, preds_dict_valid = get_synthetic_preds_holdout(
            synthetic_data_func, n=n, valid_size=valid_size
        )
        summaries_train.append(_build_synthetic_summary(preds_dict_train))
        summaries_validation.append(_build_synthetic_summary(preds_dict_valid))

    # Element-wise average of the per-simulation summary frames across k runs.
    summary_train = sum(summaries_train) / k
    summary_validation = sum(summaries_validation) / k

    cols = ["Abs % Error of ATE", "MSE", "KL Divergence"]
    return summary_train[cols], summary_validation[cols]


def _build_synthetic_summary(preds_dict):
    """Build an ATE / MSE / KL-divergence summary for one predictions dict.

    Args:
        preds_dict (dict): mapping of learner label to a 1-d array of predicted
            treatment effects; must contain the actual effects under KEY_ACTUAL.
            Entries whose label contains KEY_GENERATED_DATA are skipped.

    Returns:
        pandas.DataFrame: one row per retained label with columns
            "ATE", "MSE", "Abs % Error of ATE" and "KL Divergence".
    """
    actuals = preds_dict[KEY_ACTUAL]

    summary = pd.DataFrame(
        {
            label: [preds.mean(), mse(preds, actuals)]
            for label, preds in preds_dict.items()
            if KEY_GENERATED_DATA not in label.lower()
        },
        index=["ATE", "MSE"],
    ).T
    summary["Abs % Error of ATE"] = np.abs(
        (summary["ATE"] / summary.loc[KEY_ACTUAL, "ATE"]) - 1
    )

    # KL divergence of each prediction distribution vs. the actuals, computed
    # over a shared histogram support trimmed to the 0.1 / 99.9 percentiles of
    # the pooled values; bin proportions are clipped to avoid log(0).
    for label in summary.index:
        stacked_values = np.hstack((preds_dict[label], actuals))
        stacked_low = np.percentile(stacked_values, 0.1)
        stacked_high = np.percentile(stacked_values, 99.9)
        bins = np.linspace(stacked_low, stacked_high, 100)

        distr = np.histogram(preds_dict[label], bins=bins)[0]
        distr = np.clip(distr / distr.sum(), 0.001, 0.999)
        true_distr = np.histogram(actuals, bins=bins)[0]
        true_distr = np.clip(true_distr / true_distr.sum(), 0.001, 0.999)
        summary.loc[label, "KL Divergence"] = entropy(distr, true_distr)

    return summary