in causalml/dataset/synthetic.py [0:0]
def _binned_kl_divergence(preds, actuals, n_edges=100):
    """Return the KL divergence between binned predictions and binned actuals.

    Both arrays are histogrammed over a shared set of bin edges spanning the
    0.1th to 99.9th percentile of their combined values.

    Args:
        preds (np.ndarray): predicted values
        actuals (np.ndarray): actual values to compare against
        n_edges (int, optional): number of bin edges (yields n_edges - 1 bins)

    Returns:
        (float): KL divergence of the predicted distribution from the actual one
    """
    stacked_values = np.hstack((preds, actuals))
    # Trim the extreme tails so a few outliers do not stretch the bin range.
    stacked_low, stacked_high = np.percentile(stacked_values, [0.1, 99.9])
    bins = np.linspace(stacked_low, stacked_high, n_edges)

    # Clip the bin frequencies away from 0 so that empty bins do not make the
    # divergence infinite.
    distr = np.histogram(preds, bins=bins)[0]
    distr = np.clip(distr / distr.sum(), 0.001, 0.999)
    true_distr = np.histogram(actuals, bins=bins)[0]
    true_distr = np.clip(true_distr / true_distr.sum(), 0.001, 0.999)

    # scipy.stats.entropy with two arguments computes the KL divergence.
    return entropy(distr, true_distr)


def get_synthetic_summary(synthetic_data_func, n=1000, k=1, estimators=None):
    """Generate a summary for predictions on synthetic data using specified function

    Runs ``k`` independent simulations; in each one, predictions are obtained
    from ``get_synthetic_preds`` and compared against the actual values. The
    per-simulation summaries are then averaged element-wise.

    Args:
        synthetic_data_func (function): synthetic data generation function
        n (int, optional): number of samples per simulation
        k (int, optional): number of simulations; must be at least 1
        estimators (dict, optional): forwarded to ``get_synthetic_preds``; an
            empty dict is passed when omitted

    Returns:
        (pd.DataFrame): one row per prediction label (including the actuals)
            with columns "Abs % Error of ATE" (computed as
            ``|ATE / actual ATE - 1|``), "MSE" and "KL Divergence", each
            averaged over the ``k`` simulations

    Raises:
        ValueError: if ``k`` is less than 1
    """
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))
    # Use a fresh dict per call rather than a shared mutable default argument.
    if estimators is None:
        estimators = {}

    summaries = []
    for _ in range(k):
        synthetic_preds = get_synthetic_preds(
            synthetic_data_func, n=n, estimators=estimators
        )
        actuals = synthetic_preds[KEY_ACTUAL]

        # Rows: prediction labels; columns: mean prediction (ATE) and MSE.
        # The generated-data entry is not a prediction array, so it is skipped.
        synthetic_summary = pd.DataFrame(
            {
                label: [preds.mean(), mse(preds, actuals)]
                for label, preds in synthetic_preds.items()
                if label != KEY_GENERATED_DATA
            },
            index=["ATE", "MSE"],
        ).T

        synthetic_summary["Abs % Error of ATE"] = np.abs(
            (synthetic_summary["ATE"] / synthetic_summary.loc[KEY_ACTUAL, "ATE"]) - 1
        )

        for label in synthetic_summary.index:
            synthetic_summary.loc[label, "KL Divergence"] = _binned_kl_divergence(
                synthetic_preds[label], actuals
            )

        summaries.append(synthetic_summary)

    # Element-wise average of the per-simulation summary frames.
    summary = sum(summaries) / k
    return summary[["Abs % Error of ATE", "MSE", "KL Divergence"]]