def get_synthetic_summary()

in causalml/dataset/synthetic.py [0:0]


def get_synthetic_summary(synthetic_data_func, n=1000, k=1, estimators=None):
    """Generate a summary for predictions on synthetic data using specified function

    Runs ``k`` simulations through ``get_synthetic_preds`` and averages the
    per-label metrics across them.

    Args:
        synthetic_data_func (function): synthetic data generation function
        n (int, optional): number of samples per simulation
        k (int, optional): number of simulations; must be at least 1
        estimators (dict, optional): forwarded to ``get_synthetic_preds``.
            ``None`` (the default) is forwarded as a new empty dict.

    Returns:
        (pd.DataFrame): one row per prediction label (the ``KEY_ACTUAL`` row
            included) with the columns "Abs % Error of ATE", "MSE" and
            "KL Divergence", each averaged over the ``k`` simulations. Note
            that "Abs % Error of ATE" is stored as a fraction
            (``|ATE / actual ATE - 1|``), not multiplied by 100.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    # Fail early with a clear message instead of the ZeroDivisionError that
    # averaging an empty list of summaries would otherwise produce.
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    # A fresh dict per call avoids sharing one mutable default object between
    # calls (and with whatever get_synthetic_preds does with it).
    if estimators is None:
        estimators = {}

    summaries = []

    for _ in range(k):
        synthetic_preds = get_synthetic_preds(
            synthetic_data_func, n=n, estimators=estimators
        )
        actuals = synthetic_preds[KEY_ACTUAL]

        # One row per label; the KEY_GENERATED_DATA entry is skipped because
        # it is not a prediction vector.
        synthetic_summary = pd.DataFrame(
            {
                label: [preds.mean(), mse(preds, actuals)]
                for label, preds in synthetic_preds.items()
                if label != KEY_GENERATED_DATA
            },
            index=["ATE", "MSE"],
        ).T

        # Relative deviation of each label's ATE from the actual ATE.
        synthetic_summary["Abs % Error of ATE"] = np.abs(
            (synthetic_summary["ATE"] / synthetic_summary.loc[KEY_ACTUAL, "ATE"]) - 1
        )

        for label in synthetic_summary.index:
            # Shared bin edges for predictions and actuals, spanning the
            # 0.1th to 99.9th percentile of the pooled values so that extreme
            # outliers do not stretch the histogram range.
            stacked_values = np.hstack((synthetic_preds[label], actuals))
            stacked_low = np.percentile(stacked_values, 0.1)
            stacked_high = np.percentile(stacked_values, 99.9)
            bins = np.linspace(stacked_low, stacked_high, 100)

            # Normalised histograms, clipped away from 0 and 1 so empty bins
            # do not yield infinite/undefined terms in the divergence.
            distr = np.histogram(synthetic_preds[label], bins=bins)[0]
            distr = np.clip(distr / distr.sum(), 0.001, 0.999)
            true_distr = np.histogram(actuals, bins=bins)[0]
            true_distr = np.clip(true_distr / true_distr.sum(), 0.001, 0.999)

            # NOTE(review): `entropy` is presumably scipy.stats.entropy, which
            # returns the KL divergence when given two distributions - confirm
            # against the file's imports.
            kl = entropy(distr, true_distr)
            synthetic_summary.loc[label, "KL Divergence"] = kl

        summaries.append(synthetic_summary)

    # Element-wise mean of the per-simulation summary frames.
    summary = sum(summaries) / k
    return summary[["Abs % Error of ATE", "MSE", "KL Divergence"]]