def plot_paired_ttests_per_dataset()

in src/alpaca_eval/plotting.py


import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


def plot_paired_ttests_per_dataset(df, is_print_values=False, is_add_alpaca_eval=False):
    """Plot one lower-triangular heatmap of paired t-test p-values (from `_get_ttest_df`) per dataset.

    `df` must contain at least `dataset` and `instruction` columns; `_get_ttest_df` and
    `plot_config` are helpers available in this module's namespace. Returns the Axes of
    the last heatmap drawn.
    """
    # Size of the smallest dataset; used as n_samples so every panel's tests use the same n.
    min_dataset_size = df.drop_duplicates("instruction").groupby("dataset")["instruction"].count().min()

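    # Build one p-value matrix per dataset.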
    all_pvalues = dict()
    for d in df["dataset"].unique():
        df_sub = df.query(f"dataset=='{d}'")
        all_pvalues[d] = _get_ttest_df(df_sub, n_samples=min_dataset_size)

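    # Optionally add one panel computed over all datasets combined ("AlpacaEval").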
    if is_add_alpaca_eval:
        all_pvalues["AlpacaEval"] = _get_ttest_df(df, n_samples=min_dataset_size)

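    # Optionally print summary statistics (mean / max p-value) for each matrix.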
    if is_print_values:
        for i, (key, curr_df) in enumerate(all_pvalues.items()):
            print(key, f"mean p-val: {curr_df.mean(axis=None):.3f}", f"max p-val: {curr_df.max(axis=None):.3f}")

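    # Fixed 2x3 grid: assumes at most six panels (datasets + optional combined panel).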
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(23, 15))

    with plot_config(font_scale=0.5):
        for i, (key, curr_df) in enumerate(all_pvalues.items()):
            ax = axes[i // 3][i % 3]
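            # Draw the p-value heatmap, masking the upper triangle so only the lower half is shown.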
            g = sns.heatmap(
                curr_df,
                annot=True,
                fmt=".2f",
                cbar=False,
                square=True,
                xticklabels=False,
                ax=ax,
                mask=np.triu(np.ones_like(curr_df, dtype=bool)),
            )
            ax.set_title(key + f" n={min_dataset_size}", fontsize=20)
            g.set(xlabel="", ylabel="")

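        # Hide any grid cells that have no corresponding p-value matrix.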
        for i in range(len(all_pvalues), axes.size):
            ax = axes.flatten()[i]
            ax.set_visible(False)

        # adjust spacing between plots
        plt.tight_layout()

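    # Display the figure and return the Axes of the last heatmap drawn.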
    plt.show()
    return g
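
A minimal usage sketch. The input file name and any columns beyond `dataset` and `instruction` are assumptions, since the columns consumed by `_get_ttest_df` are not shown here:

import pandas as pd

from alpaca_eval.plotting import plot_paired_ttests_per_dataset

# Hypothetical annotations file; its exact schema beyond `dataset`/`instruction` is an assumption.
df = pd.read_json("annotations.json")
assert {"dataset", "instruction"}.issubset(df.columns)

ax = plot_paired_ttests_per_dataset(df, is_print_values=True, is_add_alpaca_eval=True)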