in src/alpaca_eval/plotting.py
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# `_get_ttest_df` and `plot_config` are helpers defined elsewhere in this module.


def plot_paired_ttests_per_dataset(df, is_print_values=False, is_add_alpaca_eval=False):
    """Plot a heatmap of paired t-test p-values for each dataset in `df`.

    Every dataset is subsampled to the size of the smallest one so that all
    t-tests are computed on the same number of instructions.
    """
    min_dataset_size = df.drop_duplicates("instruction").groupby("dataset")["instruction"].count().min()

    # Compute the matrix of pairwise p-values separately for each dataset.
    all_pvalues = dict()
    for d in df["dataset"].unique():
        df_sub = df.query("dataset == @d")  # `@d` avoids quoting issues with f-string queries
        all_pvalues[d] = _get_ttest_df(df_sub, n_samples=min_dataset_size)

    if is_add_alpaca_eval:
        # Also compute p-values on the pooled data across all datasets.
        all_pvalues["AlpacaEval"] = _get_ttest_df(df, n_samples=min_dataset_size)

    if is_print_values:
        for key, curr_df in all_pvalues.items():
            print(key, f"mean p-val: {curr_df.mean(axis=None):.3f}", f"max p-val: {curr_df.max(axis=None):.3f}")

    # The 2x3 grid assumes at most 6 heatmaps (datasets + optional "AlpacaEval").
    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(23, 15))
    with plot_config(font_scale=0.5):
        for i, (key, curr_df) in enumerate(all_pvalues.items()):
            ax = axes[i // 3][i % 3]
            g = sns.heatmap(
                curr_df,
                annot=True,
                fmt=".2f",
                cbar=False,
                square=True,
                xticklabels=False,
                ax=ax,
                # The p-value matrix is symmetric, so only show the lower triangle.
                mask=np.triu(np.ones_like(curr_df, dtype=bool)),
            )
            ax.set_title(key + f" n={min_dataset_size}", fontsize=20)
            g.set(xlabel="", ylabel="")

    # Hide any unused subplots in the grid.
    for i in range(len(all_pvalues), axes.size):
        axes.flatten()[i].set_visible(False)

    # Adjust spacing between plots.
    plt.tight_layout()
    plt.show()
    return g
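

# Usage sketch (illustrative only; the file name and any columns beyond
# "dataset" and "instruction" are assumptions, not part of this module):
#
#     import pandas as pd
#     from alpaca_eval.plotting import plot_paired_ttests_per_dataset
#
#     # Hypothetical annotations file with one row per annotated instruction,
#     # including at least "dataset" and "instruction" columns.
#     annotations = pd.read_json("annotations.json")
#     g = plot_paired_ttests_per_dataset(
#         annotations, is_print_values=True, is_add_alpaca_eval=True
#     )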