in src/alpaca_eval/plotting.py [0:0]
def plot_paired_ttests_pvalues(df):
    """Plot summary statistics of sub-sampled t-test tables against sample count.

    ``_get_ttest_df`` is evaluated once on the full ``df`` to obtain a reference
    index, then again for every ``n_samples`` in ``50, 100, ...`` strictly below
    the number of unique values in ``df["instruction"]``. Each resulting table is
    reduced to three scalars (overall mean, 0.9 quantile of the stacked values,
    overall max) which are drawn as one line per aggregator. A dashed horizontal
    line at 0.05 is added and listed in the legend.

    Args:
        df: DataFrame with at least an ``"instruction"`` column; it is passed
            unchanged to ``_get_ttest_df``.
            NOTE(review): the tables are presumably pairwise p-values (the y-axis
            is labelled "p-value") -- confirm against ``_get_ttest_df``.

    Returns:
        The axes object returned by ``sns.lineplot``. The figure is also shown
        through ``plt.show()`` before returning.
    """
    # Row order of the full-data table, reused for every sub-sampled table.
    reference_order = _get_ttest_df(df).index
    n_instructions = len(df["instruction"].unique())

    subsampled = {}
    for n_samples in range(50, n_instructions, 50):
        subsampled[n_samples] = _get_ttest_df(
            df,
            n_samples=n_samples,
            random_state=123,  # fixed seed, same for every sample size
            sorted_idx=list(reference_order),
        )

    # Column label -> reduction of a whole table to a single scalar.
    aggregators = {
        "mean": lambda table: table.mean(axis=None),
        "90% quantile": lambda table: table.stack().quantile(q=0.9),
        "max": lambda table: table.max(axis=None),
    }
    summary = pd.DataFrame(
        {
            label: {n_samples: reduce_table(table) for n_samples, table in subsampled.items()}
            for label, reduce_table in aggregators.items()
        }
    )

    # Long format: one row per (sample size, aggregator) pair.
    long_summary = summary.melt(ignore_index=False, value_name="p-value", var_name="aggregator")
    long_summary = long_summary.reset_index(names="# samples")

    with plot_config(rc={"lines.linewidth": 4, "axes.grid": False}):
        ax = sns.lineplot(long_summary, x="# samples", y="p-value", hue="aggregator")
        ax.axhline(y=0.05, color="black", linestyle="--", linewidth=2, alpha=0.5)

        # The horizontal line is not part of the hue legend, so extend the
        # existing legend entries with a proxy artist for it.
        line_handles, line_labels = ax.get_legend_handles_labels()
        threshold_handle = Line2D([0], [0], color="black", linestyle="--", label="0.05")
        ax.legend(
            handles=line_handles + [threshold_handle],
            labels=line_labels + ["0.05"],
        )

        plt.show()
        return ax