def plot_paired_ttests

def plot_paired_ttests_pvalues()

in src/alpaca_eval/plotting.py [0:0]
26 lines of code
8 McCabe index (conditional complexity)

def plot_paired_ttests_pvalues(df):
    """Plot aggregated paired t-test values against the number of samples used.

    The t-test table is computed once on the full ``df`` and then again on
    subsamples of 50, 100, 150, ... instructions (strictly fewer than the
    number of unique values in ``df["instruction"]``). Each subsampled table is
    reduced to three scalars -- its mean, its 0.9 quantile and its max -- and
    those are drawn as one line per aggregator, together with a dashed
    horizontal reference line at 0.05.

    Args:
        df: DataFrame with at least an ``"instruction"`` column; it is passed
            as-is to ``_get_ttest_df``.
            NOTE(review): the remaining required columns depend on
            ``_get_ttest_df`` -- confirm against that helper.

    Returns:
        The matplotlib Axes holding the line plot. ``plt.show()`` is called
        before returning.
    """
    full_ttest = _get_ttest_df(df)
    n_instructions = len(df["instruction"].unique())

    # One t-test table per subsample size. The seed is fixed and the index
    # order of the full table is forwarded as `sorted_idx` on every call.
    # presumably each table holds pairwise p-values -- verify in _get_ttest_df
    ttest_by_size = {}
    for size in range(50, n_instructions, 50):
        ttest_by_size[size] = _get_ttest_df(
            df, n_samples=size, random_state=123, sorted_idx=list(full_ttest.index)
        )

    # Scalar summaries computed over every entry of a table.
    reducers = {
        "mean": lambda table: table.mean(axis=None),
        "90% quantile": lambda table: table.stack().quantile(q=0.9),
        "max": lambda table: table.max(axis=None),
    }
    summary = pd.DataFrame(
        {
            label: {size: reduce(table) for size, table in ttest_by_size.items()}
            for label, reduce in reducers.items()
        }
    )

    # Long format: one row per (sample size, aggregator) pair.
    long_form = summary.melt(ignore_index=False, value_name="p-value", var_name="aggregator")
    long_form = long_form.reset_index(names="# samples")

    with plot_config(rc={"lines.linewidth": 4, "axes.grid": False}):
        ax = sns.lineplot(long_form, x="# samples", y="p-value", hue="aggregator")
        ax.axhline(y=0.05, color="black", linestyle="--", linewidth=2, alpha=0.5)

        # The reference line is drawn without a label, so a proxy handle is
        # appended to the entries already generated for the aggregator lines.
        line_handles, line_labels = ax.get_legend_handles_labels()
        threshold_handle = Line2D([0], [0], color="black", linestyle="--", label="0.05")
        ax.legend(
            handles=line_handles + [threshold_handle],
            labels=line_labels + ["0.05"],
        )

    plt.show()
    return ax