in src/alpaca_eval/plotting.py [0:0]
def _get_ttest_df(df, n_samples=None, random_state=123, sorted_idx=None):
    """Build the pairwise relative t-test table across `generator_2` values.

    Parameters
    ----------
    df :
        Long-format frame that provides at least the columns ``instruction``,
        ``generator_2`` and ``preference``.
    n_samples : int, optional
        When given, this many rows (one row per instruction after pivoting)
        are drawn at random before the test is run. ``None`` keeps every row.
    random_state : int
        Seed forwarded to the row sampling; only used when ``n_samples`` is set.
    sorted_idx : list, optional
        Column order (``generator_2`` values) to use for the test. When omitted,
        the generators are ordered by decreasing ``win_rate`` as reported by
        ``metrics.pairwise_to_winrate``.

    Returns
    -------
    The output of ``_pairwise_ttest`` on the reordered, recoded preferences,
    cast to float.
    """
    # One row per instruction, one column per generator, cells hold the preference.
    per_instruction = df.pivot(index="instruction", columns=["generator_2"], values="preference")

    if n_samples is not None:
        per_instruction = per_instruction.sample(n=n_samples, random_state=random_state)

    if sorted_idx is None:
        # The default ordering is derived from the full `df`, not from the
        # (possibly subsampled) pivoted frame.
        win_rates = df.groupby("generator_2")["preference"].apply(
            lambda prefs: metrics.pairwise_to_winrate(prefs)["win_rate"]
        )
        sorted_idx = list(win_rates.sort_values(ascending=False).index)

    # Swap the codes 0 and 1: per the original author's note a draw is stored as
    # 0, and swapping puts the preference codes in order for the test.
    recoded = per_instruction[sorted_idx].replace({0: 1, 1: 0})
    return _pairwise_ttest(recoded).astype(float)