src/alpaca_eval/metrics.py
def describe_head2head(self, predictions: npt.ArrayLike) -> dict[str, float]:
    """Compute the (generalized) win rate of the predictions against the baseline."""
    predictions = self.preprocess_predictions(predictions)
    n_draws = self._idcs_draws(predictions).sum()
    # Recode draws (0) as 1.5 so that, after the shift by -1 below, all values lie
    # in [0, 1] with draws at 0.5; the mean is then directly the win rate.
    predictions = predictions.astype(float).replace({0.0: 1.5})
    is_preference = predictions.apply(validate_alpacaeval_preference, is_allow_nan=False)
    n_not_pair = (~is_preference).sum()
    if n_not_pair > 0:
        logging.info(f"Dropping {n_not_pair} outputs that are not valid preferences.")
    # Keep only valid preferences and map [1, 2] -> [0, 1]:
    # 0 => baseline wins, 1 => model wins, 0.5 => draw.
    predictions = predictions[is_preference] - 1
    n_wins = (predictions > 0.5).sum()
    n_wins_base = (predictions < 0.5).sum()
    n_total = len(predictions)
    return dict(
        win_rate=predictions.mean() * 100,
        standard_error=predictions.sem() * 100,
        n_wins=n_wins,
        n_wins_base=n_wins_base,
        # Note: draws are more frequent for the weighted win rate, because float
        # precision makes values of exactly 1.5 relatively common.
        n_draws=n_draws,
        n_total=n_total,
    )
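
# --- Usage sketch (not part of metrics.py) ----------------------------------
# A minimal, self-contained sketch of the same head-to-head computation on a
# plain pandas Series, assuming the preference encoding used above
# (1 = baseline wins, 2 = model wins, 0 = draw; weighted annotators may return
# fractional values in [1, 2]). The helper `summarize_preferences` and the
# example data are hypothetical, for illustration only; they are not the
# library's API.
import pandas as pd


def summarize_preferences(preferences) -> dict[str, float]:
    """Summarize head-to-head preferences (1 = baseline wins, 2 = model wins, 0 = draw)."""
    s = pd.Series(preferences, dtype=float)
    n_draws = (s == 0.0).sum()
    s = s.replace({0.0: 1.5})          # draws become 1.5, i.e. 0.5 after the shift
    s = s[s.between(1.0, 2.0)] - 1.0   # keep valid preferences, map [1, 2] -> [0, 1]
    return dict(
        win_rate=s.mean() * 100,
        standard_error=s.sem() * 100,
        n_wins=int((s > 0.5).sum()),
        n_wins_base=int((s < 0.5).sum()),
        n_draws=int(n_draws),
        n_total=len(s),
    )


# Example: two model wins, one baseline win, one draw.
# Encoded values after the shift are [1, 1, 0, 0.5], so win_rate = 62.5.
print(summarize_preferences([2, 2, 1, 0]))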