def describe_head2head()

in src/alpaca_eval/metrics.py


    def describe_head2head(self, predictions: npt.ArrayLike) -> dict[str, float]:
        """Compute the generalized win rate of the prediction."""
        predictions = self.preprocess_predictions(predictions)
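        # draws are counted before the remapping below, while they are still in the
        # raw encoding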
        n_draws = self._idcs_draws(predictions).sum()

        # remap draws encoded as 0 to 1.5 so that, after subtracting 1 below, all
        # preferences lie in [0, 1] and a draw contributes 0.5 to the win rate
        predictions = predictions.astype(float).replace({0.0: 1.5})

        is_preference = predictions.apply(validate_alpacaeval_preference, is_allow_nan=False)
        n_not_pair = sum(~is_preference)
        if n_not_pair > 0:
            logging.info(f"Dropping {n_not_pair} outputs that are not valid preferences.")

        # shift to [0, 1]: 0 = baseline wins, 0.5 = draw, 1 = model wins
        predictions = predictions[is_preference] - 1

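        # draws sit exactly at 0.5 after the shift, so the strict inequalities below
        # exclude them from both win counts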
        n_wins = (predictions > 0.5).sum()
        n_wins_base = (predictions < 0.5).sum()
        n_total = len(predictions)

        return dict(
            win_rate=predictions.mean() * 100,
            standard_error=predictions.sem() * 100,
            n_wins=n_wins,
            n_wins_base=n_wins_base,
            # n.b. draws occur more often for weighted win rates because weighted
            # preferences can land exactly on 1.5 due to float precision
            n_draws=n_draws,
            n_total=n_total,
        )
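
For illustration, here is a minimal, self-contained sketch of the same computation on a plain sequence of preferences. `head2head_sketch` is hypothetical: it replaces `preprocess_predictions`, `_idcs_draws`, and `validate_alpacaeval_preference` with inline pandas equivalents, and assumes the encoding used above (1 = baseline wins, 2 = model wins, 0 or 1.5 = draw, fractional values in between = weighted preferences).

    import pandas as pd

    def head2head_sketch(preferences) -> dict:
        # hypothetical stand-in for preprocess_predictions: coerce to a float Series
        # and remap draws encoded as 0 to the equivalent 1.5 encoding
        s = pd.Series(preferences, dtype=float).replace({0.0: 1.5})
        n_draws = int((s == 1.5).sum())
        # inline stand-in for validate_alpacaeval_preference: keep values in [1, 2]
        s = s[s.between(1.0, 2.0)] - 1  # 0 = baseline wins, 0.5 = draw, 1 = model wins
        return dict(
            win_rate=s.mean() * 100,
            standard_error=s.sem() * 100,
            n_wins=int((s > 0.5).sum()),
            n_wins_base=int((s < 0.5).sum()),
            n_draws=n_draws,
            n_total=int(len(s)),
        )

    head2head_sketch([2, 2, 1, 0, 1.7])
    # -> win_rate 64.0, n_wins 3, n_wins_base 1, n_draws 1, n_total 5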