def _preprocess()

in src/alpaca_eval/annotators/pairwise_evaluator.py [0:0]


    def _preprocess(self, to_annotate: utils.AnyData) -> pd.DataFrame:
        """Run the parent preprocessing, then optionally noise labels and mark identical outputs as ties.

        Parameters
        ----------
        to_annotate : utils.AnyData
            Examples to annotate; forwarded as-is to ``super()._preprocess``.

        Returns
        -------
        pd.DataFrame
            The dataframe returned by the parent preprocessing, with the ``self.annotation_key`` column updated.
            When ``self.p_label_flip`` is truthy, an ``"is_noisy_label"`` boolean column is added and the noised
            rows get a random label (1 or 2). Rows where ``output_1 == output_2`` get 1.5, and any remaining 0 is
            replaced by 1.5.
        """
        frame = super()._preprocess(to_annotate)
        label_col = self.annotation_key

        # Step 1: random label noise. Done during preprocessing so that noised rows already carry a label
        # (presumably letting later stages skip annotating them -- confirm against the caller).
        if self.p_label_flip:
            logging.info(f"Adding random noise to the labels p_label_flip={self.p_label_flip}.")
            flip_prob = self.p_label_flip
            # A uniformly random binary label disagrees with the original only half of the time, so flipping
            # with probability p means drawing a random label with probability 2p (e.g. 25% flips => 50% noise).
            # The noise is always binary (1 or 2), even if the annotation itself is a float (e.g. from logprobs).
            noise_prob = flip_prob * 2

            def _sample_noise(row):
                # Seeded on the row's inputs for reproducibility; the "noisy_preference" prefix keeps these
                # seeds ~independent from the ones used by other tasks.
                seed_text = "noisy_preference" + "".join(row[self.random_seed_keys]) + str(self.seed)
                return utils.random_seeded_choice(
                    seed=seed_text,
                    choices=[np.nan, 1, 2],  # NaN means "leave this row's label alone"
                    weights=[1 - noise_prob, flip_prob, flip_prob],
                )

            sampled_labels = frame.apply(_sample_noise, axis=1)
            frame["is_noisy_label"] = sampled_labels.notna()
            # Rows that were not noised keep whatever annotation they already had.
            frame[label_col] = np.where(frame["is_noisy_label"], sampled_labels, frame[label_col])

        # Step 2: identical outputs are a tie, encoded as 1.5 (this also overrides any noise drawn above).
        outputs_match = frame["output_1"] == frame["output_2"]
        frame.loc[outputs_match, label_col] = 1.5
        # Backward compatibility: 0 used to mean "same output", so map it to the current tie value.
        frame[label_col] = frame[label_col].replace({0: 1.5})

        return frame