def _preprocess()

in src/alpaca_eval/annotators/base.py [0:0]


    def _preprocess(self, to_annotate: utils.AnyData) -> pd.DataFrame:
        """Preprocess the examples to annotate.

        Steps, in order:

        1. Convert ``to_annotate`` to a dataframe with ``utils.convert_to_dataframe`` and hand it to
           ``self._add_missing_primary_keys_``.
        2. For every column named in ``self.other_output_keys_to_keep`` or ``self.annotation_key`` that is
           already present, log a warning and reset the column to ``None`` (it is going to be overwritten).
        3. Drop rows that are duplicated on ``self.primary_keys``: only one of them needs to be annotated.
        4. Pick one annotator per row with ``utils.random_seeded_choice``, seeded from the row's
           ``self.random_seed_keys`` values and ``self.seed``, and store it in ``self.annotator_column``.
        5. If ``self.is_avoid_reannotations`` is truthy, pass the result through
           ``self._apply_cached_annotations``.

        Parameters
        ----------
        to_annotate : utils.AnyData
            Examples to annotate; anything ``utils.convert_to_dataframe`` accepts.

        Returns
        -------
        pd.DataFrame
            The deduplicated examples with the annotator column set. An input with no rows yields a
            dataframe with no rows (and an empty annotator column) instead of failing.
        """

        df_to_annotate = utils.convert_to_dataframe(to_annotate)
        self._add_missing_primary_keys_(df_to_annotate)

        # output columns that already exist are blanked: their content will be overwritten
        for c in self.other_output_keys_to_keep + [self.annotation_key]:
            if c in df_to_annotate.columns:
                logging.warning(f"{c} column is already in the dataframe. We will overwrite it.")
                df_to_annotate[c] = None

        # remove duplicates because you only need to annotate one of them
        df_to_annotate = df_to_annotate.drop_duplicates(subset=self.primary_keys)

        # set the annotator for each example.
        # NOTE: we iterate over the rows explicitly rather than using `DataFrame.apply(..., axis=1)`:
        # on a dataframe without rows `apply` returns a copy of the (empty) dataframe instead of a
        # Series, which cannot be assigned to a single column. A list is empty for empty input and is
        # assigned positionally, so the row order (and thus the result) is otherwise unchanged.
        df_to_annotate[self.annotator_column] = [
            utils.random_seeded_choice(
                # we add "annotator" at the beginning to not use the same seed for all tasks
                seed="annotator" + "".join(row[self.random_seed_keys]) + str(self.seed),
                choices=list(self.annotators.keys()),
            )
            for _, row in df_to_annotate.iterrows()
        ]

        if self.is_avoid_reannotations:
            df_to_annotate = self._apply_cached_annotations(df_to_annotate)

        return df_to_annotate