in src/alpaca_eval/annotators/base.py [0:0]
def _preprocess(self, to_annotate: utils.AnyData) -> pd.DataFrame:
    """Preprocess the examples to annotate. In particular takes care of filtering unnecessary examples.

    Parameters
    ----------
    to_annotate : utils.AnyData
        Examples to annotate, in any format accepted by `utils.convert_to_dataframe`.

    Returns
    -------
    df_to_annotate : pd.DataFrame
        Dataframe with duplicates over `self.primary_keys` dropped, any pre-existing output columns
        (`self.other_output_keys_to_keep` and `self.annotation_key`) reset to `None`, and
        `self.annotator_column` holding the annotator selected for each row. When
        `self.is_avoid_reannotations` is truthy, this is whatever `self._apply_cached_annotations`
        returns for that dataframe.
    """
    df_to_annotate = utils.convert_to_dataframe(to_annotate)
    self._add_missing_primary_keys_(df_to_annotate)

    # output columns that already exist are blanked (not dropped) so that stale values are never
    # mistaken for fresh annotations
    for c in self.other_output_keys_to_keep + [self.annotation_key]:
        if c in df_to_annotate.columns:
            logging.warning(f"{c} column is already in the dataframe. We will overwrite it.")
            df_to_annotate[c] = None

    # remove duplicates because you only need to annotate one of them
    df_to_annotate = df_to_annotate.drop_duplicates(subset=self.primary_keys)

    # set the annotator for each example
    # the choices and the seed suffix do not depend on the row, so compute them once
    annotator_names = list(self.annotators.keys())
    seed_suffix = str(self.seed)
    # Build the column from an explicit list rather than `DataFrame.apply(axis=1)`: on a frame with no
    # rows `apply` hands back an empty DataFrame instead of a Series, which cannot be assigned to a
    # single column. A list comprehension simply yields `[]` in that case.
    df_to_annotate[self.annotator_column] = [
        utils.random_seeded_choice(
            # we add "annotator" at the beginning to not use the same seed for all tasks
            seed="annotator" + "".join(row[self.random_seed_keys]) + seed_suffix,
            choices=annotator_names,
        )
        for _, row in df_to_annotate.iterrows()
    ]

    if self.is_avoid_reannotations:
        df_to_annotate = self._apply_cached_annotations(df_to_annotate)

    return df_to_annotate