in trl/trainer/callbacks.py [0:0]
def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
    """
    Generate completions with the current model, judge them against the reference completions and log the win
    rate.

    At every evaluation step, completions are generated for the prompts of `self.eval_dataset` and paired with
    `self.ref_completions` (the reference completions generated at the beginning of training). `self.judge`
    picks a winner for every pair; the share of pairs won by the current model is logged through the trainer
    as `eval_win_rate`. When `self.use_soft_judge` is set, the judge scores are additionally averaged and
    logged as `eval_avg_win_prob`. If `"wandb"` and/or `"comet_ml"` are listed in `args.report_to`, a table of
    the judged completions is logged to those backends as well.

    Args:
        args (`TrainingArguments`):
            Training arguments; `per_device_eval_batch_size` and `report_to` are read here.
        state (`TrainerState`):
            Trainer state, forwarded to `_win_rate_completions_df`.
        control (`TrainerControl`):
            Not used by this method.
        **kwargs:
            Must contain `"processing_class"`, the tokenizer used for generation.
    """
    tokenizer = kwargs["processing_class"]
    # NOTE(review): this mutates the shared tokenizer and is never restored here — confirm that nothing
    # downstream relies on the previous padding side.
    tokenizer.padding_side = "left"
    accelerator = self.trainer.accelerator
    model = self.trainer.model_wrapped

    with accelerator.split_between_processes(self.eval_dataset["prompt"]) as prompts:
        completions = _generate_completions(
            prompts,
            model=model,
            tokenizer=tokenizer,
            accelerator=accelerator,
            generation_config=self.generation_config,
            batch_size=args.per_device_eval_batch_size,
        )

        # Each pair is (reference completion, model completion): index 0 is the reference, index 1 the model.
        completions = list(zip(self.ref_completions, completions))

        if self.use_soft_judge:
            # The scores are treated as the probability that the reference completion (index 0) wins.
            ref_win_probs = self.judge.judge(prompts, completions, self.shuffle_order, return_scores=True)
            winner_indices = [0 if score > 0.5 else 1 for score in ref_win_probs]
            ref_win_probs = gather_object(ref_win_probs)
        else:
            winner_indices = self.judge.judge(prompts, completions, self.shuffle_order)

        # Collect the per-process results so that the logging below sees all of them.
        prompts = gather_object(prompts)
        completions = gather_object(completions)
        winner_indices = gather_object(winner_indices)

    # Logging is done by the main process only.
    if not accelerator.is_main_process:
        return

    # Without any judged pair the averages below would divide by zero; skip logging instead of crashing.
    if not winner_indices:
        import warnings

        warnings.warn(
            "No judged completions are available at this evaluation step; skipping win rate logging.",
            stacklevel=2,
        )
        return

    win_rate = sum(winner_idx == 1 for winner_idx in winner_indices) / len(winner_indices)
    if self.use_soft_judge:
        # `ref_win_probs` has one score per judged pair, so it is non-empty whenever `winner_indices` is.
        avg_win_prob = 1.0 - sum(ref_win_probs) / len(ref_win_probs)
        self.trainer.log({"eval_avg_win_prob": avg_win_prob, "eval_win_rate": win_rate})
    else:
        self.trainer.log({"eval_win_rate": win_rate})

    # Work out which table backends are active before building the (shared) DataFrame.
    log_to_wandb = False
    if "wandb" in args.report_to:
        import wandb

        log_to_wandb = wandb.run is not None
    log_to_comet = "comet_ml" in args.report_to

    if not (log_to_wandb or log_to_comet):
        return

    # Build the table once and reuse it for every active backend.
    df = _win_rate_completions_df(
        state=state,
        prompts=prompts,
        completions=completions,
        winner_indices=winner_indices,
    )
    if log_to_wandb:
        wandb.log({"win_rate_completions": wandb.Table(dataframe=df)})
    if log_to_comet:
        log_table_to_comet_experiment(
            name="win_rate_completions.csv",
            table=df,
        )