in evals/elsuite/error_recovery/eval.py [0:0]
def eval_sample(self, solver: Solver, sample: Sample, rng: random.Random, extra_logging=None):
    """Query the solver under three reasoning conditions and record the results.

    The sample is posed to ``solver`` three times, in this order:

    1. NR - no reasoning provided (the baseline),
    2. CR - correct reasoning provided,
    3. IR - incorrect reasoning provided.

    Each answer is compared for exact equality with ``str(sample.target)``,
    and a single metrics record is emitted through
    ``evals.record.record_metrics``.

    Args:
        solver: Solver that is forwarded to ``self._get_answer``.
        sample: Sample under evaluation; its ``task``, ``target``,
            ``correct_steps``, ``mistake_index`` and
            ``num_ground_truth_steps`` attributes are read here.
        rng: Not used inside this method.
        extra_logging: Not used inside this method.

    Raises:
        AssertionError: If ``len(sample.correct_steps)`` differs from
            ``sample.mistake_index`` (checked after the three solver runs).
    """
    sample_task = sample.task

    def ask(task_state, notif_pos):
        # All three conditions query the solver identically; only the task
        # state and the mistake-notification position vary.
        return self._get_answer(
            solver=solver,
            task_state=task_state,
            sample=sample,
            mistake_notif_pos=notif_pos,
        )

    # --- NR: baseline without any provided reasoning ---
    no_reasoning_state = self._get_no_reasoning_task_state(sample)
    # With no reasoning there is nothing to interleave a notification with,
    # so "end" is the only position that makes sense; it is also dropped
    # when notifications are restricted to the incorrect-reasoning run.
    notify_at_end = self.mistake_notif_pos == "end"
    no_reasoning_pos = (
        "end" if notify_at_end and not self.mistake_notif_ir_only else None
    )
    nr_answer = ask(no_reasoning_state, no_reasoning_pos)

    # --- CR: correct reasoning supplied ---
    correct_state = self._get_correct_reasoning_task_state(sample)
    configured_pos = self.mistake_notif_pos
    correct_pos = None if self.mistake_notif_ir_only else configured_pos
    cr_answer = ask(correct_state, correct_pos)

    # --- IR: incorrect reasoning supplied (always uses the configured position) ---
    incorrect_state = self._get_incorrect_reasoning_task_state(sample)
    ir_answer = ask(incorrect_state, self.mistake_notif_pos)

    # The correct steps are expected to stop exactly where the mistake is.
    assert len(sample.correct_steps) == sample.mistake_index

    expected = str(sample.target)  # ground truth answer
    metrics = {
        "task": sample_task,
        "num_ground_truth_steps": sample.num_ground_truth_steps,
        "mistake_index": sample.mistake_index,
        "target": expected,
        "mistake_notification_position": self.mistake_notif_pos,
        "mistake_notification_for_ir_only": self.mistake_notif_ir_only,
        "NR_sampled": nr_answer,
        "CR_sampled": cr_answer,
        "IR_sampled": ir_answer,
        "NR_correct": nr_answer == expected,
        "CR_correct": cr_answer == expected,
        "IR_correct": ir_answer == expected,
    }
    evals.record.record_metrics(**metrics)