in evals/elsuite/error_recovery/eval.py [0:0]
def run(self, recorder: "evals.record.Recorder"):
    """Evaluate all samples and report correct-answer rates overall and per task.

    Runs every sample through ``eval_all_samples``, then aggregates the
    recorded per-sample metrics into fractions of entries whose
    ``NR_correct`` / ``CR_correct`` / ``IR_correct`` flags are truthy.

    Args:
        recorder: Recorder that collects per-sample metrics; each metric dict
            is expected to carry the three ``*_correct`` flags and a ``task``
            label (assumed from the lookups below — confirm against the solver).

    Returns:
        dict mapping ``"NR_correct_rate"`` / ``"CR_correct_rate"`` /
        ``"IR_correct_rate"`` (overall) and
        ``"task_<task>_<prefix>_correct_rate"`` (per task) to floats in [0, 1].

    Raises:
        ZeroDivisionError: if no metrics were recorded (empty sample set).
    """
    samples = self.get_samples()
    self.eval_all_samples(recorder, samples)
    metrics = recorder.get_metrics()

    def _rates(entries):
        # Fraction of entries with each truthy *_correct flag.
        # `entries` is non-empty by construction (overall list is guarded by
        # the original ZeroDivisionError behavior; per-task lists come from
        # tasks observed in `metrics`).
        total = len(entries)
        return {
            f"{prefix}_correct_rate": sum(1 for m in entries if m[f"{prefix}_correct"]) / total
            for prefix in ("NR", "CR", "IR")
        }

    results = dict(_rates(metrics))

    # Split results per type of task.
    for task in {m["task"] for m in metrics}:
        task_metrics = [m for m in metrics if m["task"] == task]
        # we use hyphens in the task name so they can be extracted by splitting on underscores
        task_string = task.replace("_", "-")
        results.update(
            {f"task_{task_string}_{key}": value for key, value in _rates(task_metrics).items()}
        )
    return results