def run()

in evals/elsuite/error_recovery/eval.py [0:0]


    def run(self, recorder: evals.record.Recorder):
        """Run the eval over all samples and aggregate correctness rates.

        Evaluates every sample, then computes the fraction of metric rows
        whose ``NR_correct`` / ``CR_correct`` / ``IR_correct`` flags are
        truthy — first over all rows, then once more restricted to each
        distinct ``task`` value.

        Args:
            recorder: recorder that collects per-sample metrics during
                ``eval_all_samples`` and exposes them via ``get_metrics()``.

        Returns:
            dict mapping rate names to floats: the three overall
            ``*_correct_rate`` keys plus, for every task,
            ``task_<task>_<cond>_correct_rate`` keys (task name with
            underscores replaced by hyphens).

        Note: raises ZeroDivisionError if ``get_metrics()`` returns no rows
        (same as the previous implementation).
        """
        samples = self.get_samples()

        self.eval_all_samples(recorder, samples)
        metrics = recorder.get_metrics()

        def _correct_rates(rows):
            # Fraction of rows where each condition's *_correct flag is truthy.
            total = len(rows)
            return {
                f"{cond}_correct_rate": sum(1 for r in rows if r[f"{cond}_correct"]) / total
                for cond in ("NR", "CR", "IR")
            }

        results = dict(_correct_rates(metrics))

        # Split results per type of task
        all_tasks = set(i["task"] for i in metrics)
        for task in all_tasks:
            filtered_metrics = [i for i in metrics if i["task"] == task]

            # we use hyphens in the task name so they can be extracted by splitting on underscores
            task_string = task.replace("_", "-")
            results.update(
                {
                    f"task_{task_string}_{name}": rate
                    for name, rate in _correct_rates(filtered_metrics).items()
                }
            )

        return results