def run()

in evals/elsuite/self_prompting/eval.py [0:0]


    def run(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
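        """Run the self-prompting eval in two stages.

        Prompting stage: for every (task, tasker_model) pair, one sample is
        evaluated with stage "prompting", in which the Prompter model being
        evaluated writes a prompt for the task (handled in eval_sample, not
        shown here).
        Tasking stage: each prompting result is fanned out over the task's
        test samples and evaluated with stage "tasking".

        Returns a dict with exact and fuzzy accuracy, improvement over the
        baseline (when available), the prompt rule violation rate, the number
        of recorded samples, and a per-tasker-model accuracy breakdown.
        """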
        samples = self.get_samples()

        # Shuffle and limit samples
        np.random.shuffle(samples)
        samples_by_task = samples[: self.n_tasks]
        assert len(samples_by_task) == self.n_tasks
        for task in samples_by_task:
            np.random.shuffle(task["test_samples"])
            np.random.shuffle(task["train_samples"])
            task["test_samples"] = task["test_samples"][: self.n_samples_per_task]
            task["train_samples"] = task["train_samples"][: self.n_preview_samples]
            assert len(task["test_samples"]) == self.n_samples_per_task
            assert len(task["train_samples"]) == self.n_preview_samples

        # Run prompting
        prompting_samples = []
        for task in samples_by_task:
            for tasker_model in self.tasker_models:
                prompting_samples.append(
                    {
                        "stage": "prompting",
                        "tasker_model": tasker_model,
                        "task": task,
                    }
                )
        assert len(prompting_samples) == len(self.tasker_models) * self.n_tasks
        prompting_results = self.eval_all_samples(recorder, prompting_samples)
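        # Each prompting result keeps the original sample fields ("stage",
        # "tasker_model", "task") plus whatever eval_sample (not shown here) adds,
        # presumably including the prompt written by the Prompter.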

        # Run tasking
        tasking_samples = []  # Store in a flattened list for parallel eval
        for prompt_res in prompting_results:
            prompt_res["stage"] = "tasking"  # Update stage
            for sample in prompt_res["task"]["test_samples"]:
                tasking_samples.append(
                    {
                        **prompt_res,
                        "input": sample["input"],
                        "output": sample["output"],
                    }
                )
        assert len(tasking_samples) == len(prompting_results) * self.n_samples_per_task
        self.eval_all_samples(recorder, tasking_samples)
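        # The return value of the tasking pass is not used directly; its
        # per-sample metrics are read back from the recorder below.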

        # The score of a Prompter is the average score of all Tasker models it writes prompts for
        metrics = recorder.get_metrics()
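        # Each metric entry is expected to carry "exact", "fuzzy",
        # "prompt_rule_violation" and "tasker_model" fields (see the keys used below).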

        # Primary metrics
        result = {
            "accuracy": np.mean([metric["exact"] for metric in metrics]),
            "accuracy_fuzzy": np.mean([metric["fuzzy"] for metric in metrics]),
        }
        # Relative improvement against baseline
        improvement_scores = self._calculate_improvement_wrt_baseline(result)
        if improvement_scores:
            result.update(improvement_scores)

        # Peripheral metrics
        result.update(
            {
                "prompt_rule_violation_rate": np.mean(
                    [int(metric["prompt_rule_violation"]) for metric in metrics]
                ),
                "n_samples": len(metrics),
            }
        )

        # Breakdown by tasker model
        def compute_mean_tasker(key, tasker_model):
            return np.mean(
                [metric[key] for metric in metrics if metric["tasker_model"] == tasker_model]
            )

        for tasker in self.tasker_models:
            result.update(
                {
                    f"accuracy_{tasker}": compute_mean_tasker("exact", tasker),
                    f"accuracy_fuzzy_{tasker}": compute_mean_tasker("fuzzy", tasker),
                }
            )

        return result
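
For illustration, here is a minimal, runnable sketch of the aggregation performed at the end of run(). The metrics list is hypothetical: its keys are inferred from the code above, while the tasker model names ("tasker-a", "tasker-b") and the values are made up.

    import numpy as np

    # Hypothetical per-sample metrics, shaped like the entries run() reads from
    # recorder.get_metrics(); the values below are illustrative only.
    metrics = [
        {"exact": 1.0, "fuzzy": 1.0, "prompt_rule_violation": False, "tasker_model": "tasker-a"},
        {"exact": 0.0, "fuzzy": 1.0, "prompt_rule_violation": False, "tasker_model": "tasker-a"},
        {"exact": 0.0, "fuzzy": 0.0, "prompt_rule_violation": True, "tasker_model": "tasker-b"},
        {"exact": 1.0, "fuzzy": 1.0, "prompt_rule_violation": False, "tasker_model": "tasker-b"},
    ]

    result = {
        "accuracy": np.mean([m["exact"] for m in metrics]),        # 0.5
        "accuracy_fuzzy": np.mean([m["fuzzy"] for m in metrics]),  # 0.75
        "prompt_rule_violation_rate": np.mean(
            [int(m["prompt_rule_violation"]) for m in metrics]
        ),  # 0.25
        "n_samples": len(metrics),  # 4
    }

    # Per-tasker breakdown, mirroring compute_mean_tasker()
    for tasker in ("tasker-a", "tasker-b"):
        result[f"accuracy_{tasker}"] = np.mean(
            [m["exact"] for m in metrics if m["tasker_model"] == tasker]
        )
        result[f"accuracy_fuzzy_{tasker}"] = np.mean(
            [m["fuzzy"] for m in metrics if m["tasker_model"] == tasker]
        )

    print(result)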