in evals/elsuite/self_prompting/eval.py [0:0]
def run(self, recorder: evals.record.Recorder) -> dict[str, Union[float, int]]:
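    """Run the eval in two stages: first a prompting stage for each
    (task, tasker_model) pair, then a tasking stage over each task's test
    samples, finally aggregating the recorded metrics into summary scores."""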
    samples = self.get_samples()
    # Shuffle and limit samples
    np.random.shuffle(samples)
    samples_by_task = samples[: self.n_tasks]
    assert len(samples_by_task) == self.n_tasks
    for task in samples_by_task:
        np.random.shuffle(task["test_samples"])
        np.random.shuffle(task["train_samples"])
        task["test_samples"] = task["test_samples"][: self.n_samples_per_task]
        task["train_samples"] = task["train_samples"][: self.n_preview_samples]
        assert len(task["test_samples"]) == self.n_samples_per_task
        assert len(task["train_samples"]) == self.n_preview_samples
    # Run prompting
    prompting_samples = []
    for task in samples_by_task:
        for tasker_model in self.tasker_models:
            prompting_samples.append(
                {
                    "stage": "prompting",
                    "tasker_model": tasker_model,
                    "task": task,
                }
            )
    assert len(prompting_samples) == len(self.tasker_models) * self.n_tasks
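    # One prompting sample per (task, tasker_model) pair; the results are reused
    # below to build the tasking-stage samples.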
    prompting_results = self.eval_all_samples(recorder, prompting_samples)
    # Run tasking
    tasking_samples = []  # Store in flattened list for parallel eval
    for prompt_res in prompting_results:
        prompt_res["stage"] = "tasking"  # Update stage
        for sample in prompt_res["task"]["test_samples"]:
            tasking_samples.append(
                {
                    **prompt_res,
                    "input": sample["input"],
                    "output": sample["output"],
                }
            )
    assert len(tasking_samples) == len(prompting_results) * self.n_samples_per_task
    self.eval_all_samples(recorder, tasking_samples)
    # The score of a Prompter is the average score of all Tasker models it writes prompts for
    metrics = recorder.get_metrics()
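    # Each recorded metric dict is expected to carry "exact", "fuzzy",
    # "prompt_rule_violation", and "tasker_model" entries (see the sketch after this method).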
    # Primary metrics
    result = {
        "accuracy": np.mean([metric["exact"] for metric in metrics]),
        "accuracy_fuzzy": np.mean([metric["fuzzy"] for metric in metrics]),
    }
    # Relative improvement against baseline
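    # _calculate_improvement_wrt_baseline may return an empty dict (e.g. when no
    # baseline scores are available), hence the truthiness check below.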
    improvement_scores = self._calculate_improvement_wrt_baseline(result)
    if improvement_scores:
        result.update(improvement_scores)
    # Peripheral metrics
    result.update(
        {
            "prompt_rule_violation_rate": np.mean(
                [int(metric["prompt_rule_violation"]) for metric in metrics]
            ),
            "n_samples": len(metrics),
        }
    )
    # Breakdown by tasker model
    def compute_mean_tasker(key, tasker_model):
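        """Mean of `key` over metrics recorded for the given tasker model."""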
        return np.mean(
            [metric[key] for metric in metrics if metric["tasker_model"] == tasker_model]
        )
    for tasker in self.tasker_models:
        result.update(
            {
                f"accuracy_{tasker}": compute_mean_tasker("exact", tasker),
                f"accuracy_fuzzy_{tasker}": compute_mean_tasker("fuzzy", tasker),
            }
        )
    return result
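
# For reference: the aggregation in run() assumes that the tasking stage records,
# for every test sample, a metrics dict containing the keys used above. A minimal
# sketch of such a recording call follows, assuming the evals.record.record_metrics
# helper; the function name, the exact/fuzzy definitions, and the way the rule
# violation flag is obtained are illustrative assumptions, not this eval's actual
# scoring code.
import evals.record


def record_tasking_metrics_sketch(sample: dict, completion: str) -> None:
    """Hypothetical per-sample recording that run() could aggregate."""
    evals.record.record_metrics(
        tasker_model=sample["tasker_model"],
        # Strict score: completion matches the reference output exactly.
        exact=int(completion.strip() == sample["output"].strip()),
        # Lenient score: reference output appears anywhere in the completion.
        fuzzy=int(sample["output"].strip().lower() in completion.strip().lower()),
        # Flag carried over from the prompting stage (assumed field name).
        prompt_rule_violation=int(sample.get("prompt_rule_violation", False)),
    )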