in project/paperbench/paperbench/metrics.py [0:0]
import numpy as np

# EvaluationRun is assumed to be imported elsewhere in this module; its import is omitted here.


def per_paper_results(eval_runs: list[EvaluationRun], n_runs: int) -> dict[str, dict]:
    """
    Computes the mean and standard error of the replication score for each paper
    over the expected number of runs.
    """
    # Collect every paper id that appears in any of the evaluation runs.
    paper_ids = {
        pe.paper_id for eval_run in eval_runs for pe in eval_run.paper_evaluations.values()
    }

    def _init_result(num_runs: int) -> dict:
        # One placeholder slot per expected run (run_1 ... run_{num_runs}),
        # alongside the summary fields filled in later.
        seeds = {f"run_{i}": None for i in range(1, num_runs + 1)}
        return {
            "mean": None,
            "std_err": None,
            "n_runs": None,
            **seeds,
        }

    results = {paper_id: _init_result(n_runs) for paper_id in paper_ids}

    # First, fill in the score for each run that is actually available.
    for i, eval_run in enumerate(eval_runs, start=1):
        for paper_eval in eval_run.paper_evaluations.values():
            paper_id = paper_eval.paper_id
            score = paper_eval.graded_task_node.score
            results[paper_id][f"run_{i}"] = score

    # Then compute the mean and standard error over the available runs only.
    for paper_id, paper_results in results.items():
        # The summary fields are still None at this point, so this keeps only run scores.
        avail_scores = [score for score in paper_results.values() if score is not None]
        results[paper_id]["n_runs"] = len(avail_scores)
        results[paper_id]["mean"] = (
            np.mean(avail_scores).item() if avail_scores else float("nan")
        )
        # The sample standard deviation (ddof=1) needs at least two scores.
        results[paper_id]["std_err"] = (
            (np.std(avail_scores, ddof=1) / np.sqrt(len(avail_scores))).item()
            if len(avail_scores) > 1
            else float("nan")
        )

    return results
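
For context, a minimal sketch of how this helper might be exercised. The _GradedNode, _PaperEval, and _EvalRun dataclasses below are hypothetical stand-ins, not the real paperbench types; they expose only the attributes per_paper_results actually reads.

from dataclasses import dataclass


@dataclass
class _GradedNode:
    score: float


@dataclass
class _PaperEval:
    paper_id: str
    graded_task_node: _GradedNode


@dataclass
class _EvalRun:
    paper_evaluations: dict[str, _PaperEval]


# Two completed runs for "paper-a", out of three expected runs.
runs = [
    _EvalRun({"paper-a": _PaperEval("paper-a", _GradedNode(0.4))}),
    _EvalRun({"paper-a": _PaperEval("paper-a", _GradedNode(0.6))}),
]

summary = per_paper_results(runs, n_runs=3)
# summary["paper-a"] -> run_1=0.4, run_2=0.6, run_3=None, n_runs=2,
# mean=0.5, std_err = std([0.4, 0.6], ddof=1) / sqrt(2) ≈ 0.1
print(summary["paper-a"])

Here std_err is the standard error of the mean (sample standard deviation with ddof=1 divided by the square root of the number of available runs), so it is only defined once at least two runs have completed; with fewer, the function reports NaN.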