def per_paper_results()

in project/paperbench/paperbench/metrics.py


import numpy as np

# EvaluationRun is defined/imported at module level in paperbench (not shown in this excerpt)


def per_paper_results(eval_runs: list[EvaluationRun], n_runs: int) -> dict[str, dict]:
    """
    Computes the mean and standard error of the replication score for each paper
    over the expected number of runs.
    """
    paper_ids = {
        pe.paper_id for eval_run in eval_runs for pe in eval_run.paper_evaluations.values()
    }

    def _init_result(num_runs: int) -> dict:
        # one placeholder slot per expected run, keyed "run_1" .. "run_{num_runs}"
        seeds = {f"run_{i}": None for i in range(1, num_runs + 1)}
        results = {
            "mean": None,
            "std_err": None,
            "n_runs": None,
            **seeds,
        }
        return results

    results = {paper_id: _init_result(n_runs) for paper_id in paper_ids}

    # first, fill in the scores for each seed that's available
    for i, eval_run in enumerate(eval_runs, start=1):
        for paper_eval in eval_run.paper_evaluations.values():
            paper_id = paper_eval.paper_id
            score = paper_eval.graded_task_node.score
            seed = f"run_{i}"
            results[paper_id][seed] = score

    # then compute the mean/stderr over the available seeds
    for paper_id, paper_results in results.items():
        avail_scores = [score for score in paper_results.values() if score is not None]
        results[paper_id]["mean"] = np.mean(avail_scores).item() if avail_scores else float("nan")
        results[paper_id]["n_runs"] = len(avail_scores)
        # sample standard error s / sqrt(n); nan when fewer than two scores are available
        results[paper_id]["std_err"] = (
            (np.std(avail_scores, ddof=1) / np.sqrt(len(avail_scores))).item()
            if len(avail_scores) > 1
            else float("nan")
        )

    return results
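
A minimal usage sketch, assuming the function above and numpy are importable. The Fake* dataclasses are hypothetical, duck-typed stand-ins for paperbench's real EvaluationRun and paper-evaluation objects; only the attribute names paper_evaluations, paper_id, and graded_task_node.score come from the code above, and the paper id "some-paper" is made up.

from dataclasses import dataclass


@dataclass
class FakeGradedNode:
    score: float


@dataclass
class FakePaperEvaluation:
    paper_id: str
    graded_task_node: FakeGradedNode


@dataclass
class FakeEvaluationRun:
    paper_evaluations: dict[str, FakePaperEvaluation]


# two evaluation runs for the same (hypothetical) paper, with a third run expected but missing
runs = [
    FakeEvaluationRun({"some-paper": FakePaperEvaluation("some-paper", FakeGradedNode(0.4))}),
    FakeEvaluationRun({"some-paper": FakePaperEvaluation("some-paper", FakeGradedNode(0.6))}),
]

results = per_paper_results(runs, n_runs=3)
print(results["some-paper"])
# -> {"mean": 0.5, "std_err": ~0.1, "n_runs": 2,
#     "run_1": 0.4, "run_2": 0.6, "run_3": None}

The missing run_3 stays None and is excluded from the aggregates, so n_runs reports the available runs (2), and std_err is the sample standard deviation divided by sqrt(n) over those two scores.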