def gather_eval_runs()

in project/paperbench/paperbench/nano/utils.py


def gather_eval_runs(results: list["PaperBenchResult"], n_runs: int) -> list[EvaluationRun]:
    """
    Gathers succesfully graded results of nano/eval into a list of n_runs EvaluationRuns
    where a single EvaluationRun does not contain more than one evaluation of the same paper.
    """
    # One EvaluationRun per seed; repeated evaluations of a paper are dealt
    # out to successive seeds round-robin.
    seed_to_eval_run = {
        seed: EvaluationRun(seed=seed, paper_evaluations={}) for seed in range(n_runs)
    }
    # Next seed to receive an evaluation of each paper.
    paper_to_cur_seed: dict[str, int] = {}

    for result in results:
        # Skip results that were not successfully graded.
        if result.judge_output is None or result.judge_output.graded_task_tree is None:
            continue
        paper_id = result.paper_id
        # Assign this evaluation to the paper's next seed; assumes each paper
        # appears at most n_runs times in results, else this raises KeyError.
        seed = paper_to_cur_seed.setdefault(paper_id, 0)
        paper_to_cur_seed[paper_id] += 1
        paper_eval = PaperEvaluation(
            paper_id=paper_id,
            graded_task_node=result.judge_output.graded_task_tree,
            paper_run_id=result.run_id,
        )
        seed_to_eval_run[seed].paper_evaluations[paper_id] = paper_eval

    return list(seed_to_eval_run.values())
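
A minimal sketch of the round-robin behaviour, assuming it is pasted into one file below the function. JudgeOutput, PaperBenchResult, PaperEvaluation, and EvaluationRun here are hypothetical stand-ins carrying only the fields gather_eval_runs touches; the real paperbench types are not shown in this excerpt.

from dataclasses import dataclass, field


@dataclass
class JudgeOutput:
    graded_task_tree: object  # stand-in for the real graded task tree


@dataclass
class PaperBenchResult:
    paper_id: str
    run_id: str
    judge_output: JudgeOutput | None


@dataclass
class PaperEvaluation:
    paper_id: str
    graded_task_node: object
    paper_run_id: str


@dataclass
class EvaluationRun:
    seed: int
    paper_evaluations: dict = field(default_factory=dict)


results = [
    PaperBenchResult("paper-a", "run-1", JudgeOutput("tree-a1")),
    PaperBenchResult("paper-a", "run-2", JudgeOutput("tree-a2")),
    PaperBenchResult("paper-b", "run-3", JudgeOutput("tree-b1")),
    PaperBenchResult("paper-b", "run-4", None),  # ungraded, so skipped
]

runs = gather_eval_runs(results, n_runs=2)
# The first evaluation of each paper lands on seed 0, the second on seed 1.
assert runs[0].paper_evaluations["paper-a"].paper_run_id == "run-1"
assert runs[1].paper_evaluations["paper-a"].paper_run_id == "run-2"
assert runs[0].paper_evaluations["paper-b"].paper_run_id == "run-3"
assert "paper-b" not in runs[1].paper_evaluations

Dealing duplicates out to successive seeds keeps each EvaluationRun free of repeated papers, which presumably lets downstream aggregation treat every run as one independent pass over the paper set.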