def _create_clean_results()

in project/nanoeval/nanoeval/evaluation.py


def _create_clean_results(results: list[tuple[Task, Any]]) -> list[tuple[Task, Any]]:
    """
    Clean results are defined as the LAST retry_idx for each task. These results are what get
    used to compute metrics in the final summary.
    """

    # Pick the result with the highest retry_idx for each (question_id, attempt_id) pair
    clean_results: list[tuple[Task, Any]] = []
    for task, result in results:
        found = False
        for i, (clean_task, _) in enumerate(clean_results):
            if (clean_task.question_id, clean_task.attempt_id) == (
                task.question_id,
                task.attempt_id,
            ):
                found = True
                if task.retry_idx > clean_task.retry_idx:
                    clean_results[i] = (task, result)
                break
        if not found:
            clean_results.append((task, result))

    # Sanity check the deduplication function.
    seen_tasks: set[tuple[str, int]] = set()
    for task, _ in clean_results:
        task_id = (task.question_id, task.attempt_id)
        assert task_id not in seen_tasks, (
            f"Duplicate task found: {task_id}. This is a bug in nanoeval."
        )
        seen_tasks.add(task_id)

    return clean_results
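
For illustration, a minimal usage sketch. The Task below is a hypothetical stand-in that
models only the fields this function reads (question_id, attempt_id, retry_idx); the real
nanoeval Task carries more state.

from dataclasses import dataclass
from typing import Any


@dataclass(frozen=True)
class Task:
    # Hypothetical stand-in for nanoeval's Task, for demonstration only.
    question_id: str
    attempt_id: int
    retry_idx: int


results: list[tuple[Task, Any]] = [
    (Task("q1", attempt_id=0, retry_idx=0), "first try"),
    (Task("q1", attempt_id=0, retry_idx=1), "retry"),  # supersedes retry_idx=0
    (Task("q2", attempt_id=0, retry_idx=0), "only try"),
]

clean = _create_clean_results(results)
# Only the highest retry_idx per (question_id, attempt_id) survives:
# [(Task("q1", 0, 1), "retry"), (Task("q2", 0, 0), "only try")]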
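
Design note: the inner scan over clean_results makes this O(n^2) in the number of
results, which is fine for typical eval sizes. An equivalent single-pass version can key
a dict on (question_id, attempt_id). This is a sketch under that assumption, not the
code nanoeval ships (_create_clean_results_fast is a hypothetical name):

def _create_clean_results_fast(results: list[tuple[Task, Any]]) -> list[tuple[Task, Any]]:
    # dicts preserve insertion order, so first-seen task order is kept,
    # matching the behavior of the quadratic version above.
    best: dict[tuple[str, int], tuple[Task, Any]] = {}
    for task, result in results:
        key = (task.question_id, task.attempt_id)
        if key not in best or task.retry_idx > best[key][0].retry_idx:
            best[key] = (task, result)
    return list(best.values())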