in project/nanoeval/nanoeval/evaluation.py
def _create_clean_results(results: list[tuple[Task, Any]]) -> list[tuple[Task, Any]]:
    """
    Clean results are defined as the LAST retry_idx for each task. These results are what get
    used to compute metrics in the final summary.
    """
    # Pick the latest retry idx for each task.
    clean_results: list[tuple[Task, Any]] = []
    for task, result in results:
        found = False
        for i, (clean_task, _) in enumerate(clean_results):
            if (clean_task.question_id, clean_task.attempt_id) == (
                task.question_id,
                task.attempt_id,
            ):
                found = True
                # Keep whichever result has the higher retry_idx.
                if task.retry_idx > clean_task.retry_idx:
                    clean_results[i] = (task, result)
                break
        if not found:
            clean_results.append((task, result))

    # Sanity check the deduplication: each (question_id, attempt_id) must appear exactly once.
    seen_tasks: set[tuple[str, int]] = set()
    for task, _ in clean_results:
        task_id = (task.question_id, task.attempt_id)
        assert task_id not in seen_tasks, (
            f"Duplicate task found: {task_id}. This is a bug in nanoeval."
        )
        seen_tasks.add(task_id)
    return clean_results
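To illustrate the deduplication behavior, here is a minimal sketch. `FakeTask` is a hypothetical stand-in for nanoeval's Task (the real class has more fields and may be constructed differently), and the import path is assumed from the file location shown above; this is an illustration of the expected behavior, not the library's own test code.

# Minimal sketch, assuming _create_clean_results is importable from
# nanoeval.evaluation and duck-typing is enough for the Task argument.
from dataclasses import dataclass
from typing import Any

from nanoeval.evaluation import _create_clean_results  # assumed import path


@dataclass
class FakeTask:
    # Hypothetical stand-in exposing only the attributes the function reads.
    question_id: str
    attempt_id: int
    retry_idx: int


results: list[tuple[FakeTask, Any]] = [
    (FakeTask("q1", attempt_id=0, retry_idx=0), "first try"),
    (FakeTask("q1", attempt_id=0, retry_idx=1), "retry"),     # same task, later retry
    (FakeTask("q2", attempt_id=0, retry_idx=0), "only try"),
]

clean = _create_clean_results(results)
# Only the highest retry_idx per (question_id, attempt_id) survives:
assert [r for _, r in clean] == ["retry", "only try"]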