def _compute_metrics_plus_outcome_aggregations()

in project/nanoeval/nanoeval/metrics/agents.py


from typing import Any

import pandas as pd

# compute_default_metrics is defined or imported elsewhere in this module.


def _compute_metrics_plus_outcome_aggregations(samples_df: pd.DataFrame) -> dict[str, Any]:
    """
    Compute standard metrics plus outcome aggregations for a DataFrame of samples.

    Expects one row per attempt, with at least the columns: instance, attempt,
    correct, system_error, and error.
    """

    # Assign an answer_group_id to each attempt: cumcount() numbers the rows
    # within each "instance" group, so repeated attempts at the same instance
    # get ids 0, 1, 2, ...
    samples_df["answer_group_id"] = samples_df.groupby("instance").cumcount()
    # Copy the column slice before adding to it, so we don't mutate a view of
    # samples_df (avoids pandas' SettingWithCopyWarning).
    answer_group_correctness_df = samples_df[["instance", "attempt", "answer_group_id"]].copy()
    answer_group_correctness_df["is_correct"] = samples_df["correct"]

    metrics: dict[str, Any] = {
        **compute_default_metrics(samples_df, answer_group_correctness_df),
        "aggregations": {
            "num_tasks": len(samples_df["instance"].unique()),
            "num_attempts": len(samples_df),
            "num_correct": int(samples_df["correct"].sum()),
            "num_incorrect": int((~samples_df["system_error"] & ~samples_df["correct"]).sum()),
            "num_system_error": int(samples_df["system_error"].sum()),
            "error_breakdown": samples_df[samples_df["error"].notnull()]["error"]
            .value_counts()
            .to_dict(),
        },
    }

    # Validations: the three outcome counts must partition the attempts, and
    # every system error must be accounted for in the error breakdown.
    assert (
        metrics["aggregations"]["num_correct"]
        + metrics["aggregations"]["num_incorrect"]
        + metrics["aggregations"]["num_system_error"]
        == len(samples_df)
    ), f"{metrics=}, {samples_df=}"
    assert sum(metrics["aggregations"]["error_breakdown"].values()) == metrics["aggregations"][
        "num_system_error"
    ], f"{metrics=}"

    return metrics
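
Below is a minimal, self-contained sketch of how the answer_group_id step behaves
on a toy DataFrame. The column values are invented for illustration, and the
final call is left commented out because it additionally needs
compute_default_metrics from this module in scope.

import pandas as pd

# Toy samples_df with the columns the helper expects; values are made up.
toy_df = pd.DataFrame(
    {
        "instance": ["task_a", "task_a", "task_b"],
        "attempt": [0, 1, 0],
        "correct": [True, False, False],
        "system_error": [False, False, True],
        "error": [None, None, "sandbox timeout"],
    }
)

# cumcount() numbers attempts within each instance, so answer_group_id becomes
# [0, 1, 0] for the three rows above.
toy_df["answer_group_id"] = toy_df.groupby("instance").cumcount()
print(toy_df[["instance", "attempt", "answer_group_id"]])

# With compute_default_metrics available, the aggregations for this frame would
# be num_tasks=2, num_attempts=3, num_correct=1, num_incorrect=1,
# num_system_error=1, error_breakdown={"sandbox timeout": 1}.
# metrics = _compute_metrics_plus_outcome_aggregations(toy_df)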