in project/nanoeval/nanoeval/metrics/agents.py [0:0]
from typing import Any

import pandas as pd

# compute_default_metrics is used below but not defined in this excerpt; it is
# provided elsewhere in nanoeval.


def _compute_metrics_plus_outcome_aggregations(samples_df: pd.DataFrame) -> dict[str, Any]:
    """
    Compute standard metrics and aggregations for a given DataFrame of samples.
    """
    # Assign an answer_group_id to each sample: the running count of samples
    # seen so far for its instance.
    samples_df["answer_group_id"] = samples_df.groupby("instance").cumcount()
    # Build a per-answer-group correctness table. Copy the column slice so the
    # assignment below does not trigger a SettingWithCopyWarning on a view.
    answer_group_correctness_df = samples_df[["instance", "attempt", "answer_group_id"]].copy()
    answer_group_correctness_df["is_correct"] = samples_df["correct"]
    metrics: dict[str, Any] = {
        **compute_default_metrics(samples_df, answer_group_correctness_df),
        # Outcome aggregations: each attempt is counted as exactly one of
        # correct, incorrect (finished but wrong), or system_error.
        "aggregations": {
            "num_tasks": len(samples_df["instance"].unique()),
            "num_attempts": len(samples_df),
            "num_correct": int(samples_df["correct"].sum()),
            "num_incorrect": int((~samples_df["system_error"] & ~samples_df["correct"]).sum()),
            "num_system_error": int(samples_df["system_error"].sum()),
            "error_breakdown": samples_df[samples_df["error"].notnull()]["error"]
            .value_counts()
            .to_dict(),
        },
    }
    # Validations: the outcome counts must partition all attempts, and the
    # error breakdown must account for every system error.
    assert metrics["aggregations"]["num_correct"] + metrics["aggregations"][
        "num_incorrect"
    ] + metrics["aggregations"]["num_system_error"] == len(samples_df), f"{metrics=}, {samples_df=}"
    assert (
        sum(metrics["aggregations"]["error_breakdown"].values())
        == metrics["aggregations"]["num_system_error"]
    ), f"{metrics=}"
    return metrics
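

# --- Illustrative usage (not part of agents.py) ---
# A minimal, self-contained sketch of the DataFrame shape this function expects
# and of the "aggregations" block it produces. The column names (instance,
# attempt, correct, system_error, error) come from the function body above; the
# task names and the "SandboxTimeout" error label are hypothetical, and
# compute_default_metrics is omitted here because it lives elsewhere in nanoeval.
if __name__ == "__main__":
    example_df = pd.DataFrame(
        {
            "instance": ["task_a", "task_a", "task_b"],
            "attempt": [0, 1, 0],
            "correct": [True, False, False],
            "system_error": [False, False, True],
            "error": [None, None, "SandboxTimeout"],
        }
    )
    aggregations = {
        "num_tasks": len(example_df["instance"].unique()),
        "num_attempts": len(example_df),
        "num_correct": int(example_df["correct"].sum()),
        "num_incorrect": int((~example_df["system_error"] & ~example_df["correct"]).sum()),
        "num_system_error": int(example_df["system_error"].sum()),
        "error_breakdown": example_df[example_df["error"].notnull()]["error"]
        .value_counts()
        .to_dict(),
    }
    # Expected: 2 tasks, 3 attempts, 1 correct, 1 incorrect, 1 system error,
    # and {"SandboxTimeout": 1} as the error breakdown.
    print(aggregations)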