in project/paperbench/experiments/judge_eval/judge_eval_perf_tables.py [0:0]
import pandas as pd


def make_metrics_tables(model_results):
    """Write one CSV of judge metrics (overall and per category) for each model."""
    for model, stats in model_results.items():
        overall_metrics = stats["aggregate_metrics"]

        # Overall accuracy / precision / recall / F1 for this model.
        metrics = {}
        metrics["Overall"] = {
            "accuracy": overall_metrics.get("accuracy"),
            "precision": overall_metrics.get("precision"),
            "recall": overall_metrics.get("recall"),
            "f1": overall_metrics.get("f1"),
        }

        # The same metrics, stratified by task category.
        stratified = overall_metrics.get("stratified", {})
        for category in ["Code Development", "Code Execution", "Result Analysis"]:
            category_metrics = stratified.get(category, {})
            metrics[category] = {
                "accuracy": category_metrics.get("accuracy"),
                "precision": category_metrics.get("precision"),
                "recall": category_metrics.get("recall"),
                "f1": category_metrics.get("f1"),
            }

        # One row per category, one column per metric.
        df = pd.DataFrame.from_dict(metrics, orient="index")
        output_path = f"experiments/judge_eval/tables/metrics_table_{model}.csv"
        df.to_csv(output_path, index=True)
        print(f"Saved metrics table for model {model} to '{output_path}'")