in project/paperbench/experiments/judge_eval/judge_eval_perf_cost.py [0:0]
def create_metrics_table(model_results):
    """
    Creates a DataFrame with performance and cost metrics for each model
    and saves it to CSV.
    """
    data = []
    # Use the same model order as in the plot
    for model in MODELS_SORTED:
        stats = model_results[model]
        model_name = stats["model_name"]
        reasoning_effort = stats["reasoning_effort"]
        f1_score = stats["aggregate_metrics"]["f1"]
        # Estimate the average USD cost per paper from mean token usage
        cost = compute_cost(stats["aggregate_token_usage"]["mean"], MODEL_COST_PER_TOKEN)
        # Use the same model name mapping as in the plot
        display_name = MODEL_NAME_TO_LABEL[model_name]
        data.append(
            {
                "Model": display_name,
                "Reasoning Effort": reasoning_effort if reasoning_effort else "default",
                "F1 Score": f1_score,
                "Avg Cost Per Paper (USD)": cost,
            }
        )
    df = pd.DataFrame(data)
    # No need to sort since rows are already in MODELS_SORTED order
    # Save to CSV
    output_path = "experiments/judge_eval/tables/perf_cost_table.csv"
    df.to_csv(output_path, index=False, float_format="%.4f")
    print(f"Saved metrics table to '{output_path}'")
    return df
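
# --- Usage sketch (illustrative, not from the original file) ---
# create_metrics_table() relies on module-level names assumed to be defined
# elsewhere in judge_eval_perf_cost.py: pd (pandas), MODELS_SORTED,
# MODEL_NAME_TO_LABEL, MODEL_COST_PER_TOKEN, and compute_cost. A hypothetical
# call site, with load_model_results() standing in for however the per-model
# results are actually built, might look like:
#
#     model_results = load_model_results()  # hypothetical loader
#     metrics_df = create_metrics_table(model_results)
#     print(metrics_df.to_string(index=False))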