# project/paperbench/experiments/judge_eval/judge_eval_perf_tables.py
import os

import pandas as pd

def make_performance_table(model_results, random_baseline_results):
    """Build a CSV of judge F1 scores, overall and stratified by task category."""
    model_to_overall = {
        model: stats["aggregate_metrics"]["f1"] for model, stats in model_results.items()
    }
    model_to_code_dev = {
        model: stats["aggregate_metrics"]["stratified"]["Code Development"]["f1"]
        for model, stats in model_results.items()
    }
    model_to_code_exec = {
        model: stats["aggregate_metrics"]["stratified"]["Code Execution"]["f1"]
        for model, stats in model_results.items()
    }
    model_to_res_analysis = {
        model: stats["aggregate_metrics"]["stratified"]["Result Analysis"]["f1"]
        for model, stats in model_results.items()
    }
    models_sorted = [
        "gpt-4o-mini-2024-07-18",
        "gpt-4o-2024-08-06",
        "o1-mini-2024-09-12_high",
        "o1-2024-12-17_high",
        "o3-mini-2025-01-31_high",
    ]
    # Build the table: one row per model, with columns for overall F1 and the
    # per-category F1 (Code Development, Code Execution, Result Analysis).
    data = []
    for model in models_sorted:
        data.append(
            {
                "Model": model,
                "Overall": model_to_overall.get(model),
                "Code Development": model_to_code_dev.get(model),
                "Code Execution": model_to_code_exec.get(model),
                "Result Analysis": model_to_res_analysis.get(model),
            }
        )
    # Append random baseline row; .get() leaves missing strata as None.
    baseline = {
        "Model": "Random Baseline",
        "Overall": random_baseline_results.get("f1"),
        "Code Development": random_baseline_results.get("stratified", {})
        .get("Code Development", {})
        .get("f1"),
        "Code Execution": random_baseline_results.get("stratified", {})
        .get("Code Execution", {})
        .get("f1"),
        "Result Analysis": random_baseline_results.get("stratified", {})
        .get("Result Analysis", {})
        .get("f1"),
    }
    data.append(baseline)
    out_path = "experiments/judge_eval/tables/perf_table.csv"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)  # ensure the output dir exists
    df = pd.DataFrame(data)
    df.to_csv(out_path, index=False)
    print(f"Saved performance table to '{out_path}'")