in project/paperbench/experiments/judge_eval/judge_eval_perf_tables.py [0:0]
import json


def read_model_results() -> dict:
    """
    Reads the results.json for each judge-eval run and returns the aggregate
    metrics for each model, keyed by model name (and reasoning effort, if set).
    """
    model_results = {}
    paths = [
        "experiments/judge_eval/judge_eval_results/simple_gpt-4o-2024-08-06_20250323_043512/results.json",
        "experiments/judge_eval/judge_eval_results/simple_gpt-4o-mini-2024-07-18_20250323_041827/results.json",
        "experiments/judge_eval/judge_eval_results/simple_o1-2024-12-17_20250323_003218/results.json",
        "experiments/judge_eval/judge_eval_results/simple_o1-mini-2024-09-12_20250323_014231/results.json",
        "experiments/judge_eval/judge_eval_results/simple_o3-mini-2025-01-31_20250322_232740/results.json",
    ]
    for path in paths:
        with open(path, "r") as f:
            data = json.load(f)
        model_name = data["model_name"]
        reasoning_effort = (
            data.get("judge_kwargs", {}).get("completion_kwargs", {}).get("reasoning_effort", None)
        )
        # Distinguish runs of the same model that differ only in reasoning effort.
        key = f"{model_name}_{reasoning_effort}" if reasoning_effort else model_name
        model_results[key] = {
            "model_name": model_name,
            "reasoning_effort": reasoning_effort,
            "aggregate_metrics": data["aggregate_metrics"],
        }
    return model_results
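

# Minimal usage sketch (an assumption, not part of the original script): this
# presumes the module is run from the repository root so the relative result
# paths above resolve, and simply prints each model's aggregate metrics.
if __name__ == "__main__":
    for key, entry in read_model_results().items():
        print(f"{key}: {entry['aggregate_metrics']}")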