in project/paperbench/experiments/judge_eval/judge_eval_perf_cost.py [0:0]
def plot_perf_cost(
    model_results,
    random_baseline_results,
    output_path="experiments/judge_eval/perf_cost.pdf",
):
    """Scatter-plot judge F1 against cost (log-scaled x axis) and save the figure.

    One point is drawn per entry of ``MODELS_SORTED``. Marker shape and colour
    encode the underlying model name, marker area encodes the reasoning effort.
    A dashed horizontal line marks the random baseline and a star marks the
    expert-human reference point.

    Args:
        model_results: Mapping from a model key to its stats dict. For every
            entry the function reads ``["aggregate_metrics"]["f1"]`` and
            ``["aggregate_token_usage"]["mean"]``; for the keys listed in
            ``MODELS_SORTED`` it additionally reads ``"model_name"`` and
            ``"reasoning_effort"``.
        random_baseline_results: Mapping whose ``"f1"`` entry gives the height
            of the random-baseline line.
        output_path: Destination of the saved figure (``str`` or path-like).
            Defaults to the location this function has always written to.
            Missing parent directories are created.

    Raises:
        KeyError: If a key from ``MODELS_SORTED`` is absent from
            ``model_results``, or a model name / reasoning effort has no entry
            in the marker, colour, size, or label tables.

    Side effects:
        Updates the global ``plt.rcParams`` font size, writes the figure to
        ``output_path`` and prints a confirmation message.
    """
    # Local import so the block does not depend on the file's import header.
    from pathlib import Path

    model_to_f1 = {
        model: stats["aggregate_metrics"]["f1"] for model, stats in model_results.items()
    }
    # NOTE(review): this is named "per rubric" while the x-axis label below says
    # "Per Paper" — confirm which unit compute_cost() actually yields here.
    model_to_cost_per_rubric = {
        model: compute_cost(stats["aggregate_token_usage"]["mean"], MODEL_COST_PER_TOKEN)
        for model, stats in model_results.items()
    }

    model_name_to_marker = {
        "gpt-4o-mini-2024-07-18": "o",  # Circle
        "gpt-4o-2024-08-06": "s",  # Square
        "o1-mini-2024-09-12": "^",  # Triangle
        "o1-2024-12-17": "D",  # Diamond
        "o3-mini-2025-01-31": "v",  # Triangle down
    }
    model_name_to_color = {
        "gpt-4o-mini-2024-07-18": "blue",
        "gpt-4o-2024-08-06": "green",
        "o1-mini-2024-09-12": "red",
        "o1-2024-12-17": "purple",
        "o3-mini-2025-01-31": "orange",
    }
    # Base marker size per reasoning effort; entries without a reasoning effort
    # (None) are drawn at the same size as "high".
    reasoning_effort_to_marker_size = {
        None: 4,
        "low": 1,
        "medium": 2,
        "high": 4,
    }

    # This changes the process-wide rcParams, so it also affects later figures.
    plt.rcParams.update({"font.size": 7})
    f, ax = plt.subplots(1, 1, figsize=(6.75133 / 1.5, 2.75), dpi=300)

    # Track which model names already have a legend entry.
    legend_models = set()
    for model in MODELS_SORTED:
        model_name = model_results[model]["model_name"]
        reasoning_effort = model_results[model]["reasoning_effort"]

        # Several entries can share one model name (they differ in reasoning
        # effort); only the first one gets a label to avoid duplicated legend rows.
        label = MODEL_NAME_TO_LABEL[model_name] if model_name not in legend_models else None

        ax.scatter(
            model_to_cost_per_rubric[model],
            model_to_f1[model],
            marker=model_name_to_marker[model_name],
            color=model_name_to_color[model_name],
            label=label,
            s=reasoning_effort_to_marker_size[reasoning_effort] ** 2 * 2,
            linewidth=0.2,
            edgecolor="black",
        )
        legend_models.add(model_name)

    ax.axhline(
        random_baseline_results["f1"],
        color="red",
        label="random baseline",
        linewidth=0.5,
        linestyle="--",
    )

    # Expert-human reference point at F1 = 1, drawn at the largest marker size.
    # NOTE(review): the cost 12 * 100 is presumably hours times an hourly rate
    # in USD — confirm against the accompanying write-up.
    ax.scatter(12 * 100, 1, marker="*", color="black", label="expert human", s=4**2 * 2)

    ax.set_xlabel("Average SimpleJudge Cost Per Paper [USD]")
    ax.set_ylabel("Performance on JudgeEval [F1]")
    ax.set_xscale("log")

    handles, labels = ax.get_legend_handles_labels()
    # Errorbar containers are replaced by their first element for the legend.
    # Nothing above draws errorbars, so this currently leaves the handles as-is.
    handles = [h[0] if isinstance(h, container.ErrorbarContainer) else h for h in handles]
    ax.legend(handles, labels, loc="upper left")

    ax.grid(axis="y", which="major", linewidth=0.1, alpha=0.5)
    ax.yaxis.set_minor_locator(MultipleLocator(0.05))
    ax.grid(axis="y", which="minor", linewidth=0.05, alpha=0.5)

    f.tight_layout()

    # Create the target directory up front so a missing folder does not make
    # savefig fail after all the plotting work is done.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # Save through the figure handle rather than pyplot's implicit current figure.
    f.savefig(output_path, bbox_inches="tight", dpi=400, pad_inches=0.01)
    print(f"Saved plot to '{output_path}'")