def plot_perf_cost()

in project/paperbench/experiments/judge_eval/judge_eval_perf_cost.py


import matplotlib.pyplot as plt
from matplotlib import container
from matplotlib.ticker import MultipleLocator


def plot_perf_cost(model_results, random_baseline_results):
    """Plot JudgeEval performance (F1) against estimated judge cost per paper.

    Relies on MODELS_SORTED, MODEL_NAME_TO_LABEL, MODEL_COST_PER_TOKEN, and
    compute_cost being defined at module level elsewhere in this file.
    """
    model_to_f1 = {
        model: stats["aggregate_metrics"]["f1"] for model, stats in model_results.items()
    }

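    # Estimated USD cost per graded rubric, derived from mean token usage and
    # per-token pricing (compute_cost and MODEL_COST_PER_TOKEN are defined
    # elsewhere in this module).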
    model_to_cost_per_rubric = {
        model: compute_cost(stats["aggregate_token_usage"]["mean"], MODEL_COST_PER_TOKEN)
        for model, stats in model_results.items()
    }

    model_name_to_marker = {
        "gpt-4o-mini-2024-07-18": "o",  # Circle
        "gpt-4o-2024-08-06": "s",  # Square
        "o1-mini-2024-09-12": "^",  # Triangle
        "o1-2024-12-17": "D",  # Diamond
        "o3-mini-2025-01-31": "v",  # triangle down
    }

    model_name_to_color = {
        "gpt-4o-mini-2024-07-18": "blue",
        "gpt-4o-2024-08-06": "green",
        "o1-mini-2024-09-12": "red",
        "o1-2024-12-17": "purple",
        "o3-mini-2025-01-31": "orange",
    }

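    # Marker size encodes reasoning effort; None covers models without a
    # reasoning-effort setting.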
    reasoning_effort_to_marker_size = {
        None: 4,
        "low": 1,
        "medium": 2,
        "high": 4,
    }

    plt.rcParams.update({"font.size": 7})

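    # The figure width is likely the paper's full text width (6.75133 in) scaled down by 1.5.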
    f, ax = plt.subplots(1, 1, figsize=(6.75133 / 1.5, 2.75), dpi=300)

    # Track which models we've already added to legend
    legend_models = set()

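    # One scatter point per (model, reasoning-effort) configuration.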
    for model in MODELS_SORTED:
        model_name = model_results[model]["model_name"]
        reasoning_effort = model_results[model]["reasoning_effort"]

        # Avoid duplicated legend entries
        label = MODEL_NAME_TO_LABEL[model_name] if model_name not in legend_models else None

        ax.scatter(
            model_to_cost_per_rubric[model],
            model_to_f1[model],
            marker=model_name_to_marker[model_name],
            color=model_name_to_color[model_name],
            label=label,
            s=reasoning_effort_to_marker_size[reasoning_effort] ** 2 * 2,
            linewidth=0.2,
            edgecolor="black",
        )

        legend_models.add(model_name)
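
    # Horizontal reference line marking the random-baseline F1.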
    ax.axhline(
        random_baseline_results["f1"],
        color="red",
        label="random baseline",
        linewidth=0.5,
        linestyle="--",
    )
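
    # Expert-human reference point: F1 of 1.0 at an estimated $1,200 per paper
    # (the 12 * 100 factoring suggests 12 hours at $100/hr).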
    ax.scatter(12 * 100, 1, marker="*", color="black", label="expert human", s=4**2 * 2)

    ax.set_xlabel("Average SimpleJudge Cost Per Paper [USD]")
    ax.set_ylabel("Performance on JudgeEval [F1]")
    ax.set_xscale("log")

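    # Unwrap ErrorbarContainer handles so the legend shows plain artists. No
    # errorbars are drawn here, so this is a defensive no-op.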
    handles, labels = ax.get_legend_handles_labels()
    handles = [h[0] if isinstance(h, container.ErrorbarContainer) else h for h in handles]
    ax.legend(handles, labels, loc="upper left")

    ax.grid(axis="y", which="major", linewidth=0.1, alpha=0.5)
    ax.yaxis.set_minor_locator(MultipleLocator(0.05))
    ax.grid(axis="y", which="minor", linewidth=0.05, alpha=0.5)

    f.tight_layout()

    plt.savefig(
        "experiments/judge_eval/perf_cost.pdf", bbox_inches="tight", dpi=400, pad_inches=0.01
    )
    print("Saved plot to 'experiments/judge_eval/perf_cost.pdf'")