def create_depth_score_plot()

in project/paperbench/experiments/judge_max_depth/plot.py [0:0]


def create_depth_score_plot(data: list[dict], output_path: str) -> None:
    """
    Create and save a bar plot showing scores by depth.

    Records are grouped by ``depth``. Each bar shows the group's mean
    ``score`` with the standard error of the mean as its error bar, and its
    tick label reports the depth together with the largest ``num_leaf_nodes``
    seen in that group. A dashed red horizontal line marks
    ``HUMAN_JUDGE_SCORE``.

    Args:
        data: List of dictionaries containing depth, score and num_leaf_nodes
        output_path: Path where to save the output plot

    Note:
        The font size is set through the global ``plt.rcParams`` and therefore
        stays in effect after this function returns. The figure is also
        displayed with ``plt.show()`` after it has been saved.
    """
    # Aggregate per depth; named aggregation yields flat column names directly.
    df = pd.DataFrame(data)
    stats = (
        df.groupby("depth")
        .agg(
            mean=("score", "mean"),
            std=("score", "std"),
            count=("score", "count"),
            max_leaves=("num_leaf_nodes", "max"),
        )
        .reset_index()
    )
    # The sample std is NaN for a depth with a single score, so the standard
    # error is NaN there too; the labelling code below accounts for that.
    stats["stderr"] = stats["std"] / np.sqrt(stats["count"])

    # Create bar plot
    plt.rcParams.update({"font.size": 7})
    plt.figure(figsize=(6.75133 / 1.5, 3.75))
    x = np.arange(len(stats))
    plt.bar(x, stats["mean"], yerr=stats["stderr"], capsize=5)

    # Add horizontal line for human score
    plt.axhline(y=HUMAN_JUDGE_SCORE, color="red", linestyle="--", label="Human Judge")
    plt.legend()

    plt.ylabel("Reproduction Score")
    plt.grid(True, axis="y", linestyle="--", alpha=0.2)
    tick_labels = [
        f"{d} depth / {m} leaves" for d, m in zip(stats["depth"], stats["max_leaves"])
    ]
    plt.xticks(x, tick_labels, rotation=45, ha="right")

    # Add value labels on top of bars (positions come from ``x`` so they always
    # line up with the bars drawn above).
    for pos, mean, stderr in zip(x, stats["mean"], stats["stderr"]):
        if np.isfinite(stderr):
            label_y = mean + stderr + 0.001
            label = f"{mean:.2f}±{stderr:.2f}"
        else:
            # No error bar is available: anchor the label at the bar top and
            # omit the undefined spread instead of printing "±nan" at a NaN
            # coordinate.
            label_y = mean + 0.001
            label = f"{mean:.2f}"
        plt.text(pos, label_y, label, ha="center", va="bottom")

    plt.tight_layout()
    plt.savefig(output_path, bbox_inches="tight", dpi=400, pad_inches=0.01)
    plt.show()
    plt.close()