in project/paperbench/experiments/judge_eval/judge_eval_perf_tables.py [0:0]
import pandas as pd


def make_metrics_tables(model_results):
    """Write one CSV of judge metrics (overall and per category) for each model."""
    for model, stats in model_results.items():
        overall_metrics = stats["aggregate_metrics"]

        # Overall accuracy / precision / recall / F1 for this model.
        metrics = {}
        metrics["Overall"] = {
            "accuracy": overall_metrics.get("accuracy"),
            "precision": overall_metrics.get("precision"),
            "recall": overall_metrics.get("recall"),
            "f1": overall_metrics.get("f1"),
        }

        # The same metrics, stratified by task category.
        stratified = overall_metrics.get("stratified", {})
        for category in ["Code Development", "Code Execution", "Result Analysis"]:
            category_metrics = stratified.get(category, {})
            metrics[category] = {
                "accuracy": category_metrics.get("accuracy"),
                "precision": category_metrics.get("precision"),
                "recall": category_metrics.get("recall"),
                "f1": category_metrics.get("f1"),
            }

        # One row per category, one column per metric.
        df = pd.DataFrame.from_dict(metrics, orient="index")
        output_path = f"experiments/judge_eval/tables/metrics_table_{model}.csv"
        df.to_csv(output_path, index=True)
        print(f"Saved metrics table for model {model} to '{output_path}'")