# results_report() — originally from code/run_eval_prm_trl.py

def results_report(aggregated_results: dict[str, dict[str, int | float]]) -> None:
    """Prints the final results. """
    print("Individual Results:")
    print("-" * 70)
    max_config_length = max(len(config) for config in aggregated_results.keys())

    for config, metrics in aggregated_results.items():
        print(f"{config:<{max_config_length}} -> Precision: {metrics['precision']:.2f}  Recall: {metrics['recall']:.2f}  F1 Score: {metrics['f1_score']:.2f}")

    # Calculate weighted averages
    total_problems = sum(metrics['num_problems'] for metrics in aggregated_results.values())
    weighted_precision = 0
    weighted_recall = 0
    weighted_f1 = 0

    for metrics in aggregated_results.values():
        weight = metrics['num_problems'] / total_problems
        weighted_precision += metrics['precision'] * weight
        weighted_recall += metrics['recall'] * weight
        weighted_f1 += metrics['f1_score'] * weight

    # Print aggregated results
    print("Weighted Averages:")
    print("-" * 70)
    print(f"{'Weighted':<{max_config_length}} -> Precision: {weighted_precision:.2f}  Recall: {weighted_recall:.2f}  F1 Score: {weighted_f1:.2f}")