in code/run_eval_prm_trl.py [0:0]
def results_report(aggregated_results: dict[str, dict[str, int | float]]) -> None:
    """Print per-configuration precision/recall/F1 and their weighted averages."""
    print("Individual Results:")
    print("-" * 70)
    max_config_length = max(len(config) for config in aggregated_results.keys())
    for config, metrics in aggregated_results.items():
        print(f"{config:<{max_config_length}} -> Precision: {metrics['precision']:.2f} Recall: {metrics['recall']:.2f} F1 Score: {metrics['f1_score']:.2f}")

    # Calculate averages weighted by the number of problems in each configuration
    total_problems = sum(metrics['num_problems'] for metrics in aggregated_results.values())
    weighted_precision = 0.0
    weighted_recall = 0.0
    weighted_f1 = 0.0
    for metrics in aggregated_results.values():
        weight = metrics['num_problems'] / total_problems
        weighted_precision += metrics['precision'] * weight
        weighted_recall += metrics['recall'] * weight
        weighted_f1 += metrics['f1_score'] * weight

    # Print aggregated results
    print("Weighted Averages:")
    print("-" * 70)
    print(f"{'Weighted':<{max_config_length}} -> Precision: {weighted_precision:.2f} Recall: {weighted_recall:.2f} F1 Score: {weighted_f1:.2f}")
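For reference, a minimal usage sketch of the expected input shape: a dict mapping a configuration name to its aggregated metrics. The configuration names and metric values below are hypothetical, chosen only to illustrate the call.

# Hypothetical example input; values are illustrative only.
example_results = {
    "config_a": {"precision": 0.81, "recall": 0.74, "f1_score": 0.77, "num_problems": 200},
    "config_b": {"precision": 0.65, "recall": 0.58, "f1_score": 0.61, "num_problems": 500},
}
results_report(example_results)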