in evalbench/scorers/recallmatcher.py
def compute_precision_recall(self, golden_results, generated_results):
"""
Calculates precision and recall for two sets of results, removing duplicates and Nones.
Args:
golden_results: A list of the correct results.
generated_results: A list of the results generated by the model.
Returns:
A dictionary containing the following:
1. precision
2. recall
3. orig_golden_size: no. of expected results before removing duplicates
4. orig_generated_size: no. of generated results before removing duplicates
5. dedup_golden_size: no. of expected results after removing duplicates
6. dedup_generated_size: no. of generated results after removing duplicates
"""
# Filter out None values (assuming they shouldn't be considered)
golden_results = (
[x for x in golden_results if x is not None] if golden_results else []
)
generated_results = (
[x for x in generated_results if x is not None] if generated_results else []
)
orig_golden_size = len(golden_results)
orig_generated_size = len(generated_results)
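# Deduplicate via convert_to_set, a helper from this module that is assumed
# here to map each result row to a hashable form before building the set.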
golden_results_set = convert_to_set(golden_results)
generated_results_set = convert_to_set(generated_results)
dedup_golden_size = len(golden_results_set)
dedup_generated_size = len(generated_results_set)
common_results_set = golden_results_set.intersection(generated_results_set)
correct_predictions = len(common_results_set)
if golden_results == generated_results:
# Identical lists (including the case where both are empty) get a perfect score.
precision = 1.0
recall = 1.0
else:
# Precision: fraction of generated results that appear in the golden results.
precision = (
correct_predictions / len(generated_results) if generated_results else 0
)
# Recall: fraction of golden results that appear in the generated results.
recall = (
correct_predictions / len(golden_results) if golden_results else 0
)
full_result = {
"precision": precision,
"recall": recall,
"orig_golden_size": orig_golden_size,
"orig_generated_size": orig_generated_size,
"dedup_golden_size": dedup_golden_size,
"dedup_generated_size": dedup_generated_size,
}
return full_result
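
# Example usage (a sketch, not part of the original module). The class name and
# import path below are assumptions for illustration only; substitute whatever
# scorer class actually defines compute_precision_recall.
#
# from evalbench.scorers.recallmatcher import RecallMatcher  # hypothetical name
#
# scores = RecallMatcher().compute_precision_recall(
#     golden_results=[("a",), ("b",), ("b",), None],
#     generated_results=[("a",), ("c",)],
# )
# # One of the two generated rows matches a golden row, so precision = 1/2;
# # one of the three non-None golden rows is recovered, so recall = 1/3.
# # Sizes reported: orig_golden_size = 3, dedup_golden_size = 2,
# # orig_generated_size = 2, dedup_generated_size = 2.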