in utils/evaluate_retrieval.py [0:0]
def main(retrieved_passages_pattern: str, temp_files_directory: str, workers: int) -> None:
    """Score retrieved passages and print retrieval metrics.

    Every file matching ``retrieved_passages_pattern`` (globbed relative to
    the current working directory) is read line by line. Each line is passed
    to ``compute_f1_for_retrieved_passage`` in a worker process, and the
    returned records are grouped by question id
    (``"<Conversation-ID>_<Turn-ID>"``). Summary metrics are then computed
    from that grouping and printed to stdout.

    Args:
        retrieved_passages_pattern: Glob pattern selecting the input files.
        temp_files_directory: Directory handed to
            ``compute_extractive_upper_bounds``; created (with parents) if it
            does not exist.
        workers: Number of worker processes in the multiprocessing pool.
    """
    # Materialise and sort the matches: glob order is filesystem dependent,
    # and sorting keeps the per-question document order reproducible.
    retrieved_passages_files = sorted(Path().glob(retrieved_passages_pattern))
    if not retrieved_passages_files:
        # Make the cause visible before the metrics run on an empty mapping.
        logging.warning('No files matched pattern %r', retrieved_passages_pattern)

    temp_files_directory = Path(temp_files_directory)
    temp_files_directory.mkdir(exist_ok=True, parents=True)

    # Fields copied from each worker result into the stored record, in order.
    record_fields = (
        'Conversation-ID',
        'Turn-ID',
        'docid',
        'content',
        'rank',
        'answer',
        'heuristic_answer',
        'f1',
    )

    question_id_to_docs = {}
    # One pool for the whole run; the worker processes are reused across
    # files instead of being respawned for every input file.
    with Pool(workers) as p:
        for retrieved_passages_file in retrieved_passages_files:
            with open(retrieved_passages_file) as infile:
                # imap preserves input order, so results follow line order.
                for i, passage_results in enumerate(
                    p.imap(compute_f1_for_retrieved_passage, infile)
                ):
                    if (i + 1) % 5000 == 0:
                        logging.info(
                            'Processing %s, %d lines done...',
                            retrieved_passages_file.name,
                            i + 1,
                        )
                    qid = f"{passage_results['Conversation-ID']}_{passage_results['Turn-ID']}"
                    question_id_to_docs.setdefault(qid, []).append(
                        {field: passage_results[field] for field in record_fields}
                    )

    print('Final metrics:')
    unique_relevant_docs = get_unique_relevant_docs_count(question_id_to_docs, RELEVANCE_THRESHOLD)
    unique_docs_perfect_f1 = get_unique_relevant_docs_count(question_id_to_docs, 1.0)
    # NOTE(review): this average uses a threshold of 1.0 rather than
    # RELEVANCE_THRESHOLD, although it is printed as "relevant docs" --
    # confirm this is intended.
    avg_relevant_docs_per_question = get_average_relevant_docs_per_question(
        question_id_to_docs, 1.0
    )
    print(f'Total number of unique queries: {len(question_id_to_docs)}')
    print(f'Total number of unique relevant docs: {unique_relevant_docs}')
    print(f'Total number of unique docs with F1=1.0: {unique_docs_perfect_f1}')
    print(f'Average number of relevant docs per query: {avg_relevant_docs_per_question}')

    mrr = compute_mean_reciprocal_rank(question_id_to_docs, RELEVANCE_THRESHOLD)
    recall_at_10 = compute_recall_at_k(question_id_to_docs, 10, RELEVANCE_THRESHOLD)
    recall_at_100 = compute_recall_at_k(question_id_to_docs, 100, RELEVANCE_THRESHOLD)
    print(f'Mean Reciprocal Rank (MRR): {mrr:.4f}')
    print(f'Recall@10: {recall_at_10 * 100:.2f}%')
    print(f'Recall@100: {recall_at_100 * 100:.2f}%')

    em_upper_bound, f1_upper_bound = compute_extractive_upper_bounds(
        question_id_to_docs, temp_files_directory
    )
    print(f'Extractive Upper Bound for EM (100 point scale): {em_upper_bound * 100:.2f}')
    print(f'Extractive Upper Bound for F1 (100 point scale): {f1_upper_bound * 100:.2f}')