in scripts/evaluate_hybrid.py [0:0]
import glob
import json


def evaluation(in_prefix, *, eval_type, rank_ulimit):
    k_values = [1, 2, 3, 4, 5, 10, 25, 50, 100, 200]
    total = 0
    correct = {k: 0 for k in k_values}
    map_result = 0.0
    mrr_result = 0.0
    judge_key = 'NOT_INITIALIZED'
    for inpath in sorted(glob.glob(in_prefix + "*")):
        print(inpath)
        with open(inpath, 'rt', encoding='utf-8') as infile:
            for i, line in enumerate(infile):
                question = json.loads(line.strip())
                # Inspect the first question of each file to decide which key
                # holds the relevance judgments: a dict-valued 'judge' means the
                # judgments live under 'judge_contain_some'.
                if i == 0:
                    if isinstance(question['passages'][0]['judge'], dict):
                        judge_key = 'judge_contain_some'
                    else:
                        judge_key = None
                total += 1
                if len(question[eval_type]) == 0:
                    print("NONE")
                    continue
                # Build (score, relevance-label) pairs for the ranked documents;
                # the score slot is not used by the metrics below.
                # _determine_judge is a helper defined elsewhere in this module.
                rank_list = [("", _determine_judge(doc, judge_key))
                             for doc in question[eval_type]]
                rank_list = rank_list[:rank_ulimit]
                # Recall@k: the question counts as correct at cutoff k if any of
                # the top-k documents is relevant.
                for k in k_values:
                    if k > rank_ulimit:
                        break
                    for j in range(k):
                        if j >= len(rank_list):
                            break
                        if rank_list[j][1] == 1:
                            correct[k] += 1
                            break
                # MAP and MRR for this question.
                cur_map = 0.0
                cur_map_total = 0.0
                cur_mrr = 0.0
                for j, (_, cur_label) in enumerate(rank_list):
                    if cur_label == 1:
                        cur_map_total += 1
                        cur_map += cur_map_total / (1 + j)
                        if cur_mrr == 0.0:
                            cur_mrr = 1 / float(1 + j)
                if cur_map_total != 0.0:
                    cur_map = cur_map / cur_map_total
                map_result += cur_map
                mrr_result += cur_mrr
    # Build the accuracy report.
    out_report = ''
    for k in k_values:
        cur_accuracy = correct[k] / float(total) * 100
        out_report += 'Top-%d:\t%.2f\n' % (k, cur_accuracy)
    map_result = map_result / float(total) * 100
    mrr_result = mrr_result / float(total) * 100
    out_report += 'MAP:\t%.2f\n' % (map_result)
    out_report += 'MRR:\t%.2f\n' % (mrr_result)
    out_report += 'Total:\t%d\n' % (total)
    out_report += 'Size of rank list:\t%d\n' % (rank_ulimit)
    acc = correct[1] / float(total)
    print(f'eval_type: {eval_type}')
    print(f'judge_key: {judge_key}')
    print(out_report)
    print()
    return acc
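A minimal sketch of how evaluation might be wired up as a script entry point; the flag names, defaults, and argparse glue below are assumptions for illustration, not taken from the original file:

if __name__ == '__main__':
    import argparse

    # Hypothetical CLI glue; flag names and defaults are assumed, not from the
    # original script.
    parser = argparse.ArgumentParser(description='Evaluate hybrid retrieval output.')
    parser.add_argument('--in_prefix', required=True,
                        help='Prefix of the JSONL result files to evaluate.')
    parser.add_argument('--eval_type', default='passages',
                        help='Field of each question holding the ranked documents.')
    parser.add_argument('--rank_ulimit', type=int, default=100,
                        help='Maximum number of ranked documents to score per question.')
    args = parser.parse_args()

    evaluation(args.in_prefix,
               eval_type=args.eval_type,
               rank_ulimit=args.rank_ulimit)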