in scripts/evaluate_script.py [0:0]

import collections

from tqdm import tqdm


def get_raw_scores(examples, reference):
    """
    Computes exact-match and F1 scores by aligning the generated examples
    with the reference annotations and scoring each prediction against its
    gold answers.
    """
    exact_scores = {}
    f1_scores = {}
    i = 0  # index into `reference`, advanced as examples are matched
    skipped = False  # set when reference entries are skipped during realignment
    for idx, example in tqdm(enumerate(examples), total=len(examples)):
        # Normalize the target text so it can be compared against the
        # normalized reference answers below.
        eg_tgt = remove_special_tokens(replace_keys(example['tgt'], 'answer'))
        try:
            # Advance through the reference list until its answer set contains
            # the normalized target text; this realigns the two lists when the
            # generated examples and the references are not in lockstep.
            while eg_tgt not in [remove_special_tokens(tokenizer.decode(tokenizer.encode(str(x)))) for x in reference[i]['answers']]:
                i += 1
                skipped = True
        except IndexError:
            # Ran off the end of the reference list: no further alignment is
            # possible, so stop scoring.
            break
        if skipped:
            print(f"realigned example {idx} to reference entry {i}")
            skipped = False
        # Score the prediction against every gold denotation and keep the best.
        gold_answers = [str(x).lstrip() for x in reference[i]['denotation']]
        qas_id = reference[i]['qid']
        prediction = replace_keys(example['gen_text'], "answer")
        exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
        f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)
        i += 1
    qid_list = exact_scores.keys()
    total = len(qid_list)
    return collections.OrderedDict(
        [
            ("total exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total),
            ("total f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total),
            ("total", total),
        ]
    )
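

# A minimal usage sketch (assumption: the rest of evaluate_script.py, i.e.
# tokenizer, remove_special_tokens, replace_keys, compute_exact, and
# compute_f1, is already defined). The field layouts of `examples` and
# `reference` are inferred from the function body; the concrete values are
# illustrative only.
if __name__ == "__main__":
    examples = [
        {"tgt": "answer: 42", "gen_text": "answer: 42"},  # one generated example
    ]
    reference = [
        {"qid": "q0", "answers": ["42"], "denotation": ["42"]},  # matching reference entry
    ]
    scores = get_raw_scores(examples, reference)
    print(scores)  # OrderedDict with 'total exact', 'total f1', and 'total'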