in metrics/super_glue/super_glue.py
from sklearn.metrics import f1_score


def evaluate_multirc(ids_preds, labels):
    """
    Computes F1 score and Exact Match for MultiRC predictions.
    """
    # Group (prediction, label) pairs by question, keyed on the paragraph and
    # question indices, since MultiRC scores each question's answers jointly.
    question_map = {}
    for id_pred, label in zip(ids_preds, labels):
        question_id = f'{id_pred["idx"]["paragraph"]}-{id_pred["idx"]["question"]}'
        pred = id_pred["prediction"]
        if question_id in question_map:
            question_map[question_id].append((pred, label))
        else:
            question_map[question_id] = [(pred, label)]
    f1s, ems = [], []
    for question, preds_labels in question_map.items():
        question_preds, question_labels = zip(*preds_labels)
        # Macro F1 over this question's answer options.
        f1 = f1_score(y_true=question_labels, y_pred=question_preds, average="macro")
        f1s.append(f1)
        # Exact match: 1 only if every answer option for this question is correct.
        em = int(sum(p == l for p, l in preds_labels) == len(preds_labels))
        ems.append(em)
    f1_m = float(sum(f1s) / len(f1s))  # mean of the per-question macro F1 scores
    em = sum(ems) / len(ems)  # fraction of questions with all answers correct
    # Binary F1 over all answer options pooled across questions.
    f1_a = float(f1_score(y_true=labels, y_pred=[id_pred["prediction"] for id_pred in ids_preds]))
    return {"exact_match": em, "f1_m": f1_m, "f1_a": f1_a}
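A minimal usage sketch with hypothetical toy data. Each ids_preds entry follows
the SuperGLUE MultiRC prediction format; only the "paragraph" and "question"
fields of the "idx" dict are read here (the "answer" index is part of the
format but unused by this function):

if __name__ == "__main__":
    ids_preds = [
        {"idx": {"paragraph": 0, "question": 0, "answer": 0}, "prediction": 1},
        {"idx": {"paragraph": 0, "question": 0, "answer": 1}, "prediction": 0},
        {"idx": {"paragraph": 0, "question": 1, "answer": 0}, "prediction": 1},
        {"idx": {"paragraph": 0, "question": 1, "answer": 1}, "prediction": 1},
    ]
    labels = [1, 0, 1, 0]
    print(evaluate_multirc(ids_preds, labels))
    # -> {'exact_match': 0.5, 'f1_m': 0.666..., 'f1_a': 0.8}
    # (sklearn warns about an undefined F1 for question 0-1, where no option
    # is predicted 0; that per-class F1 is counted as 0 in the macro average.)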