def evaluate_multirc(ids_preds, labels)

in metrics/super_glue/super_glue.py [0:0]


from sklearn.metrics import f1_score  # imported at module level in super_glue.py


def evaluate_multirc(ids_preds, labels):
    """
    Computes F1 score and Exact Match for MultiRC predictions.

    Returns a dict with the per-question exact match ("exact_match"), the mean of
    the per-question macro F1 scores ("f1_m"), and the F1 over all answer options ("f1_a").
    """
    # Group (prediction, label) pairs by question, keyed on the paragraph and question indices.
    question_map = {}
    for id_pred, label in zip(ids_preds, labels):
        question_id = f'{id_pred["idx"]["paragraph"]}-{id_pred["idx"]["question"]}'
        pred = id_pred["prediction"]
        if question_id in question_map:
            question_map[question_id].append((pred, label))
        else:
            question_map[question_id] = [(pred, label)]
    f1s, ems = [], []
    for question, preds_labels in question_map.items():
        question_preds, question_labels = zip(*preds_labels)
        # Macro F1 over this question's answer options.
        f1 = f1_score(y_true=question_labels, y_pred=question_preds, average="macro")
        f1s.append(f1)
        # Exact match: 1 only if every answer option for this question is predicted correctly.
        em = int(sum(p == l for p, l in preds_labels) == len(preds_labels))
        ems.append(em)
    f1_m = float(sum(f1s) / len(f1s))
    em = sum(ems) / len(ems)
    # F1 over all answer options, ignoring the per-question grouping.
    f1_a = float(f1_score(y_true=labels, y_pred=[id_pred["prediction"] for id_pred in ids_preds]))
    return {"exact_match": em, "f1_m": f1_m, "f1_a": f1_a}
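
As a quick sanity check, here is a minimal sketch of calling the helper directly; the paragraph/question/answer indices, predictions, and labels are toy values made up for illustration (only the paragraph and question indices are read by this function).

# Two questions from one paragraph; question 0 has 2 answer options, question 1 has 3.
toy_ids_preds = [
    {"idx": {"paragraph": 0, "question": 0, "answer": 0}, "prediction": 1},
    {"idx": {"paragraph": 0, "question": 0, "answer": 1}, "prediction": 0},
    {"idx": {"paragraph": 0, "question": 1, "answer": 0}, "prediction": 1},
    {"idx": {"paragraph": 0, "question": 1, "answer": 1}, "prediction": 1},
    {"idx": {"paragraph": 0, "question": 1, "answer": 2}, "prediction": 0},
]
toy_labels = [1, 0, 1, 0, 0]

print(evaluate_multirc(toy_ids_preds, toy_labels))
# Question 0 is answered perfectly, question 1 has one wrong option:
# {'exact_match': 0.5, 'f1_m': 0.833..., 'f1_a': 0.8}

In the metric script itself, this helper is reached through the metric's _compute for the "multirc" config, with predictions and references passed through in the same shape as above.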