def compute_score()

in metrics/cuad/compute_score.py


def compute_score(dataset, predictions):
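    # Helpers referenced below (compute_precision_recall, metric_max_over_ground_truths,
    # exact_match_score, get_aupr, get_prec_at_recall) and the `sys` import are assumed
    # to be defined elsewhere in compute_score.py; only this function is excerpted here.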
    f1 = exact_match = total = 0
    precisions = []
    recalls = []
    for article in dataset:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                total += 1
                if qa["id"] not in predictions:
                    message = "Unanswered question " + qa["id"] + " will receive score 0."
                    print(message, file=sys.stderr)
                    continue
                ground_truths = list(map(lambda x: x["text"], qa["answers"]))
                prediction = predictions[qa["id"]]
                precision, recall = compute_precision_recall(prediction, ground_truths, qa["id"])

                precisions.append(precision)
                recalls.append(recall)

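                # Accumulate per-question F1, guarding against division by zero when
                # both precision and recall are 0 for this question.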
                if precision == 0 and recall == 0:
                    f1 += 0
                else:
                    f1 += 2 * (precision * recall) / (precision + recall)

                exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)

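    # Sort the (recall, precision) pairs by recall so that AUPR and the
    # precision-at-recall lookups below see recalls in ascending order.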
    precisions = [x for _, x in sorted(zip(recalls, precisions))]
    recalls.sort()

    f1 = 100.0 * f1 / total
    exact_match = 100.0 * exact_match / total
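    # Area under the precision-recall curve (AUPR) over the sorted per-question scores.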
    aupr = get_aupr(precisions, recalls)

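    # Precision at fixed recall thresholds of 90% and 80%.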
    prec_at_90_recall = get_prec_at_recall(precisions, recalls, recall_thresh=0.9)
    prec_at_80_recall = get_prec_at_recall(precisions, recalls, recall_thresh=0.8)

    return {
        "exact_match": exact_match,
        "f1": f1,
        "aupr": aupr,
        "prec_at_80_recall": prec_at_80_recall,
        "prec_at_90_recall": prec_at_90_recall,
    }
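
A minimal usage sketch, assuming the rest of metrics/cuad/compute_score.py (the helper functions and `import sys`) is available in scope; the contract id, answer text, and prediction below are made up for illustration:

# Hypothetical CUAD-style inputs: one article, one paragraph, one question.
dataset = [
    {
        "paragraphs": [
            {
                "qas": [
                    {
                        "id": "DemoContract__Governing Law",
                        "answers": [{"text": "governed by the laws of the State of Delaware"}],
                    }
                ]
            }
        ]
    }
]
predictions = {"DemoContract__Governing Law": "the laws of the State of Delaware"}

scores = compute_score(dataset, predictions)
# scores is a dict with keys: exact_match, f1, aupr, prec_at_80_recall, prec_at_90_recall

Note that questions whose ids are missing from predictions only trigger a warning on stderr but still count toward total, so they pull down both exact_match and f1.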