def get_raw_scores()

in scripts/evaluate_script.py [0:0]


import collections

from tqdm import tqdm

# `tokenizer`, `remove_special_tokens`, `replace_keys`, `compute_exact`, and
# `compute_f1` are assumed to be defined or imported elsewhere in the script.


def get_raw_scores(examples, reference):
    """
    Computes per-question exact-match and F1 scores by aligning the model
    predictions in `examples` with the gold answers in `reference`, then
    returns the aggregated totals.
    """

    exact_scores = {}
    f1_scores = {}

    i = 0            # running index into `reference`, advanced to stay aligned with `examples`
    skipped = False  # set when reference entries had to be skipped to re-align

    for idx, example in tqdm(enumerate(examples), total=len(examples)):

        # Normalized target answer for the current example.
        eg_tgt = remove_special_tokens(replace_keys(example['tgt'], 'answer'))

        # Advance through `reference` until its (re-tokenized, normalized)
        # answers contain the current target; stop if we run off the end.
        try:
            while eg_tgt not in [
                remove_special_tokens(tokenizer.decode(tokenizer.encode(str(x))))
                for x in reference[i]['answers']
            ]:
                i += 1
                skipped = True
        except IndexError:
            break

        if skipped:
            print(idx, i)  # log example/reference positions after re-alignment
            skipped = False

        # Score the generated answer against every gold denotation and keep the best.
        gold_answers = [str(x).lstrip() for x in reference[i]['denotation']]
        qas_id = reference[i]['qid']
        prediction = replace_keys(example['gen_text'], "answer")

        exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
        f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)

        i += 1  # move past the matched reference entry

    # Aggregate the per-question scores into percentage totals.
    qid_list = list(exact_scores.keys())
    total = len(qid_list)

    return collections.OrderedDict(
        [
            ("total exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total),
            ("total f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total),
            ("total", total),
        ]
    )
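
A minimal calling sketch. The record fields used below ('tgt', 'gen_text', 'qid', 'answers', 'denotation') are the ones the function itself reads, but the concrete values and string formats are placeholders: real inputs depend on the script's tokenizer and on its replace_keys / remove_special_tokens helpers.

# Hypothetical records, shaped only by the fields get_raw_scores reads;
# real entries come from the script's own data loading and generation step.
examples = [{"tgt": "answer: Paris", "gen_text": "answer: Paris"}]
reference = [{"qid": "q-0", "answers": ["Paris"], "denotation": ["Paris"]}]

scores = get_raw_scores(examples, reference)
# `scores` is an OrderedDict with keys "total exact", "total f1", and "total".
print(scores["total exact"], scores["total f1"], scores["total"])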