def evaluate_ensemble()

in pytorch-transformers/ensemble_answers_by_confidence_script.py [0:0]


def _merge_weighted_nbest(all_nbest_predictions, all_null_odds, model_weights):
    """Merge per-model n-best lists into one weighted n-best list per example.

    Candidates are identified by their ``(text, evidence)`` pair; a candidate
    proposed by several models accumulates the weighted sum of each model's
    ``probability``, ``start_logit`` and ``end_logit``.  Null odds are combined
    with the same weights.

    Args:
        all_nbest_predictions: One dict per model mapping example id to a list
            of prediction dicts (keys read: ``text``, ``probability``,
            ``start_logit``, ``end_logit`` and optionally ``evidence``).
        all_null_odds: One dict per model mapping example id to its null odds.
        model_weights: One weight per model, aligned with the two lists above.

    Returns:
        A ``(merged_nbest, merged_null_odds)`` tuple.  ``merged_nbest`` maps
        each example id to prediction dicts sorted by descending merged
        probability.

    Raises:
        KeyError: If an example id of the first model is missing from another
            model's predictions or null odds.
    """
    merged_nbest = {}
    merged_null_odds = {}
    # The first model's ids define which examples get ensembled.
    for qid in tqdm(all_nbest_predictions[0].keys()):
        prob_by_key, start_by_key, end_by_key = {}, {}, {}
        null_odds_sum = 0.
        for weight, nbest, null_odds in zip(model_weights, all_nbest_predictions, all_null_odds):
            for pred in nbest[qid]:
                key = (pred['text'], pred.get('evidence', ''))
                prob_by_key[key] = prob_by_key.get(key, 0.) + (weight * pred['probability'])
                start_by_key[key] = start_by_key.get(key, 0.) + (weight * pred['start_logit'])
                end_by_key[key] = end_by_key.get(key, 0.) + (weight * pred['end_logit'])
            null_odds_sum = null_odds_sum + (weight * null_odds[qid])
        merged_null_odds[qid] = null_odds_sum
        merged_nbest[qid] = [
            {
                'text': key[0],
                'evidence': key[1],
                'probability': prob_by_key[key],
                'start_logit': start_by_key[key],
                'end_logit': end_by_key[key],
                'logit': start_by_key[key] + end_by_key[key] - null_odds_sum,
                'no_answer_logit': null_odds_sum,
            }
            for key in sorted(prob_by_key, key=prob_by_key.get, reverse=True)
        ]
    return merged_nbest, merged_null_odds


def _select_globally_normed_answers(nbest_predictions, null_odds, max_num_paragraphs=10):
    """Pick one answer per question across its per-paragraph predictions.

    Prediction ids are expected to look like ``<question id>.<paragraph no>``.
    For every question the non-empty candidate with the highest
    ``start_logit + end_logit - null_odds`` over paragraphs
    ``0 .. max_num_paragraphs - 1`` is selected.

    Args:
        nbest_predictions: Dict mapping ``<qid>.<paragraph no>`` to a list of
            prediction dicts.
        null_odds: Dict mapping ``<qid>.<paragraph no>`` to null odds.
        max_num_paragraphs: Number of paragraph indices probed per question.

    Returns:
        A ``(answers, infos)`` tuple.  ``answers`` has the keys ``answer``,
        ``sp``, ``probability``, ``start_logit`` and ``end_logit``, each a dict
        keyed by question id (``sp`` is always an empty list).  ``infos`` maps
        each question id to the full selected prediction dict.

    Raises:
        KeyError: If a question has no paragraph entry in ``null_odds``, or a
            paragraph has predictions but no null odds.
        AssertionError: If the paragraph with the lowest null odds has no
            non-empty candidate, or no non-empty candidate exists at all.
    """
    qids = {paraqid.split('.')[0] for paraqid in nbest_predictions.keys() if '.' in paraqid}
    print('# of eval Qs:', len(qids))

    answers = {'answer': {}, 'sp': {}, 'probability': {}, 'start_logit': {}, 'end_logit': {}}
    infos = {}
    # Sorted so that the key order of the written JSON is reproducible.
    for qid in sorted(qids):
        min_null_odds = float('inf')
        max_logit_sum = float('-inf')
        best_single_hop_qid = None
        for paragraph_no in range(max_num_paragraphs):
            single_hop_qid = qid + '.' + str(paragraph_no)
            # Track the paragraph the ensemble is most confident contains an answer.
            if single_hop_qid in null_odds and null_odds[single_hop_qid] < min_null_odds:
                best_single_hop_qid = single_hop_qid
                min_null_odds = null_odds[single_hop_qid]
            if single_hop_qid not in nbest_predictions:
                continue
            for candidate in nbest_predictions[single_hop_qid]:
                if len(candidate['text']) == 0:
                    continue
                logit_sum = candidate['start_logit'] + candidate['end_logit'] - null_odds[single_hop_qid]
                if logit_sum > max_logit_sum:
                    max_logit_sum = logit_sum
                    answers['answer'][qid] = candidate['text']
                    infos[qid] = candidate
                    for key in ['probability', 'start_logit', 'end_logit']:
                        answers[key][qid] = candidate[key]

        answers['sp'][qid] = []  # NB: Dummy supporting fact for now
        if best_single_hop_qid is None:
            raise KeyError(f'No paragraph predictions found for question {qid!r}')
        # The most confident paragraph must itself offer a non-empty answer.
        assert any(len(candidate['text']) > 0 for candidate in nbest_predictions[best_single_hop_qid]), \
            'Error: No predicted answer found.'
        assert qid in answers['answer'], 'Error: No globally normed predicted answer found.'
    return answers, infos


def evaluate_ensemble(filepaths, ensembled_file, args):
    """Ensemble several models' n-best predictions and evaluate them on HotpotQA.

    The models are averaged with uniform weights.  The ensembled n-best
    predictions and null odds are saved, one answer per question is selected
    across paragraphs, and the selected answers are written out and (unless
    ``args.no_answer_file`` is set) scored with ``evaluate_on_hotpot``.

    Args:
        filepaths: Paths, relative to ``CUR_DIR``, of the per-model
            ``nbest_predictions`` JSON files.  The matching null-odds file is
            located by replacing ``nbest_predictions`` with ``null_odds``
            anywhere in the path.
        ensembled_file: Path, relative to ``CUR_DIR``, whose part before the
            last ``/`` is used as the output directory.  NOTE: without a ``/``
            the whole string is used as the directory name.
        args: Namespace providing ``split`` (used in file names) and
            ``no_answer_file`` (skip evaluation when truthy).

    Returns:
        Dict of the ``evaluate_on_hotpot`` results with ``gn_`` prefixed to
        every key and every value multiplied by 100; empty when
        ``args.no_answer_file`` is truthy.
    """
    prefix = f'{args.split}'
    hotpot_answer_file = f'{DATA_DIR}/hotpot-orig/{args.split}.json'
    model_weights = np.ones(len(filepaths)) / len(filepaths)

    all_nbest_predictions = []
    all_null_odds = []
    print('    Reading files...')
    for filepath in tqdm(filepaths):
        nbest_path = f'{CUR_DIR}/{filepath}'
        with open(nbest_path) as f:
            all_nbest_predictions.append(json.load(f))
        with open(nbest_path.replace("nbest_predictions", "null_odds")) as f:
            all_null_odds.append(json.load(f))

    print('    Ensembling...')
    nbest_predictions, null_odds = _merge_weighted_nbest(
        all_nbest_predictions, all_null_odds, model_weights)

    ensembled_dir = ensembled_file.rsplit('/', 1)[0]
    output_dir = f'{CUR_DIR}/{ensembled_dir}'
    ensembled_filepath = os.path.join(ensembled_dir, f'nbest_predictions_{prefix}.json')
    nbest_out_path = f'{CUR_DIR}/{ensembled_filepath}'
    # Same path-wide substitution as on the input side, so that a later run
    # can locate the null odds from the n-best path in the same way.
    null_odds_out_path = nbest_out_path.replace("nbest_predictions", "null_odds")
    # Create the directories that are actually written to (under CUR_DIR),
    # not a same-named directory relative to the working directory.
    for out_path in (output_dir, os.path.dirname(nbest_out_path), os.path.dirname(null_odds_out_path)):
        os.makedirs(out_path, exist_ok=True)

    print(f'    Saving ensemble predictions...')
    with open(nbest_out_path, 'w') as f:
        json.dump(nbest_predictions, f, indent=2)
    with open(null_odds_out_path, 'w') as f:
        json.dump(null_odds, f, indent=2)

    # The in-memory dicts are reused directly; re-reading the files just
    # written would yield the same keys and float values.
    globally_normed_pred_answers_and_sps, globally_normed_pred_infos = \
        _select_globally_normed_answers(nbest_predictions, null_odds)

    hotpot_output_prediction_gn_file = os.path.join(output_dir, "hotpot_predictions_gn_{}.json".format(prefix))
    with open(hotpot_output_prediction_gn_file, "w") as writer:
        writer.write(json.dumps(globally_normed_pred_answers_and_sps, indent=2))
    if args.no_answer_file:
        hotpot_gn_results = {}
    else:
        hotpot_gn_results = evaluate_on_hotpot(hotpot_output_prediction_gn_file, hotpot_answer_file)

    gn_info_file = os.path.join(output_dir, "hotpot_predictions_gn_info_{}.json".format(prefix))
    with open(gn_info_file, "w") as writer:
        writer.write(json.dumps(globally_normed_pred_infos, indent=2))
    print(f'    Saved to {gn_info_file}')

    # Presumably the metrics are fractions in [0, 1], reported here as percentages.
    return {'gn_' + k: v * 100. for k, v in hotpot_gn_results.items()}