def read_squad()

in src/biencoder_predict_qa.py [0:0]


def read_squad(squad_file, roberta, args):
    """Read a SQuAD-style JSON file and encode each question with its paragraph.

    The file is expected to have the layout
    ``{"data": [{"paragraphs": [{"context": str,
    "qas": [{"id": ..., "question": str}, ...]}, ...]}, ...]}``.

    Args:
        squad_file: Path to the JSON file.
        roberta: Object exposing ``encode(text)``; the returned value must
            support ``len()`` and slicing. (Presumably a fairseq RoBERTa hub
            interface returning a token tensor -- confirm against the caller.)
        args: Object with an integer ``question_max_len`` attribute, the
            maximum number of encoded question tokens to keep.

    Returns:
        A list of ``(question_id, paragraph_bpe, question_bpe)`` tuples, one
        per question, in file order. All questions of a paragraph share the
        same ``paragraph_bpe`` object.

    Side effects:
        Prints how many questions were truncated.
    """
    # JSON is UTF-8 by specification; relying on the locale's default encoding
    # breaks (or silently corrupts text) on platforms where it is not UTF-8.
    with open(squad_file, encoding="utf-8") as f:
        squad_data = json.load(f)

    # Get examples to prepare to batch them
    examples = []
    truncated_questions = 0
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            # Encode the context once and reuse it for every question on it.
            paragraph_bpe = roberta.encode(paragraph['context'])
            for qa in paragraph['qas']:
                question_bpe = roberta.encode(qa['question'])
                if len(question_bpe) > args.question_max_len:
                    # NOTE(review): this keeps only the leading tokens; if
                    # encode() appends an end-of-sentence token it is dropped
                    # here -- confirm that downstream code tolerates this.
                    question_bpe = question_bpe[:args.question_max_len]
                    truncated_questions += 1
                examples.append((qa['id'], paragraph_bpe, question_bpe))
    print(f'Truncated {truncated_questions}/{len(examples)} questions')
    return examples