in src/biencoder_predict_qa.py [0:0]
def read_squad(squad_file, roberta, args):
    """Load a SQuAD-style JSON file and encode each question with its paragraph.

    The file is expected to have the layout
    ``{"data": [{"paragraphs": [{"context": ..., "qas": [{"id": ...,
    "question": ...}, ...]}, ...]}, ...]}``.

    Args:
        squad_file: Path of the JSON file to read.
        roberta: Object exposing ``encode(text)``; the returned value must
            support ``len()`` and slicing.
        args: Object exposing ``question_max_len``, the maximum number of
            encoded items kept per question.

    Returns:
        A list of ``(question_id, paragraph_bpe, question_bpe)`` tuples, one
        per question, in file order. All questions of the same paragraph
        share the very same ``paragraph_bpe`` object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an expected key is missing from the data.

    Side effects:
        Prints how many questions were truncated out of the total.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # locale encoding, which would mis-decode non-ASCII text on some systems.
    with open(squad_file, encoding='utf-8') as f:
        squad_data = json.load(f)

    # Get examples to prepare to batch them
    examples = []
    truncated_questions = 0
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            # Encode the context once and reuse it for every question on it.
            paragraph_bpe = roberta.encode(paragraph['context'])
            for qa in paragraph['qas']:
                question_bpe = roberta.encode(qa['question'])
                if len(question_bpe) > args.question_max_len:
                    # NOTE(review): this keeps only the leading items; if the
                    # encoder appends an end-of-sequence marker it is dropped
                    # here -- confirm that is what the model expects.
                    question_bpe = question_bpe[:args.question_max_len]
                    truncated_questions += 1
                examples.append((qa['id'], paragraph_bpe, question_bpe))
    print(f'Truncated {truncated_questions}/{len(examples)} questions')
    return examples