in prepro.py [0:0]
def process_qa_file_entry(entry, is_training, tokenizer, args, max_positives=100):
    """Turn one QA entry (a question plus its retrieved passages) into features.

    Every passage in ``entry['positives']`` (up to ``max_positives``) and every
    passage in ``entry['negatives']`` is paired with ``entry['question']`` and
    passed through ``convert_qa_feature``. The per-passage results are gathered
    into parallel lists.

    Args:
        entry: One JSON line from the input file. Keys read here: ``'id'``,
            ``'question'``, ``'answers'``, ``'gt_title'``, ``'positives'``,
            ``'negatives'`` and, only when ``args.similar_answers`` is truthy,
            ``'similar_answers'``. Each positive passage must be indexable at
            position 1 (that element is reported as its doc tokens).
            NOTE: when a non-training entry has neither positives nor
            negatives, ``entry['positives']`` is overwritten in place with a
            single dummy passage.
        is_training: When truthy, answer spans are requested for the positive
            passages (forwarded as ``compute_span``) and at least one positive
            is required. When falsy, ``entry['negatives']`` must be empty.
        tokenizer: Forwarded unchanged to ``convert_qa_feature``.
        args: Configuration object. Attributes read here: ``debug``,
            ``max_passage_length``, ``max_n_answers``, ``similar_answers``.
            It is also forwarded whole to ``convert_qa_feature``.
        max_positives: Maximum number of positive passages to featurize.
            ``None`` disables the cap. Defaults to 100.

    Returns:
        dict with the entry's ``id``, ``question``, ``gt_title`` and
        ``answers``; the parallel ``positive_*`` lists (one element per
        featurized positive passage); ``negative_input_ids`` and
        ``negative_input_mask`` (one element per negative passage);
        ``truncated``, the sum of the ``truncated`` values reported for all
        passages; and ``q_tokens``, the ``q_tokens`` value reported for the
        first positive passage (0 if there is none).

    Raises:
        AssertionError: If the entry violates the training / evaluation
            requirements above and ``args.debug`` is falsy.
    """
    positives = entry['positives']
    negatives = entry['negatives']

    # Training needs at least one positive; evaluation must not carry
    # negatives. Debug runs bypass the check entirely.
    assert args.debug or (
        len(positives) > 0 if is_training else len(negatives) == 0
    ), "entry does not satisfy the positives/negatives requirements for this mode"

    if not is_training and len(positives) == 0 and len(negatives) == 0:
        print("Dev example with no retrieval found")
        # Substitute a placeholder passage so the entry still yields one
        # feature row. The entry itself is updated as well, so code that
        # inspects it afterwards sees the same passage list.
        positives = [["dummy", "dummy", []]]
        entry['positives'] = positives

    # Apply the cap once so that every positive_* list below, including
    # positive_doc_tokens, describes exactly the same passages.
    positives = positives[:max_positives]

    def featurize(passage, compute_span):
        # Single place for the shared convert_qa_feature arguments;
        # 'similar_answers' is only looked up when the option is enabled.
        return convert_qa_feature(
            tokenizer,
            entry['question'],
            passage,
            max_length=args.max_passage_length,
            max_n_answers=args.max_n_answers,
            compute_span=compute_span,
            similar_answers=entry['similar_answers'] if args.similar_answers else None,
            args=args,
        )

    positive_doc_tokens = [passage[1] for passage in positives]
    positive_input_ids = []
    positive_input_mask = []
    positive_tokens = []
    positive_tok_to_orig_map = []
    positive_start_positions = []
    positive_end_positions = []
    positive_answer_mask = []
    negative_input_ids = []
    negative_input_mask = []
    num_truncated = 0
    num_q_tokens = 0

    for idx, passage in enumerate(positives):
        (input_ids, input_mask, tokens, tok_to_orig_map, start_positions,
         end_positions, answer_mask, truncated, q_tokens) = featurize(passage, is_training)
        num_truncated += truncated
        if idx == 0:
            # Same question for every passage, so record its length only once.
            num_q_tokens += q_tokens
        positive_input_ids.append(input_ids)
        positive_input_mask.append(input_mask)
        positive_tokens.append(tokens)
        positive_tok_to_orig_map.append(tok_to_orig_map)
        positive_start_positions.append(start_positions)
        positive_end_positions.append(end_positions)
        positive_answer_mask.append(answer_mask)

    for passage in negatives:
        # Negatives never need answer spans; only ids and mask are kept.
        input_ids, input_mask, _, _, _, _, _, truncated, _ = featurize(passage, False)
        num_truncated += truncated
        negative_input_ids.append(input_ids)
        negative_input_mask.append(input_mask)

    return {
        'id': entry['id'],
        'positive_input_ids': positive_input_ids,
        'positive_input_mask': positive_input_mask,
        'positive_tokens': positive_tokens,
        'positive_doc_tokens': positive_doc_tokens,
        'positive_tok_to_orig_map': positive_tok_to_orig_map,
        'positive_start_positions': positive_start_positions,
        'positive_end_positions': positive_end_positions,
        'positive_answer_mask': positive_answer_mask,
        'negative_input_ids': negative_input_ids,
        'negative_input_mask': negative_input_mask,
        'question': entry['question'],
        'gt_title': entry['gt_title'],
        'answers': entry['answers'],
        'truncated': num_truncated,
        'q_tokens': num_q_tokens,
    }