in utils_ranking.py [0:0]
def _create_examples(self):
    """Yield one ranking example per question, packing every candidate into a single feature.

    Reads JSON-lines records from ``self.filepath``. Each record holds a
    ``question`` and a list of ``candidates``; every candidate is encoded as a
    (question, title + '[title]' + text) pair, and the token ids of all
    candidates are concatenated into one flat sequence of length
    ``self.max_length * self.num_cand``.

    The data is expected to be arranged so that the gold candidate is always
    first (label 1 at index 0, label 0 elsewhere) — this is asserted — which
    is why every yielded ``InputFeatures`` carries ``label=0``: the target is
    the *index* of the correct candidate, always 0.

    Yields:
        InputFeatures: concatenated input_ids / token_type_ids /
        attention_mask for all candidates of one question, with label 0.

    Raises:
        AssertionError: if candidate labels are not (1, 0, 0, ...) or the
        concatenated lengths do not match ``max_length * num_cand``.
    """
    # Invariant total length of each concatenated field, hoisted out of the loop.
    expected_len = self.max_length * self.num_cand
    with jsonlines.open(self.filepath, 'r') as reader:
        for idx_i, data in enumerate(reader.iter()):
            input_ids, token_type_ids, attention_mask = [], [], []
            for idx_j, line in enumerate(data['candidates']):
                guid = "%s-%s" % (self.mode, f"example_{idx_i}_index_{idx_j}")  # example_1_index_7
                text_a = data['question']
                # '[title]' is a plain separator string between article title and body text.
                text_b = line['article_title'] + '[title]' + line['text']
                # 'judge' may be a dict of judge variants or already a scalar label.
                label = line['judge']['judge_contain_some'] if isinstance(line['judge'], dict) else line['judge']
                # Data contract: the gold candidate is always at index 0.
                if idx_j == 0:
                    assert label == 1
                else:
                    assert label == 0
                inputs = self.tokenizer.encode_plus(
                    text_a,
                    text_b,
                    add_special_tokens=True,
                    max_length=self.max_length,
                    padding='max_length',
                    truncation='only_second')  # truncate only the candidate text, never the question
                # Concatenate this candidate's fixed-length encoding onto the flat example.
                input_ids.extend(inputs["input_ids"])
                token_type_ids.extend(inputs["token_type_ids"])
                attention_mask.extend(inputs["attention_mask"])
            # BUG FIX: the original message referenced bare `max_length`/`num_cand`,
            # which would raise NameError instead of showing the intended message.
            assert len(input_ids) == expected_len, f"actual length {len(input_ids)}; required {expected_len}"
            assert len(token_type_ids) == expected_len
            assert len(attention_mask) == expected_len
            yield InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=0  # gold candidate is always index 0 (enforced by the asserts above)
            )