In utils_ranking.py:
def process_one(examples):
    """Encode one group of num_cand candidate examples into a single flattened feature row.

    Relies on tokenizer, max_length, num_cand, and logger from the enclosing scope.
    """
    input_ids, token_type_ids, attention_mask = [], [], []
    assert len(examples) == num_cand
    for (ex_index, example) in enumerate(examples):
        # Pair-encode the query (text_a) with the candidate (text_b); pad every
        # sequence to max_length and truncate only the candidate side if needed.
        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation='only_second')
        # Log the first few candidates of the first group (guid index 0) for inspection.
        if example.guid.split('_')[1] == '0' and ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in inputs["input_ids"]]))
            logger.info("input_tokens: %s" % " ".join([str(tokenizer.convert_ids_to_tokens(x)) for x in inputs["input_ids"]]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in inputs["attention_mask"]]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in inputs["token_type_ids"]]))
        # Concatenate the encodings of all num_cand candidates into flat lists.
        input_ids.extend(inputs["input_ids"])
        token_type_ids.extend(inputs["token_type_ids"])
        attention_mask.extend(inputs["attention_mask"])
    assert len(input_ids) == max_length * num_cand, f"actual length {len(input_ids)}; required {max_length * num_cand}"
    assert len(token_type_ids) == max_length * num_cand
    assert len(attention_mask) == max_length * num_cand
    # Label is fixed to 0, i.e. the positive candidate is presumably placed first in each group.
    return InputFeatures(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        label=0
    )
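
A minimal usage sketch, not taken from the original file: it assumes examples arrive as a flat list in which every consecutive block of num_cand entries belongs to one query, calls process_one on each block, and packs the resulting InputFeatures rows into tensors for a ranking model. The helper names convert_examples_to_features and features_to_dataset are illustrative assumptions, not part of utils_ranking.py.

import torch
from torch.utils.data import TensorDataset

def convert_examples_to_features(examples, num_cand):
    # examples: flat list whose length is a multiple of num_cand; each
    # consecutive block of num_cand entries shares one query.
    features = []
    for start in range(0, len(examples), num_cand):
        features.append(process_one(examples[start:start + num_cand]))
    return features

def features_to_dataset(features):
    # Each feature row holds num_cand concatenated sequences of max_length
    # tokens; downstream code can reshape to [batch, num_cand, max_length].
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    return TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)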