in src/utils_data.py [0:0]
def process_one(example):
    # Tokenize the source sequence, padding/truncating to a fixed length so
    # every feature in the dataset has the same shape.
    inputs = tokenizer.encode_plus(
        example.source,
        max_length=max_source_length,
        add_special_tokens=True,
        padding='max_length',
        truncation='longest_first',
        # return_tensors="pt"
    )
    # Tokenize the target sequence the same way; encode() returns the raw
    # list of token ids used as labels.
    labels = tokenizer.encode(
        example.target,
        max_length=max_target_length,
        add_special_tokens=True,
        padding='max_length',
        truncation='longest_first',
        # return_tensors="pt"
    )
    assert len(inputs['input_ids']) == max_source_length
    assert len(labels) == max_target_length

    # Log the first few examples of each split for sanity checking.
    if int(example.guid.split('-')[-1]) < 5:
        logger.info("*** Example ***")
        logger.info("guid: %s", example.guid)
        logger.info("input_ids: %s", " ".join(str(x) for x in inputs["input_ids"]))
        logger.info("attention_mask: %s", " ".join(str(x) for x in inputs["attention_mask"]))
        logger.info("input_tokens: %s", tokenizer.decode(inputs['input_ids']))
        logger.info("labels: %s", tokenizer.decode(labels))

    # Replace padding positions in the labels with -100 so they are ignored
    # by the cross-entropy loss (this treats token id 0 as padding, as in
    # T5-style tokenizers).
    labels = [x if x > 0 else -100 for x in labels]
    return InputFeatures(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        labels=labels,
    )
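A minimal usage sketch follows. process_one relies on module-level names (tokenizer, max_source_length, max_target_length, logger, InputFeatures) that the repository defines elsewhere; the concrete bindings below (a T5 tokenizer loaded via transformers.AutoTokenizer, namedtuple-based Example and InputFeatures containers, and the length settings) are illustrative assumptions, not values taken from the project.

import logging
from collections import namedtuple

from transformers import AutoTokenizer  # assumed dependency

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hypothetical containers; the repository presumably defines its own classes.
Example = namedtuple("Example", ["guid", "source", "target"])
InputFeatures = namedtuple("InputFeatures", ["input_ids", "attention_mask", "labels"])

# Placeholder settings; the real values would come from the training config.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
max_source_length, max_target_length = 64, 32

example = Example(
    guid="train-0",
    source="def add(a, b): return a + b",
    target="Add two numbers and return the sum.",
)
feature = process_one(example)
print(len(feature.input_ids), len(feature.labels))  # 64 32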