in src/utils_fusion_in_decoder.py [0:0]
def process_one(examples):
    """Tokenize a batch of Fusion-in-Decoder examples into InputFeatures."""
    output = []
    for (ex_index, example) in enumerate(examples):
        # Encode every retrieved passage for this example; `source` is a list of str,
        # so `input_ids` / `attention_mask` come back as lists of lists (one row per passage).
        inputs = tokenizer.batch_encode_plus(
            example.source,
            max_length=max_source_length,
            add_special_tokens=True,
            padding='max_length',
            truncation='longest_first',
            # return_tensors="pt"
        )
        # Encode the single target string into a flat list of token ids.
        labels = tokenizer.encode(
            example.target,
            max_length=max_source_length,
            add_special_tokens=True,
            padding='max_length',
            truncation='longest_first',
            # return_tensors="pt"
        )
        # Log the first few examples (guid suffix < 10) for inspection.
        if int(example.guid.split('-')[-1]) < 10:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in inputs["input_ids"][0]]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in inputs["attention_mask"][0]]))
            logger.info("input_tokens: %s" % tokenizer.decode(inputs["input_ids"][0]))
            logger.info("labels: %s" % tokenizer.decode(labels))
        # Every encoded passage must be padded/truncated to exactly max_source_length.
        for input_ids in inputs['input_ids']:
            assert len(input_ids) == max_source_length
        # Replace padding ids (0 for T5-style tokenizers) with -100 so the loss ignores them.
        labels = [x if x > 0 else -100 for x in labels]
        output.append(
            InputFeatures(
                input_ids=inputs['input_ids'],            # list of lists
                attention_mask=inputs['attention_mask'],  # list of lists
                labels=labels,                            # flat list
            )
        )
    return output
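
For reference, `process_one` relies on module-level state (`tokenizer`, `max_source_length`, `logger`, `InputFeatures`) defined elsewhere in the file. Below is a minimal, self-contained sketch of that assumed setup plus a usage example; the tokenizer checkpoint, the `InputExample` container, and all sample values are hypothetical, introduced only to show how the function can be exercised.

# NOTE: illustrative sketch, not part of the original file.
import logging
from dataclasses import dataclass
from typing import List

from transformers import AutoTokenizer

logger = logging.getLogger(__name__)
tokenizer = AutoTokenizer.from_pretrained("t5-base")  # assumed seq2seq tokenizer
max_source_length = 512                               # assumed per-passage length cap


@dataclass
class InputFeatures:
    """One training instance: N encoded passages plus one target sequence."""
    input_ids: List[List[int]]        # one row per source passage
    attention_mask: List[List[int]]   # one row per source passage
    labels: List[int]                 # target ids, padding masked to -100


@dataclass
class InputExample:
    """Hypothetical raw-example container matching the attributes used above."""
    guid: str
    source: List[str]   # several retrieved passages for one question
    target: str


if __name__ == "__main__":
    examples = [
        InputExample(
            guid="train-0",
            source=[
                "question: who wrote Hamlet? context: first retrieved passage",
                "question: who wrote Hamlet? context: second retrieved passage",
            ],
            target="William Shakespeare",
        )
    ]
    features = process_one(examples)
    print(len(features), len(features[0].input_ids))  # 1 example, 2 encoded passages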