in kilt/readers/t5/data.py [0:0]
import os

# `dataset_task_map` and `dataset_config` are module-level mappings defined
# elsewhere in this file (dataset name -> task name / per-dataset config).
def encode_seq(tokenizer, seqs, max_length, out_dir, dataset, side='source', type_path='train',
               pad_to_max_length=True, return_tensors="pt"):
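    """Preprocess, tokenize, and batch-encode `seqs` for one KILT dataset/split,
    writing a human-readable token dump to `out_dir` and returning the batch."""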
    # One debug file per (dataset, split, side) holding the tokenized text.
    output_file = os.path.join(out_dir, dataset + "-" + type_path + "-" + side + ".encoded")
    with open(output_file, "w") as f_out:
        texts = []
        for text in seqs:
            if dataset_task_map[dataset] == 'Entity Linking' and side == 'source':
                # Keep at most source_length/2 tokens of context on each side
                # of the [START_ENT] ... [END_ENT] mention span.
                length = int(dataset_config[dataset]['source_length']) // 2
                mention_start = text.find('[START_ENT]')
                mention_end = text.find('[END_ENT]')
                left = text[0:mention_start]
                right = text[mention_end + len('[END_ENT]'):]
                left_ids = tokenizer.encode(left)
                right_ids = tokenizer.encode(right)
                left = tokenizer.decode(left_ids[max(0, len(left_ids) - length):len(left_ids)])
                right = tokenizer.decode(right_ids[0:min(len(right_ids), length)])
                text = left + ' ' + text[mention_start:mention_end] + '[END_ENT] ' + right
            if dataset == 'wow' and side == 'source':
                # Wizard of Wikipedia dialogue turns are newline-separated;
                # make the separator an explicit [SEP] token instead.
                text = text.replace('\n', '[SEP]')
            if dataset == 'fever' and side == 'target':
                # Map FEVER labels onto dedicated target tokens.
                if text == "REFUTES":
                    text = "<REFUTES>"
                elif text == "SUPPORTS":
                    text = "<SUPPORTS>"
            # Sources get a T5-style task prefix; targets are kept as-is.
            txt = text if side == 'target' else dataset_task_map[dataset] + ": " + text
            txt = txt + tokenizer.eos_token
            texts.append(txt)
        if dataset == 'wow' and side == 'source':
            # NOTE: 'left' is merely truthy here, so this pads exactly like
            # pad_to_max_length=True; genuine left-sided padding would instead
            # require setting tokenizer.padding_side = 'left'.
            tokenized = tokenizer.batch_encode_plus(
                texts, add_special_tokens=True, max_length=max_length, pad_to_max_length='left',
                return_tensors=return_tensors,
            )
        else:
            tokenized = tokenizer.batch_encode_plus(
                texts, add_special_tokens=True, max_length=max_length, pad_to_max_length=pad_to_max_length,
                return_tensors=return_tensors,
            )
        # Dump one ' | '-separated token row per example for manual inspection.
        for input_ids in tokenized["input_ids"]:
            ids = input_ids.tolist() if hasattr(input_ids, "tolist") else input_ids
            tokens = tokenizer.convert_ids_to_tokens(ids)
            f_out.write(' | '.join(tokens) + "\n")
    return tokenized
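
Below is a minimal usage sketch, not part of the original file: it stubs the module-level `dataset_task_map` / `dataset_config` mappings so it runs standalone, and the dataset entry, claims, and lengths are illustrative assumptions.

import tempfile
from transformers import T5Tokenizer

# Hypothetical stand-ins for the module-level mappings used by encode_seq.
dataset_task_map = {'fever': 'Fact Checking'}
dataset_config = {'fever': {'source_length': '512'}}

tokenizer = T5Tokenizer.from_pretrained('t5-small')
claims = ['The Eiffel Tower is located in Paris.',
          'Water boils at 10 degrees Celsius at sea level.']
with tempfile.TemporaryDirectory() as out_dir:
    batch = encode_seq(tokenizer, claims, max_length=32, out_dir=out_dir,
                       dataset='fever', side='source', type_path='dev')
    print(batch['input_ids'].shape)  # torch.Size([2, 32]) with padding on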