in lama/modules/gpt_connector.py [0:0]
def __get_input_tensors(self, sentence_list):
    """Concatenate, tokenize, and convert a list of sentences to model inputs.

    Sentences are joined with the EOS token; each occurrence of the special
    ``[MASK]`` marker is replaced by ``self.unk_symbol`` and its position
    recorded.

    Args:
        sentence_list: A list of strings. Each string may contain the special
            [MASK] token.

    Returns:
        A tuple (src_tensor, dst_tensor, masked_indices, tokenized_text).
        src_tensor: torch.LongTensor with shape (seq_len), the input to
            the network without the last symbol and with EOS prepended.
        dst_tensor: torch.LongTensor with shape (seq_len), the expected
            output, i.e. src_tensor shifted left by one position.
        masked_indices: A list of indices of [MASK] in dst_tensor.
        tokenized_text: A list of token strings.
    """
    # Split each sentence by [MASK] and tokenize the chunks independently,
    # so the mask marker itself never reaches the subword tokenizer.
    tokenized_text = []
    masked_indices = []
    for sentence_idx, sentence in enumerate(sentence_list):
        if sentence_idx > 0:
            # Separate consecutive sentences with the EOS token.
            tokenized_text.append(OPENAI_EOS)
        for chunk_idx, chunk in enumerate(sentence.split('[MASK]')):
            if chunk_idx > 0:
                # A [MASK] sat between this chunk and the previous one;
                # its position in tokenized_text aligns with dst_tensor.
                masked_indices.append(len(tokenized_text))
                tokenized_text.append(self.unk_symbol)
            chunk = chunk.strip()
            if chunk:
                tokenized_text.extend(self.tokenizer.tokenize(chunk))
    # Prepend EOS so the model has a start symbol to condition on.
    full_indexed_tokens = [
        self.eos_id
    ] + self.tokenizer.convert_tokens_to_ids(tokenized_text)
    full_tokens_tensor = torch.tensor(full_indexed_tokens)
    # Teacher-forcing shift: src drops the last token, dst drops the first.
    src_tensor = full_tokens_tensor[:-1]
    dst_tensor = full_tokens_tensor[1:]
    return src_tensor, dst_tensor, masked_indices, tokenized_text