in lama/modules/bert_connector.py [0:0]
def __get_input_tensors(self, sentences):
    """Tokenize one or two sentences into BERT input tensors.

    Depends on module-level numpy (np), torch, and the BERT_CLS /
    BERT_SEP / MASK special-token constants.
    """
    if len(sentences) > 2:
        print(sentences)
        raise ValueError("BERT accepts a maximum of two sentences per data point")

    first_tokenized_sentence = self.tokenizer.tokenize(sentences[0])
    first_segment_id = np.zeros(len(first_tokenized_sentence), dtype=int).tolist()

    # add [SEP] token at the end
    first_tokenized_sentence.append(BERT_SEP)
    first_segment_id.append(0)

    if len(sentences) > 1:
        second_tokenized_sentence = self.tokenizer.tokenize(sentences[1])
        # tokens of the second sentence all get segment id 1
        second_segment_id = np.full(len(second_tokenized_sentence), 1, dtype=int).tolist()

        # add [SEP] token at the end
        second_tokenized_sentence.append(BERT_SEP)
        second_segment_id.append(1)

        tokenized_text = first_tokenized_sentence + second_tokenized_sentence
        segments_ids = first_segment_id + second_segment_id
    else:
        tokenized_text = first_tokenized_sentence
        segments_ids = first_segment_id

    # add [CLS] token at the beginning
    tokenized_text.insert(0, BERT_CLS)
    segments_ids.insert(0, 0)

    # look for masked indices
    masked_indices = []
    for i, token in enumerate(tokenized_text):
        if token == MASK:
            masked_indices.append(i)

    indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokens_tensor, segments_tensors, masked_indices, tokenized_text
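
For orientation, here is a minimal standalone sketch of the same single-sentence preprocessing, written directly against the Hugging Face transformers tokenizer rather than this connector's API; the checkpoint name and the cloze sentence are illustrative assumptions, not taken from the file above:

import torch
from transformers import BertTokenizer

# Hypothetical setup: any BERT checkpoint whose vocabulary includes the
# [CLS]/[SEP]/[MASK] special tokens would work here.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

sentence = "The theory of relativity was developed by [MASK] ."
tokens = ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
segments_ids = [0] * len(tokens)  # single sentence -> segment id 0 throughout
masked_indices = [i for i, t in enumerate(tokens) if t == "[MASK]"]

tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
segments_tensors = torch.tensor([segments_ids])

As in the method above, the batch dimension is added by wrapping each id list in an outer list, and masked_indices records where [MASK] sits so the caller can read out predictions at exactly those positions.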