in align/models.py [0:0]
def embed(self, words, langcode='en_XX'):
    """Embed each word in *words* and return one feature vector per word.

    Each word is tokenized into subwords, the whole sequence (plus EOS and
    a language tag) is run through the encoder, and each word's subword
    states are mean-pooled back into a single vector.

    Args:
        words: sequence of word strings to embed.
        langcode: language tag appended after EOS (mBART-style, e.g. 'en_XX').

    Returns:
        Tensor of shape (len(words), hidden_dim): one pooled row per word.
    """
    # Tokenize word-by-word, recording each word's [start, end) span in the
    # flat subword sequence so the encoder states can be pooled per word.
    tokens = []
    spans = []
    for word in words:
        pieces = self.tokenizer.tokenize(word)
        spans.append((len(tokens), len(tokens) + len(pieces)))
        tokens.extend(pieces)

    # Batch of exactly one sentence: subwords + EOS + language tag.
    batch = [tokens + ['</s>', langcode]]
    lengths = [len(seq) for seq in batch]
    max_length = max(lengths)
    # Left-pad each sequence to max_length. With a single-sentence batch the
    # pad count is always zero; NOTE(review): nonzero left padding would
    # shift the token positions and invalidate `spans` — revisit before
    # ever batching more than one sentence here.
    word_ids = [
        self.tokenizer.convert_tokens_to_ids(
            ['<pad>'] * (max_length - len(seq)) + seq
        )
        for seq in batch
    ]

    encoder_input = {
        'src_tokens': torch.tensor(word_ids).to(self.device),
        'src_lengths': torch.tensor(lengths).to(self.device),
    }
    encoder_outs = self.model.forward_encoder(encoder_input)
    # NOTE(review): assumes fairseq layout — encoder_out is
    # (seq_len, batch, dim); confirm against the model in use.
    features = encoder_outs[0].encoder_out.float().detach()

    # Mean-pool each word's subword states; slicing the seq axis keeps the
    # batch axis, so each pooled piece is (1, dim) and cat gives (W, dim).
    # NOTE(review): a word tokenizing to zero subwords would produce an
    # empty slice and a NaN mean — assumed not to occur upstream.
    word_features = [features[lb:rb].mean(0) for lb, rb in spans]
    return torch.cat(word_features, dim=0)