in src/utils.py
def load_bin_embeddings(params, source, full_vocab):
"""
Reload pretrained embeddings from a fastText binary file.
"""
# reload fastText binary file
lang = params.src_lang if source else params.tgt_lang
model = load_fasttext_model(params.src_emb if source else params.tgt_emb)
words = model.get_labels()
assert model.get_dimension() == params.emb_dim
logger.info("Loaded binary model. Generating embeddings ...")
embeddings = torch.from_numpy(np.concatenate([model.get_word_vector(w)[None] for w in words], 0))
logger.info("Generated embeddings for %i words." % len(words))
assert embeddings.size() == (len(words), params.emb_dim)
# select a subset of word embeddings (to deal with casing)
if not full_vocab:
word2id, indexes = select_subset(words, params.max_vocab)
embeddings = embeddings[indexes]
else:
word2id = {w: i for i, w in enumerate(words)}
id2word = {i: w for w, i in word2id.items()}
dico = Dictionary(id2word, word2id, lang)
assert embeddings.size() == (len(dico), params.emb_dim)
return dico, embeddings
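
A minimal usage sketch follows. The params values (embedding paths, emb_dim, max_vocab) are hypothetical, and the helpers referenced above (load_fasttext_model, select_subset, Dictionary, logger) are assumed to be defined elsewhere in the same module; this only illustrates the expected call shape, not the repository's actual driver code.

    # Hypothetical parameters; in the repository these come from the argument parser.
    from argparse import Namespace

    params = Namespace(
        src_lang="en", tgt_lang="es",
        src_emb="data/wiki.en.bin",   # hypothetical path to a fastText .bin file
        tgt_emb="data/wiki.es.bin",   # hypothetical path
        emb_dim=300,                  # must match the binary model's dimension
        max_vocab=200000,             # cap applied when full_vocab is False
    )

    # Load the source-side dictionary and embedding matrix.
    src_dico, src_emb = load_bin_embeddings(params, source=True, full_vocab=False)
    print(len(src_dico), src_emb.size())  # vocabulary size and (n_words, emb_dim)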