in tseval/embeddings.py [0:0]
import os

import numpy as np
import torch

# FASTTEXT_EMBEDDINGS_PATH is assumed to be defined elsewhere in the package (a path constant
# pointing to the downloaded fastText .vec file).


def load_fasttext_embeddings(vocab_size=None):
    if not os.path.exists(FASTTEXT_EMBEDDINGS_PATH):
        from tseval.utils.prepare import prepare_resource
        prepare_resource('fasttext_embeddings')
    with open(FASTTEXT_EMBEDDINGS_PATH, 'r') as f:
        # The first line of a fastText .vec file gives the number of vectors and their dimension
        total_embeddings, embedding_dim = [int(val) for val in f.readline().split()]
        if vocab_size is None:
            vocab_size = total_embeddings
        word_embeddings = torch.zeros(vocab_size, embedding_dim)
        # Row 0 is reserved for the unknown token and keeps its zero vector
        # TODO: Is having a vector of zeros the best embedding for unknown words?
        embedded_words = ['<unk>']
        for i, line in enumerate(f):
            i = i + 1  # Shift i by one to account for <unk> at index 0
            if i >= vocab_size:
                break
            word, *embedding = line.strip(' \n').split(' ')
            embedded_words.append(word)
            word_embeddings[i, :] = torch.FloatTensor(np.array(embedding, dtype=float))
    # Map each word to its row index for fast embedding retrieval
    word2index = {word: i for i, word in enumerate(embedded_words)}
    return word_embeddings, word2index
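

# A minimal usage sketch (not part of the module), assuming FASTTEXT_EMBEDDINGS_PATH points to a
# downloaded fastText .vec file. The lookup helper below is hypothetical; it only illustrates how
# word2index maps a word to its row in the embedding matrix.
word_embeddings, word2index = load_fasttext_embeddings(vocab_size=50000)


def get_word_embedding(word):
    # Fall back to the <unk> row (index 0, all zeros) for out-of-vocabulary words
    return word_embeddings[word2index.get(word, word2index['<unk>'])]


print(get_word_embedding('simplification').shape)  # e.g. torch.Size([300]) for 300-d fastText vectors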