def load_fasttext_embeddings()

in tseval/embeddings.py


import os

import numpy as np
import torch

# FASTTEXT_EMBEDDINGS_PATH is a module-level constant pointing to the fastText
# vectors file (.vec text format: a header line, then one word and its vector per line).


def load_fasttext_embeddings(vocab_size=None):
    if not os.path.exists(FASTTEXT_EMBEDDINGS_PATH):
        from tseval.utils.prepare import prepare_resource
        prepare_resource('fasttext_embeddings')
    with open(FASTTEXT_EMBEDDINGS_PATH, 'r') as f:
        # The first line contains the number of vectors and their dimensionality
        total_embeddings, embedding_dim = [int(val) for val in f.readline().split()]
        if vocab_size is None:
            vocab_size = total_embeddings
        word_embeddings = torch.zeros(vocab_size, embedding_dim)
        # Index 0 is reserved for unknown words and left as a zero vector
        # TODO: Is having a vector of zeros the best embedding for unknown words?
        embedded_words = ['<unk>']
        for i, line in enumerate(f):
            i = i + 1  # Shift i to take <unk> into account
            if i >= vocab_size:
                break
            word, *embedding = line.strip(' \n').split(' ')
            embedded_words.append(word)
            word_embeddings[i, :] = torch.FloatTensor(np.array(embedding, dtype=float))
    # For fast embedding retrieval
    word2index = {word: i for i, word in enumerate(embedded_words)}
    return word_embeddings, word2index
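
A minimal usage sketch, assuming the function behaves as defined above. The vocab_size value, the embed() helper, and the example word are illustrative, not part of the module; the fallback to index 0 for out-of-vocabulary words follows from the code, since index 0 is '<unk>' with a zero vector.

from tseval.embeddings import load_fasttext_embeddings

# Load only the 50,000 most frequent vectors to limit memory usage (arbitrary choice)
word_embeddings, word2index = load_fasttext_embeddings(vocab_size=50000)

def embed(word):
    # Fall back to the <unk> row (index 0, all zeros) for out-of-vocabulary words
    index = word2index.get(word, word2index['<unk>'])
    return word_embeddings[index]

vector = embed('simplification')
print(vector.shape)  # e.g. torch.Size([300]) for standard 300-dimensional fastText vectors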