in pytorch_alternatives/custom_pytorch_nlp/util/preprocessing.py [0:0]
import gzip
import os
import subprocess

import numpy as np


def get_word_embeddings(t, folder, lang="en"):
    """Download pre-trained fastText word vectors and build an embedding matrix for tokenizer `t`.

    Returns a numpy array of shape (len(t.vocab), embedding_dim). Tokens in `t`'s
    vocabulary that have no pre-trained vector are mapped to all-zero rows.
    """
    vecs_url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{lang}.300.vec.gz"
    vecs_gz_filename = vecs_url.rpartition("/")[2]
    os.makedirs(folder, exist_ok=True)
    vecs_gz_filepath = os.path.join(folder, vecs_gz_filename)
    tokenizer_vocab_size = len(t.vocab)
    # `wait_for_file_stable` is assumed to be a helper defined elsewhere in this module
    # that returns True once the file already exists and is no longer being written:
    if wait_for_file_stable(vecs_gz_filepath):
        print("Using existing embeddings file")
    else:
        print("Downloading word vectors...")
        # wget -N skips the download if an up-to-date copy exists; -P saves into `folder`:
        subprocess.run(["wget", "-N", "-P", folder, vecs_url], check=True)
print("Loading into memory...")
embeddings_index = dict()
with gzip.open(vecs_gz_filepath, "rt") as zipf:
firstline = zipf.readline()
emb_vocab_size, emb_d = firstline.split(" ")
emb_vocab_size = int(emb_vocab_size)
emb_d = int(emb_d)
for line in zipf:
values = line.split()
word = values[0]
# Only load subset of the embeddings recognised by the tokenizer:
if word in t.vocab.stoi:
coefs = np.asarray(values[1:], dtype="float32")
embeddings_index[word] = coefs
print("Loaded {} of {} word vectors for tokenizer vocabulary length {}".format(
len(embeddings_index),
emb_vocab_size,
tokenizer_vocab_size,
))
    # Create the (vocab_size x embedding_dim) weight matrix; words without a
    # pre-trained vector keep their all-zero initialisation:
    embedding_matrix = np.zeros((tokenizer_vocab_size, emb_d))
    for word, i in t.vocab.stoi.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
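

# Usage sketch (an assumption for illustration, not part of the original module): `t` is
# expected to be a torchtext-style tokenizer whose vocabulary exposes `t.vocab` and
# `t.vocab.stoi`, and the returned matrix can seed a PyTorch embedding layer, e.g.:
#
#     import torch
#     from torch import nn
#
#     embedding_matrix = get_word_embeddings(t, "embeddings/", lang="en")
#     embedding_layer = nn.Embedding.from_pretrained(
#         torch.as_tensor(embedding_matrix, dtype=torch.float32),
#         freeze=False,  # allow the pre-trained vectors to be fine-tuned during training
#     )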