in ludwig/utils/data_utils.py [0:0]
def load_pretrained_embeddings(embeddings_path, vocab):
    """Build an embedding matrix aligned with ``vocab`` from a GloVe file.

    Words found in the pretrained file get their pretrained vector; missing
    words get the average of all in-vocabulary pretrained vectors plus small
    uniform noise in [-0.01, 0.01) so they are not all identical.

    :param embeddings_path: path to a GloVe-format embeddings file,
        as accepted by ``load_glove``.
    :param vocab: iterable of words; row ``i`` of the result corresponds
        to the ``i``-th word of ``vocab``.
    :return: numpy array of shape ``(len(vocab), embeddings_size)``.
    """
    embeddings = load_glove(embeddings_path)
    # infer the embedding size from an arbitrary entry of the loaded table
    embeddings_size = len(next(iter(embeddings.values())))

    # average of all pretrained vectors present in vocab, used as the base
    # initialization for words missing from the pretrained file
    avg_embedding = np.zeros(embeddings_size)
    count = 0
    for word in vocab:
        # single dict lookup instead of `word in embeddings` + `embeddings[word]`
        vector = embeddings.get(word)
        if vector is not None:
            avg_embedding += vector
            count += 1
    if count > 0:
        avg_embedding /= count

    # assemble one row per vocab word, in vocab order
    embeddings_vectors = []
    for word in vocab:
        vector = embeddings.get(word)
        if vector is None:
            # perturb the average so distinct OOV words get distinct vectors
            vector = avg_embedding + np.random.uniform(
                -0.01, 0.01, embeddings_size
            )
        embeddings_vectors.append(vector)

    if embeddings_vectors:
        embeddings_matrix = np.stack(embeddings_vectors)
    else:
        # empty vocab: np.stack would raise on an empty sequence,
        # so return an empty matrix with the right second dimension
        embeddings_matrix = np.empty((0, embeddings_size))

    # let's help the garbage collector free the (potentially large) dict
    embeddings = None
    return embeddings_matrix