in ludwig/utils/data_utils.py [0:0]
def load_glove(file_path):
    """Load word embeddings from a GloVe-format text file.

    Each non-blank line is expected to contain a word followed by its
    whitespace-separated vector components. The number of components found
    on the first non-blank line is taken as the embedding size. Any later
    line with a different number of fields, or with a component that cannot
    be parsed as a float, is logged as malformed and skipped. Blank lines
    are ignored, and an empty file yields an empty dictionary.

    :param file_path: path of the UTF-8 encoded GloVe file to read.
    :return: dictionary mapping each word to a 1-D ``np.ndarray`` of floats.
        If a word occurs more than once, the last occurrence wins.
    """
    logger.info(' Loading Glove format file {}'.format(file_path))
    embeddings = {}
    # Unknown until the first non-blank line has been seen.
    embedding_size = None

    with open(file_path, 'r', encoding='utf-8') as f:
        # line_number is 0-based, matching what the warning has always
        # reported.
        for line_number, line in enumerate(f):
            split = line.split()
            if not split:
                # Blank or whitespace-only line: nothing to parse. Checking
                # the tokens (not the raw line, which still carries its
                # newline) is what makes this skip effective.
                continue

            if embedding_size is None:
                # Assume the first non-blank line is correct and use it to
                # fix the expected number of vector components.
                embedding_size = len(split) - 1

            try:
                if len(split) != embedding_size + 1:
                    raise ValueError
                # split[1:] rather than split[-embedding_size:], because a
                # size of 0 would turn the latter into split[-0:], i.e. the
                # whole line including the word.
                embedding = np.array([float(val) for val in split[1:]])
            except ValueError:
                logger.warning(
                    'Line {} in the GloVe file {} is malformed, '
                    'skipping it'.format(
                        line_number, file_path
                    )
                )
                continue

            embeddings[split[0]] = embedding

    logger.info(' {0} embeddings loaded'.format(len(embeddings)))
    return embeddings