in drqa/reader/model.py [0:0]
def load_embeddings(self, words, embedding_file):
    """Load pretrained embeddings for a given list of words, if they exist.

    Args:
        words: iterable of tokens. Only those that are indexed in the
            dictionary are kept.
        embedding_file: path to text file of embeddings, space separated.
    """
    words = {w for w in words if w in self.word_dict}
    logger.info('Loading pre-trained embeddings for %d words from %s' %
                (len(words), embedding_file))
    embedding = self.network.embedding.weight.data

    # Normalization can map several file entries onto the same word; sum
    # their vectors here and average them once the file has been read.
    vec_counts = {}
    with open(embedding_file) as f:
        # Skip the header line if it has the word2vec-style '<count> <dim>' form.
        line = f.readline().rstrip().split(' ')
        if len(line) != 2:
            f.seek(0)

        for line in f:
            parsed = line.rstrip().split(' ')
            assert len(parsed) == embedding.size(1) + 1
            w = self.word_dict.normalize(parsed[0])
            if w in words:
                vec = torch.Tensor([float(i) for i in parsed[1:]])
                if w not in vec_counts:
                    vec_counts[w] = 1
                    embedding[self.word_dict[w]].copy_(vec)
                else:
                    logger.warning(
                        'Duplicate embedding found for %s' % w
                    )
                    vec_counts[w] += 1
                    embedding[self.word_dict[w]].add_(vec)
    # Average the accumulated vectors for words that appeared more than once.
    for w, c in vec_counts.items():
        embedding[self.word_dict[w]].div_(c)

    logger.info('Loaded %d embeddings (%.2f%%)' %
                (len(vec_counts), 100 * len(vec_counts) / len(words)))
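
A minimal call-site sketch, assuming a constructed DocReader instance `model` from this module whose `word_dict` exposes a `tokens()` accessor over its indexed words; the GloVe path is illustrative:

    # Hypothetical usage: overlay pretrained vectors onto the model's
    # embedding table for every dictionary word present in the file.
    model.load_embeddings(model.word_dict.tokens(),
                          'data/embeddings/glove.840B.300d.txt')

Words absent from the file keep their existing (randomly initialized) vectors, so the call is a best-effort overlay rather than a full re-initialization.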