# src/evaluation/word_translation.py
def load_dictionary(path, word2id1, word2id2):
    """
    Load a bilingual dictionary file and return it as a torch LongTensor of
    size (n, 2), where n is the number of retained word pairs, sorted by
    source word frequency (ascending source word ID).

    Each line of the file must contain exactly one lowercase source word and
    one lowercase target word separated by whitespace. Lines that do not
    split into exactly two tokens are logged and skipped; pairs with a word
    missing from either vocabulary are counted and skipped.

    Args:
        path: path to the dictionary file (UTF-8, one word pair per line).
        word2id1: source-language word -> ID mapping. IDs are assumed to be
            frequency ranks, so sorting by ID sorts by frequency.
        word2id2: target-language word -> ID mapping.

    Returns:
        torch.LongTensor of shape (n, 2) with (source ID, target ID) rows.
    """
    assert os.path.isfile(path)
    pairs = []
    not_found = 0
    not_found1 = 0
    not_found2 = 0
    with io.open(path, 'r', encoding='utf-8') as f:
        for index, line in enumerate(f):
            assert line == line.lower()
            parts = line.rstrip().split()
            # BUG FIX: the original tested `len(parts) < 2` and then unpacked
            # `word1, word2 = parts`, which raised an unhandled ValueError on
            # lines with MORE than two tokens. Treat those lines as
            # unparseable and skip them, like too-short lines.
            if len(parts) != 2:
                logger.warning("Could not parse line %s (%i)", line, index)
                continue
            word1, word2 = parts
            if word1 in word2id1 and word2 in word2id2:
                pairs.append((word1, word2))
            else:
                not_found += 1
                not_found1 += int(word1 not in word2id1)
                not_found2 += int(word2 not in word2id2)
    # Lazy %-style logger arguments (no eager string interpolation).
    logger.info("Found %i pairs of words in the dictionary (%i unique). "
                "%i other pairs contained at least one unknown word "
                "(%i in lang1, %i in lang2)",
                len(pairs), len({x for x, _ in pairs}),
                not_found, not_found1, not_found2)
    # sort the dictionary by source word frequencies (lower ID = more frequent)
    pairs = sorted(pairs, key=lambda x: word2id1[x[0]])
    dico = torch.LongTensor(len(pairs), 2)
    for i, (word1, word2) in enumerate(pairs):
        dico[i, 0] = word2id1[word1]
        dico[i, 1] = word2id2[word2]
    return dico