in src/evaluation/evaluator.py [0:0]
def sent_translation(self, to_log):
    """
    Sentence translation evaluation, run in both retrieval directions.
    Only available on Europarl, for en - {de, es, fr, it} language pairs.

    The Europarl data is loaded on the first call and cached on the
    instance as `europarl_data`. If that value is falsy, the method returns
    without modifying `to_log`. Otherwise, for each retrieval method
    ('nn', then 'csls_knn_10'), every (name, value) pair returned by
    `get_sent_translation_accuracy` is written into `to_log` under
    'tgt_to_src_<name>-<method>' and then 'src_to_tgt_<name>-<method>'.
    """
    src_lang = self.src_dico.lang
    tgt_lang = self.tgt_dico.lang

    # evaluation sizes
    n_keys, n_queries, n_idf = 200000, 2000, 300000

    # load the Europarl data once and keep it on the instance
    if not hasattr(self, 'europarl_data'):
        self.europarl_data = load_europarl_data(
            src_lang, tgt_lang, n_max=n_keys + 2 * n_idf
        )
    corpus = self.europarl_data

    # nothing to evaluate if there is no Europarl data for this pair
    if not corpus:
        return

    # source embeddings go through the mapping; target ones are used as-is
    mapped_src_emb = self.mapping(self.src_emb.weight).data
    raw_tgt_emb = self.tgt_emb.weight.data

    # idf weights
    idf = get_idf(corpus, src_lang, tgt_lang, n_idf=n_idf)

    src_side = (self.src_dico.lang, self.src_dico.word2id, mapped_src_emb)
    tgt_side = (self.tgt_dico.lang, self.tgt_dico.word2id, raw_tgt_emb)

    # (log key format, first language triple, second language triple);
    # the order here fixes the order in which keys are added to `to_log`
    directions = (
        # source <- target sentence translation
        ('tgt_to_src_%s-%s', src_side, tgt_side),
        # target <- source sentence translation
        ('src_to_tgt_%s-%s', tgt_side, src_side),
    )

    for method in ('nn', 'csls_knn_10'):
        for key_fmt, (lang_a, ids_a, emb_a), (lang_b, ids_b, emb_b) in directions:
            scores = get_sent_translation_accuracy(
                corpus,
                lang_a, ids_a, emb_a,
                lang_b, ids_b, emb_b,
                n_keys=n_keys, n_queries=n_queries,
                method=method, idf=idf
            )
            entries = [(key_fmt % (name, method), value) for name, value in scores]
            to_log.update(entries)