in code/src/evaluator.py [0:0]
def eval_back(self, data_type, scores, hypothesis):
    """
    Compute attr_1 -> attr_k -> attr_1 back-translation BLEU scores.

    For every attribute and every ordered label pair (label, new_label), take the
    previously generated intermediate sentences stored in
    ``hypothesis[attr][(label, new_label)]``, translate them back to the original
    label with the current encoder/decoder, store the result under the key
    ``(label, new_label, label)``, then export the hypotheses to disk and fill
    ``scores`` with BLEU against the original references at four levels of
    aggregation: per (label, new_label) pair, per label, per attribute, and overall.

    NOTE(review): despite the name and the log message, no perplexity is computed
    in this method — only BLEU.

    Parameters
    ----------
    data_type : str
        Dataset split to evaluate on; must be 'valid' or 'test'.
    scores : dict
        Mutable score dictionary; must already contain 'epoch'. Updated in place
        with 'back_bleu_*' entries.
    hypothesis : dict
        ``hypothesis[attr][(label, new_label)]`` holds intermediate translations,
        batch-aligned with the data iterator. Extended in place with the
        ``(label, new_label, label)`` back-translations.
    """
    logger.info("Evaluating back-translation perplexity and BLEU (%s) ...", data_type)
    assert data_type in ['valid', 'test']
    self.encoder.eval()
    self.decoder.eval()
    params = self.params
    # labels of all attributes share one global index space in the attr tensors;
    # `offset` maps a global label index back to this attribute's local index
    offset = 0
    # for each attribute
    for attr_id, attr in enumerate(params.attributes):
        # number of labels for this attribute
        n_attr = len(params.attr_values[attr])
        # for each label
        for label_id, label in enumerate(params.attr_values[attr]):
            # the original batches do not depend on the target label, so
            # materialize the iterator once per (attribute, label) instead of
            # once per new_label. The alignment asserts below already require
            # the iterator to be deterministic across passes (it must match
            # `inter_sent` produced from an earlier pass).
            orig_sent = list(self.get_iterator(data_type, (attr_id, label_id)))
            # try all labels
            for new_label_id, new_label in enumerate(params.attr_values[attr]):
                inter_sent = hypothesis[attr][(label, new_label)]
                # batch-level alignment between originals and intermediates
                assert len(orig_sent) == len(inter_sent)
                assert all([x[0].size(1) == x[1].size(0) == y[0].size(1) == y[1].size(0) for x, y in zip(orig_sent, inter_sent)])
                hypothesis[attr][(label, new_label, label)] = []
                # for all sentences with this label
                for (sent1, len1, attr1), (sent2, len2, attr2) in zip(orig_sent, inter_sent):
                    # sanity check
                    assert sent1.size(1) == sent2.size(1) == len1.size(0) == len2.size(0)
                    assert (attr1[:, attr_id] - offset == label_id).sum().item() == attr1.size(0)
                    assert (attr2[:, attr_id] - offset == new_label_id).sum().item() == attr2.size(0)
                    # cuda batch / encode intermediate sentence.
                    # sent1 and attr2 are only needed by the CPU-side sanity
                    # checks above, so they are not moved to the GPU
                    sent2 = sent2.cuda()
                    attr1 = attr1.cuda()
                    encoded = self.encoder(sent2, len2)
                    # generate hypothesis conditioned on the original attributes
                    max_len = int(1.5 * len2.max() + 10)
                    sent3, len3, _ = self.decoder.generate(encoded, attr1, max_len=max_len)
                    # save hypothesis
                    hypothesis[attr][(label, new_label, label)].append((sent3, len3, attr1.clone()))
        offset += n_attr
    #
    # export references / hypothesis - compute self BLEU
    #
    PATTERN1 = 'BLEU - {:>5}: {:.3f}'
    PATTERN2 = 'BLEU - {:>5} - {:>10}: {:.3f}'
    PATTERN3 = 'BLEU - {:>5} - {:>10} - {:>10} -> {}'
    # for each attribute
    for attr in params.attributes:
        labels = params.attr_values[attr]
        # for each label
        for label_id, label in enumerate(labels):
            # for each new label
            for new_label_id, new_label in enumerate(labels):
                # convert hypothesis to text
                txt = []
                for sent, lengths, _ in hypothesis[attr][(label, new_label, label)]:
                    txt.extend(convert_to_text(sent, lengths, self.dico, params))
                # export hypothesis / restore BPE segmentation
                filename = 'hyp.%s.%s.%s.%s.%s.%i' % (data_type, attr, label, new_label, label, scores['epoch'])
                hyp_path = os.path.join(params.hyp_path, filename)
                with open(hyp_path, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(txt) + '\n')
                restore_segmentation(hyp_path)
                # BLEU of the round-trip hypothesis against the original reference
                filename = 'ref.%s.%s.%s' % (data_type, attr, label)
                ref_path = os.path.join(params.hyp_path, filename)
                bleu = self.eval_moses_bleu(ref_path, hyp_path)
                scores['back_bleu_%s_%s_%s_%s_%s' % (data_type, attr, label, new_label, label)] = bleu
            # label self BLEU: average over all intermediate labels
            bleus = [scores['back_bleu_%s_%s_%s_%s_%s' % (data_type, attr, label, new_label, label)] for new_label in labels]
            bleu = np.mean(bleus)
            scores['back_bleu_%s_%s_%s' % (data_type, attr, label)] = bleu
            # print the table header once per attribute, before the first row
            if label_id == 0:
                logger.info(PATTERN3.format(data_type, attr, '', " | ".join(["%10s" % l for l in labels + ['Total']])))
            logger.info(PATTERN3.format(data_type, attr, label, " | ".join(["%10.2f" % b for b in bleus] + ["%10.2f" % bleu])))
        # attribute self BLEU: average over all labels of this attribute
        bleu = np.mean([scores['back_bleu_%s_%s_%s' % (data_type, attr, label)] for label in labels])
        scores['back_bleu_%s_%s' % (data_type, attr)] = bleu
        logger.info(PATTERN2.format(data_type, attr, bleu))
    # overall self BLEU: average over all attributes
    bleu = np.mean([scores['back_bleu_%s_%s' % (data_type, attr)] for attr in params.attributes])
    scores['back_bleu_%s' % data_type] = bleu
    logger.info(PATTERN1.format(data_type, bleu))