# code/src/evaluator.py
def eval_swap_bleu_clf(self, data_type, scores):
"""
Classify sentences with swapped attributes using pretrained classifiers.
"""
logger.info("Evaluating sentences using pretrained classifiers (%s) ..." % data_type)
assert data_type in ['valid', 'test']
self.encoder.eval()
self.decoder.eval()
if self.cnn_clf is not None:
self.cnn_clf.eval()
if self.lm is not None:
self.lm.eval()
params = self.params
# initialize confusion matrices: one per attribute, indexed by (original label, target label, predicted label)
confusion_cnn = []
confusion_ftt = []
for a in params.attributes:
n_attr = len(params.attr_values[a])
confusion_cnn.append(np.zeros((n_attr, n_attr, n_attr), dtype=np.int32))
confusion_ftt.append(np.zeros((n_attr, n_attr, n_attr), dtype=np.int32))
# initialize hypothesis sentences: hypothesis[attr][(original_label, new_label)] -> list of generated (sent, len, attr) batches
hypothesis = {
a: {(l1, l2): [] for l1 in params.attr_values[a] for l2 in params.attr_values[a]}
for a in params.attributes
}
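# offset of the current attribute's labels in the global label space (labels of all attributes are concatenated)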
offset = 0
# for each attribute
for attr_id, attr in enumerate(params.attributes):
# number of labels for this attribute
n_attr = len(params.attr_values[attr])
# for each label
for label_id, label in enumerate(params.attr_values[attr]):
# for all sentences with this label
for (sent1, len1, attr1) in self.get_iterator(data_type, (attr_id, label_id)):
# check attribute / cuda batch / encode sentence
assert (attr1[:, attr_id] - offset == label_id).sum() == attr1.size(0)
sent1, attr1 = sent1.cuda(), attr1.cuda()
encoded = self.encoder(sent1, len1)
# try all labels
for new_label_id, new_label in enumerate(params.attr_values[attr]):
# update attribute / generate hypothesis with new attributes
attr1[:, attr_id] = new_label_id + offset
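# cap the generation length as a function of the source length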
max_len = int(1.5 * len1.max() + 10)
sent2, len2, _ = self.decoder.generate(encoded, attr1, max_len=max_len)
# save hypothesis
hypothesis[attr][(label, new_label)].append((sent2, len2, attr1.clone()))
# CNN classifier
if self.cnn_clf is not None:
clf_scores = self.cnn_clf(sent2, len2)
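# restrict the classifier scores to this attribute's label range and take the argmax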
predictions = clf_scores[:, offset:offset + n_attr].cpu().numpy().argmax(1)
for p in predictions:
confusion_cnn[attr_id][label_id, new_label_id, p] += 1
# fastText classifier
if self.ftt_clfs is not None:
# length attributes: small hack, since fastText cannot classify length - derive the predicted bucket directly from the generated sentence length
if attr.startswith('length_'):
predictions = (len2 - 2).float().div(params.bucket_size).sub(1).clamp(0, n_attr - 1).long()
else:
samples = convert_to_text(sent2, len2, self.dico, params)
# get top 5 predictions
predictions = self.ftt_clfs[attr].predict(samples, k=5)[0]
##
# handle the -1 / 1 labels of the binary sentiment classifier (map '__0' to '__-1'). TODO: remove in the end
if attr == 'binary_sentiment':
predictions = [[l.replace('__0', '__-1') for l in p] for p in predictions]
##
# strip the 9-character '__label__' prefix and keep the first prediction that is a valid label for this attribute
predictions = [[l[9:] for l in p if l[9:] in params.attr_values[attr]][0] for p in predictions]
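# convert label strings to label indices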
predictions = [params.attr_values[attr].index(p) for p in predictions]
for p in predictions:
confusion_ftt[attr_id][label_id, new_label_id, p] += 1
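# move the offset past this attribute's labels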
offset += n_attr
#
# export references / hypothesis - compute self BLEU
#
PATTERN1 = 'BLEU - {:>5}: {:.3f}'
PATTERN2 = 'BLEU - {:>5} - {:>10}: {:.3f}'
PATTERN3 = 'BLEU - {:>5} - {:>10} - {:>10} -> {}'
# for each attribute
for attr in params.attributes:
labels = params.attr_values[attr]
# for each label
for label_id, label in enumerate(labels):
# for each new label
for new_label_id, new_label in enumerate(labels):
# convert hypothesis to text
txt = []
for sent, lengths, _ in hypothesis[attr][(label, new_label)]:
txt.extend(convert_to_text(sent, lengths, self.dico, params))
# export hypothesis / restore BPE segmentation
filename = 'hyp.%s.%s.%s.%s.%i' % (data_type, attr, label, new_label, scores['epoch'])
hyp_path = os.path.join(params.hyp_path, filename)
with open(hyp_path, 'w', encoding='utf-8') as f:
f.write('\n'.join(txt) + '\n')
restore_segmentation(hyp_path)
# self BLEU for this (original label, new label) pair: hypotheses vs. original references
filename = 'ref.%s.%s.%s' % (data_type, attr, label)
ref_path = os.path.join(params.hyp_path, filename)
bleu = self.eval_moses_bleu(ref_path, hyp_path)
scores['self_bleu_%s_%s_%s_%s' % (data_type, attr, label, new_label)] = bleu
# label self BLEU
bleus = [scores['self_bleu_%s_%s_%s_%s' % (data_type, attr, label, new_label)] for new_label in labels]
bleu = np.mean(bleus)
scores['self_bleu_%s_%s_%s' % (data_type, attr, label)] = bleu
if label_id == 0:
logger.info(PATTERN3.format(data_type, attr, '', " | ".join(["%10s" % l for l in labels + ['Total']])))
logger.info(PATTERN3.format(data_type, attr, label, " | ".join(["%10.2f" % b for b in bleus] + ["%10.2f" % bleu])))
# attribute self BLEU
bleu = np.mean([scores['self_bleu_%s_%s_%s' % (data_type, attr, label)] for label in labels])
scores['self_bleu_%s_%s' % (data_type, attr)] = bleu
logger.info(PATTERN2.format(data_type, attr, bleu))
# overall self BLEU
bleu = np.mean([scores['self_bleu_%s_%s' % (data_type, attr)] for attr in params.attributes])
scores['self_bleu_%s' % data_type] = bleu
logger.info(PATTERN1.format(data_type, bleu))
#
# evaluate language model perplexity
#
if self.lm is not None:
PATTERN1 = 'PPL - {:>5}: {:.3f}'
PATTERN2 = 'PPL - {:>5} - {:>10}: {:.3f}'
PATTERN3 = 'PPL - {:>5} - {:>10} - {:>10} -> {}'
# for each attribute
for attr in params.attributes:
labels = params.attr_values[attr]
# for each label
for label_id, label in enumerate(labels):
# for each new label
for new_label_id, new_label in enumerate(labels):
total_loss = 0
total_words = 0
for sent, lengths, attributes in hypothesis[attr][(label, new_label)]:
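# teacher forcing: feed sent[:-1] to the language model and score the shifted targets sent[1:]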
log_probs = self.lm(sent[:-1], lengths - 1, attributes)
total_loss += F.cross_entropy(
log_probs.view(-1, self.params.n_words),
sent[1:].view(-1),
reduction='sum'  # sum over tokens; normalized by total_words below
)
total_words += (lengths - 1).sum()
# new label perplexity
ppl = np.exp(total_loss.item() / total_words.item())
scores['ppl_%s_%s_%s_%s' % (data_type, attr, label, new_label)] = ppl
# label perplexity
ppls = [scores['ppl_%s_%s_%s_%s' % (data_type, attr, label, new_label)] for new_label in labels]
ppl = np.mean(ppls)
scores['ppl_%s_%s_%s' % (data_type, attr, label)] = ppl
if label_id == 0:
logger.info(PATTERN3.format(data_type, attr, '', " | ".join(["%10s" % l for l in labels + ['Total']])))
logger.info(PATTERN3.format(data_type, attr, label, " | ".join(["%10.2f" % b for b in ppls] + ["%10.2f" % ppl])))
# attribute perplexity
ppl = np.mean([scores['ppl_%s_%s_%s' % (data_type, attr, label)] for label in labels])
scores['ppl_%s_%s' % (data_type, attr)] = ppl
logger.info(PATTERN2.format(data_type, attr, ppl))
# overall perplexity
ppl = np.mean([scores['ppl_%s_%s' % (data_type, attr)] for attr in params.attributes])
scores['ppl_%s' % data_type] = ppl
logger.info(PATTERN1.format(data_type, ppl))
#
# report CNN classifier accuracy for each attribute
#
if self.cnn_clf is not None:
PATTERN1 = 'Accu - {:>5}: {:.3f}'
PATTERN2 = 'Accu - {:>5} - {:>10}: {:.3f}'
PATTERN3 = 'Accu - {:>5} - {:>10} - {:>10} -> {}'
# for each attribute
for attr_id, attr in enumerate(params.attributes):
labels = params.attr_values[attr]
# for each new label
for new_label_id, new_label in enumerate(labels):
# for each original label
for label_id, label in enumerate(labels):
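# a swap counts as correct when the classifier predicts the target (new) label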
correct = confusion_cnn[attr_id][label_id, new_label_id, new_label_id]
total = confusion_cnn[attr_id][label_id, new_label_id].sum()
accuracy = 100 * float(correct) / float(total)
scores['cnn_clf_%s_%s_%s_%s' % (data_type, attr, label, new_label)] = accuracy
# new label accuracy
accus = [scores['cnn_clf_%s_%s_%s_%s' % (data_type, attr, label, new_label)] for label in labels]
accu = np.mean(accus)
scores['cnn_clf_%s_%s_%s' % (data_type, attr, new_label)] = accu
if new_label_id == 0:
logger.info(PATTERN3.format(data_type, attr, '', " | ".join(["%10s" % l for l in labels + ['Total']])))
logger.info(PATTERN3.format(data_type, attr, new_label, " | ".join(["%10.2f" % a for a in accus] + ["%10.2f" % accu])))
# attribute accuracy
accu = np.mean([scores['cnn_clf_%s_%s_%s' % (data_type, attr, new_label)] for new_label in labels])
scores['cnn_clf_%s_%s' % (data_type, attr)] = accu
logger.info(PATTERN2.format(data_type, attr, accu))
# log attribute confusion matrix
logger.info("Confusion matrix for %s:" % attr)
logger.info(confusion_cnn[attr_id])
# overall accuracy
accuracy = np.mean([scores['cnn_clf_%s_%s' % (data_type, a)] for a in params.attributes])
scores['cnn_clf_%s' % data_type] = accuracy
logger.info(PATTERN1.format(data_type, accuracy))
if self.ftt_clfs is not None:
PATTERN1 = 'Accu - {:>5}: {:.3f}'
PATTERN2 = 'Accu - {:>5} - {:>10}: {:.3f}'
PATTERN3 = 'Accu - {:>5} - {:>10} - {:>10} -> {}'
# for each attribute
for attr_id, attr in enumerate(params.attributes):
labels = params.attr_values[attr]
# for each new label
for new_label_id, new_label in enumerate(labels):
# for each original label
for label_id, label in enumerate(labels):
correct = confusion_ftt[attr_id][label_id, new_label_id, new_label_id]
total = confusion_ftt[attr_id][label_id, new_label_id].sum()
accuracy = 100 * float(correct) / float(total)
scores['ftt_clf_%s_%s_%s_%s' % (data_type, attr, label, new_label)] = accuracy
# new label accuracy
accus = [scores['ftt_clf_%s_%s_%s_%s' % (data_type, attr, label, new_label)] for label in labels]
accu = np.mean(accus)
scores['ftt_clf_%s_%s_%s' % (data_type, attr, new_label)] = accu
if new_label_id == 0:
logger.info(PATTERN3.format(data_type, attr, '', " | ".join(["%10s" % l for l in labels + ['Total']])))
logger.info(PATTERN3.format(data_type, attr, new_label, " | ".join(["%10.2f" % a for a in accus] + ["%10.2f" % accu])))
# attribute accuracy
accu = np.mean([scores['ftt_clf_%s_%s_%s' % (data_type, attr, new_label)] for new_label in labels])
scores['ftt_clf_%s_%s' % (data_type, attr)] = accu
logger.info(PATTERN2.format(data_type, attr, accu))
# log attribute confusion matrix
logger.info("Confusion matrix for %s:" % attr)
logger.info(confusion_ftt[attr_id])
# overall accuracy
accuracy = np.mean([scores['ftt_clf_%s_%s' % (data_type, a)] for a in params.attributes])
scores['ftt_clf_%s' % data_type] = accuracy
logger.info(PATTERN1.format(data_type, accuracy))
# return the generated hypotheses so back-translation evaluation can reuse them without decoding again
return hypothesis