# code/src/evaluator.py
def eval_swap_bleu_clf(self, data_type, scores):
"""
Classify sentences with swapped attributes using pretrained classifiers.
"""
logger.info("Evaluating sentences using pretrained classifiers (%s) ..." % data_type)
assert data_type in ['valid', 'test']
self.encoder.eval()
self.decoder.eval()
if self.cnn_clf is not None:
self.cnn_clf.eval()
if self.lm is not None:
self.lm.eval()
params = self.params
# initialize confusion matrices: one per attribute, indexed by (original label, target label, predicted label)
confusion_cnn = []
confusion_ftt = []
for a in params.attributes:
n_attr = len(params.attr_values[a])
confusion_cnn.append(np.zeros((n_attr, n_attr, n_attr), dtype=np.int32))
confusion_ftt.append(np.zeros((n_attr, n_attr, n_attr), dtype=np.int32))
# initialize hypothesis sentences: hypothesis[attr][(original_label, new_label)] -> list of generated (sent, len, attr) batches
hypothesis = {
a: {(l1, l2): [] for l1 in params.attr_values[a] for l2 in params.attr_values[a]}
for a in params.attributes
}
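# offset of the current attribute's labels in the global label space (labels of all attributes are concatenated)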
offset = 0
# for each attribute
for attr_id, attr in enumerate(params.attributes):
# number of labels for this attribute
n_attr = len(params.attr_values[attr])
# for each label
for label_id, label in enumerate(params.attr_values[attr]):
# for all sentences with this label
for (sent1, len1, attr1) in self.get_iterator(data_type, (attr_id, label_id)):
# check attribute / cuda batch / encode sentence
assert (attr1[:, attr_id] - offset == label_id).sum() == attr1.size(0)
sent1, attr1 = sent1.cuda(), attr1.cuda()
encoded = self.encoder(sent1, len1)
# try all labels
for new_label_id, new_label in enumerate(params.attr_values[attr]):
# update attribute / generate hypothesis with new attributes
attr1[:, attr_id] = new_label_id + offset
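# cap the generation length as a function of the source length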
max_len = int(1.5 * len1.max() + 10)
sent2, len2, _ = self.decoder.generate(encoded, attr1, max_len=max_len)
# save hypothesis
hypothesis[attr][(label, new_label)].append((sent2, len2, attr1.clone()))
# CNN classifier
if self.cnn_clf is not None:
clf_scores = self.cnn_clf(sent2, len2)
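# restrict the classifier scores to this attribute's label range and take the argmax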
predictions = clf_scores[:, offset:offset + n_attr].cpu().numpy().argmax(1)
for p in predictions:
confusion_cnn[attr_id][label_id, new_label_id, p] += 1
# fastText classifier
if self.ftt_clfs is not None:
# length attributes: small hack, since fastText cannot classify length - derive the predicted bucket directly from the generated sentence length
if attr.startswith('length_'):
predictions = (len2 - 2).float().div(params.bucket_size).sub(1).clamp(0, n_attr - 1).long()
else:
samples = convert_to_text(sent2, len2, self.dico, params)
# get top 5 predictions
predictions = self.ftt_clfs[attr].predict(samples, k=5)[0]
##
# handle the -1 / 1 labels of the binary sentiment classifier (map '__0' to '__-1'). TODO: remove in the end
if attr == 'binary_sentiment':
predictions = [[l.replace('__0', '__-1') for l in p] for p in predictions]
##
# strip the 9-character '__label__' prefix and keep the first prediction that is a valid label for this attribute
predictions = [[l[9:] for l in p if l[9:] in params.attr_values[attr]][0] for p in predictions]
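# convert label strings to label indices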
predictions = [params.attr_values[attr].index(p) for p in predictions]
for p in predictions:
confusion_ftt[attr_id][label_id, new_label_id, p] += 1
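# move the offset past this attribute's labels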
offset += n_attr
#
# export references / hypothesis - compute self BLEU
#
PATTERN1 = 'BLEU - {:>5}: {:.3f}'
PATTERN2 = 'BLEU - {:>5} - {:>10}: {:.3f}'
PATTERN3 = 'BLEU - {:>5} - {:>10} - {:>10} -> {}'
# for each attribute
for attr in params.attributes:
labels = params.attr_values[attr]
# for each label
for label_id, label in enumerate(labels):
# for each new label
for new_label_id, new_label in enumerate(labels):
# convert hypothesis to text
txt = []
for sent, lengths, _ in hypothesis[attr][(label, new_label)]:
txt.extend(convert_to_text(sent, lengths, self.dico, params))
# export hypothesis / restore BPE segmentation
filename = 'hyp.%s.%s.%s.%s.%i' % (data_type, attr, label, new_label, scores['epoch'])
hyp_path = os.path.join(params.hyp_path, filename)
with open(hyp_path, 'w', encoding='utf-8') as f:
f.write('\n'.join(txt) + '\n')
restore_segmentation(hyp_path)
# self BLEU for this (original label, new label) pair: hypotheses vs. original references
filename = 'ref.%s.%s.%s' % (data_type, attr, label)
ref_path = os.path.join(params.hyp_path, filename)
bleu = self.eval_moses_bleu(ref_path, hyp_path)
scores['self_bleu_%s_%s_%s_%s' % (data_type, attr, label, new_label)] = bleu
# label self BLEU
bleus = [scores['self_bleu_%s_%s_%s_%s' % (data_type, attr, label, new_label)] for new_label in labels]
bleu = np.mean(bleus)
scores['self_bleu_%s_%s_%s' % (data_type, attr, label)] = bleu
if label_id == 0:
logger.info(PATTERN3.format(data_type, attr, '', " | ".join(["%10s" % l for l in labels + ['Total']])))
logger.info(PATTERN3.format(data_type, attr, label, " | ".join(["%10.2f" % b for b in bleus] + ["%10.2f" % bleu])))
# attribute self BLEU
bleu = np.mean([scores['self_bleu_%s_%s_%s' % (data_type, attr, label)] for label in labels])
scores['self_bleu_%s_%s' % (data_type, attr)] = bleu
logger.info(PATTERN2.format(data_type, attr, bleu))
# overall self BLEU
bleu = np.mean([scores['self_bleu_%s_%s' % (data_type, attr)] for attr in params.attributes])
scores['self_bleu_%s' % data_type] = bleu
logger.info(PATTERN1.format(data_type, bleu))
#
# evaluate language model perplexity
#
if self.lm is not None:
PATTERN1 = 'PPL - {:>5}: {:.3f}'
PATTERN2 = 'PPL - {:>5} - {:>10}: {:.3f}'
PATTERN3 = 'PPL - {:>5} - {:>10} - {:>10} -> {}'
# for each attribute
for attr in params.attributes:
labels = params.attr_values[attr]
# for each label
for label_id, label in enumerate(labels):
# for each new label
for new_label_id, new_label in enumerate(labels):
total_loss = 0
total_words = 0
for sent, lengths, attributes in hypothesis[attr][(label, new_label)]:
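# teacher forcing: feed sent[:-1] to the language model and score the shifted targets sent[1:]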
log_probs = self.lm(sent[:-1], lengths - 1, attributes)
total_loss += F.cross_entropy(
log_probs.view(-1, self.params.n_words),
sent[1:].view(-1),
reduction='sum'  # sum over tokens; normalized by total_words below
)
total_words += (lengths - 1).sum()
# new label perplexity
ppl = np.exp(total_loss.item() / total_words.item())
scores['ppl_%s_%s_%s_%s' % (data_type, attr, label, new_label)] = ppl
# label perplexity
ppls = [scores['ppl_%s_%s_%s_%s' % (data_type, attr, label, new_label)] for new_label in labels]
ppl = np.mean(ppls)
scores['ppl_%s_%s_%s' % (data_type, attr, label)] = ppl
if label_id == 0:
logger.info(PATTERN3.format(data_type, attr, '', " | ".join(["%10s" % l for l in labels + ['Total']])))
logger.info(PATTERN3.format(data_type, attr, label, " | ".join(["%10.2f" % b for b in ppls] + ["%10.2f" % ppl])))
# attribute perplexity
ppl = np.mean([scores['ppl_%s_%s_%s' % (data_type, attr, label)] for label in labels])
scores['ppl_%s_%s' % (data_type, attr)] = ppl
logger.info(PATTERN2.format(data_type, attr, ppl))
# overall perplexity
ppl = np.mean([scores['ppl_%s_%s' % (data_type, attr)] for attr in params.attributes])
scores['ppl_%s' % data_type] = ppl
logger.info(PATTERN1.format(data_type, ppl))
#
# report CNN classifier accuracy for each attribute
#
if self.cnn_clf is not None:
PATTERN1 = 'Accu - {:>5}: {:.3f}'
PATTERN2 = 'Accu - {:>5} - {:>10}: {:.3f}'
PATTERN3 = 'Accu - {:>5} - {:>10} - {:>10} -> {}'
# for each attribute
for attr_id, attr in enumerate(params.attributes):
labels = params.attr_values[attr]
# for each new label
for new_label_id, new_label in enumerate(labels):
# for each original label
for label_id, label in enumerate(labels):
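# a swap counts as correct when the classifier predicts the target (new) label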
correct = confusion_cnn[attr_id][label_id, new_label_id, new_label_id]
total = confusion_cnn[attr_id][label_id, new_label_id].sum()
accuracy = 100 * float(correct) / float(total)
scores['cnn_clf_%s_%s_%s_%s' % (data_type, attr, label, new_label)] = accuracy
# new label accuracy
accus = [scores['cnn_clf_%s_%s_%s_%s' % (data_type, attr, label, new_label)] for label in labels]
accu = np.mean(accus)
scores['cnn_clf_%s_%s_%s' % (data_type, attr, new_label)] = accu
if new_label_id == 0:
logger.info(PATTERN3.format(data_type, attr, '', " | ".join(["%10s" % l for l in labels + ['Total']])))
logger.info(PATTERN3.format(data_type, attr, new_label, " | ".join(["%10.2f" % a for a in accus] + ["%10.2f" % accu])))
# attribute accuracy
accu = np.mean([scores['cnn_clf_%s_%s_%s' % (data_type, attr, new_label)] for new_label in labels])
scores['cnn_clf_%s_%s' % (data_type, attr)] = accu
logger.info(PATTERN2.format(data_type, attr, accu))
# log attribute confusion matrix
logger.info("Confusion matrix for %s:" % attr)
logger.info(confusion_cnn[attr_id])
# overall accuracy
accuracy = np.mean([scores['cnn_clf_%s_%s' % (data_type, a)] for a in params.attributes])
scores['cnn_clf_%s' % data_type] = accuracy
logger.info(PATTERN1.format(data_type, accuracy))
if self.ftt_clfs is not None:
PATTERN1 = 'Accu - {:>5}: {:.3f}'
PATTERN2 = 'Accu - {:>5} - {:>10}: {:.3f}'
PATTERN3 = 'Accu - {:>5} - {:>10} - {:>10} -> {}'
# for each attribute
for attr_id, attr in enumerate(params.attributes):
labels = params.attr_values[attr]
# for each new label
for new_label_id, new_label in enumerate(labels):
# for each original label
for label_id, label in enumerate(labels):
correct = confusion_ftt[attr_id][label_id, new_label_id, new_label_id]
total = confusion_ftt[attr_id][label_id, new_label_id].sum()
accuracy = 100 * float(correct) / float(total)
scores['ftt_clf_%s_%s_%s_%s' % (data_type, attr, label, new_label)] = accuracy
# new label accuracy
accus = [scores['ftt_clf_%s_%s_%s_%s' % (data_type, attr, label, new_label)] for label in labels]
accu = np.mean(accus)
scores['ftt_clf_%s_%s_%s' % (data_type, attr, new_label)] = accu
if new_label_id == 0:
logger.info(PATTERN3.format(data_type, attr, '', " | ".join(["%10s" % l for l in labels + ['Total']])))
logger.info(PATTERN3.format(data_type, attr, new_label, " | ".join(["%10.2f" % a for a in accus] + ["%10.2f" % accu])))
# attribute accuracy
accu = np.mean([scores['ftt_clf_%s_%s_%s' % (data_type, attr, new_label)] for new_label in labels])
scores['ftt_clf_%s_%s' % (data_type, attr)] = accu
logger.info(PATTERN2.format(data_type, attr, accu))
# log attribute confusion matrix
logger.info("Confusion matrix for %s:" % attr)
logger.info(confusion_ftt[attr_id])
# overall accuracy
accuracy = np.mean([scores['ftt_clf_%s_%s' % (data_type, a)] for a in params.attributes])
scores['ftt_clf_%s' % data_type] = accuracy
logger.info(PATTERN1.format(data_type, accuracy))
# return the generated hypotheses so back-translation evaluation can reuse them without decoding again
return hypothesis