# Excerpt: method `eval_back` from code/src/evaluator.py

    def eval_back(self, data_type, scores, hypothesis):
        """
        Compute attr_1 -> attr_k -> attr_1 perplexity and BLEU scores.

        For every attribute and every (label, new_label) pair, take the
        previously generated transfers stored in
        ``hypothesis[attr][(label, new_label)]``, translate them back to the
        original label, store the results under
        ``hypothesis[attr][(label, new_label, label)]``, export them to disk,
        and score them with Moses BLEU against the original references
        (self-BLEU of the round trip).

        Args:
            data_type (str): dataset split to evaluate, 'valid' or 'test'.
            scores (dict): mutable score dictionary; must contain 'epoch'.
                Populated in place with 'back_bleu_*' entries at
                per-(label, new_label), per-label, per-attribute and overall
                granularity.
            hypothesis (dict): per-attribute dict of generated batches, keyed
                by (label, new_label) tuples; extended in place with
                (label, new_label, label) back-translations.

        NOTE(review): the docstring mentions perplexity, but only BLEU is
        computed in this visible code — confirm whether PPL is handled
        elsewhere or was dropped.
        """
        logger.info("Evaluating back-translation perplexity and BLEU (%s) ..." % data_type)
        assert data_type in ['valid', 'test']
        # evaluation mode: disable dropout / batch-norm updates
        self.encoder.eval()
        self.decoder.eval()
        params = self.params

        # `offset` maps per-attribute label ids into the global label id
        # space used in the attr tensors (labels of all attributes appear
        # concatenated — presumably; verified by the asserts below)
        offset = 0
        # for each attribute
        for attr_id, attr in enumerate(params.attributes):

            # number of labels for this attribute
            n_attr = len(params.attr_values[attr])

            # for each label
            for label_id, label in enumerate(params.attr_values[attr]):

                # try all labels
                for new_label_id, new_label in enumerate(params.attr_values[attr]):

                    # original sentences with this label, and their forward
                    # transfers to `new_label` produced earlier
                    orig_sent = list(self.get_iterator(data_type, (attr_id, label_id)))
                    inter_sent = hypothesis[attr][(label, new_label)]
                    # batches must align one-to-one, with identical batch sizes
                    # (sentence tensors appear to be (slen, bs): dim 1 is
                    # compared against the length vector's dim 0 — TODO confirm)
                    assert len(orig_sent) == len(inter_sent)
                    assert all([x[0].size(1) == x[1].size(0) == y[0].size(1) == y[1].size(0) for x, y in zip(orig_sent, inter_sent)])

                    # container for the back-translations label -> new_label -> label
                    hypothesis[attr][(label, new_label, label)] = []

                    # for all sentences with this label
                    for (sent1, len1, attr1), (sent2, len2, attr2) in zip(orig_sent, inter_sent):

                        # sanity check
                        # batch sizes agree, and the attribute columns carry the
                        # expected global label ids (label_id / new_label_id + offset)
                        assert sent1.size(1) == sent2.size(1) == len1.size(0) == len2.size(0)
                        assert (attr1[:, attr_id] - offset == label_id).sum().item() == attr1.size(0)
                        assert (attr2[:, attr_id] - offset == new_label_id).sum().item() == attr2.size(0)

                        #  cuda batch / encode sentence
                        # NOTE(review): sent1 and attr2 are moved to GPU but never
                        # used afterwards — likely unnecessary transfers; confirm.
                        sent1, attr1 = sent1.cuda(), attr1.cuda()
                        sent2, attr2 = sent2.cuda(), attr2.cuda()
                        encoded = self.encoder(sent2, len2)

                        # update attribute / generate hypothesis with new attributes
                        # decode conditioned on the ORIGINAL attributes (attr1),
                        # i.e. translate the transferred sentence back
                        max_len = int(1.5 * len2.max() + 10)
                        sent3, len3, _ = self.decoder.generate(encoded, attr1, max_len=max_len)

                        # save hypothesis
                        hypothesis[attr][(label, new_label, label)].append((sent3, len3, attr1.clone()))

            # advance into the next attribute's slice of the global label id space
            offset += n_attr

        #
        # export references / hypothesis - compute self BLEU
        #
        # log-line templates: overall / per-attribute / per-label table rows
        PATTERN1 = 'BLEU - {:>5}: {:.3f}'
        PATTERN2 = 'BLEU - {:>5} - {:>10}: {:.3f}'
        PATTERN3 = 'BLEU - {:>5} - {:>10} - {:>10} -> {}'

        # for each attribute
        for attr in params.attributes:
            labels = params.attr_values[attr]

            # for each label
            for label_id, label in enumerate(labels):

                # for each new label
                for new_label_id, new_label in enumerate(labels):

                    # convert hypothesis to text
                    txt = []
                    for sent, lengths, _ in hypothesis[attr][(label, new_label, label)]:
                        txt.extend(convert_to_text(sent, lengths, self.dico, params))

                    # export hypothesis / restore BPE segmentation
                    filename = 'hyp.%s.%s.%s.%s.%s.%i' % (data_type, attr, label, new_label, label, scores['epoch'])
                    hyp_path = os.path.join(params.hyp_path, filename)
                    with open(hyp_path, 'w', encoding='utf-8') as f:
                        f.write('\n'.join(txt) + '\n')
                    restore_segmentation(hyp_path)

                    # new label self BLEU
                    # reference file is assumed to have been exported earlier
                    # by the forward-evaluation pass — TODO confirm
                    filename = 'ref.%s.%s.%s' % (data_type, attr, label)
                    ref_path = os.path.join(params.hyp_path, filename)
                    bleu = self.eval_moses_bleu(ref_path, hyp_path)
                    scores['back_bleu_%s_%s_%s_%s_%s' % (data_type, attr, label, new_label, label)] = bleu

                # label self BLEU
                # mean over all intermediate labels for this source label
                bleus = [scores['back_bleu_%s_%s_%s_%s_%s' % (data_type, attr, label, new_label, label)] for new_label in labels]
                bleu = np.mean(bleus)
                scores['back_bleu_%s_%s_%s' % (data_type, attr, label)] = bleu
                # print the table header once, before the first label's row
                if label_id == 0:
                    logger.info(PATTERN3.format(data_type, attr, '', " | ".join(["%10s" % l for l in labels + ['Total']])))
                logger.info(PATTERN3.format(data_type, attr, label, " | ".join(["%10.2f" % b for b in bleus] + ["%10.2f" % bleu])))

            # attribute self BLEU
            bleu = np.mean([scores['back_bleu_%s_%s_%s' % (data_type, attr, label)] for label in labels])
            scores['back_bleu_%s_%s' % (data_type, attr)] = bleu
            logger.info(PATTERN2.format(data_type, attr, bleu))

        # overall self BLEU
        bleu = np.mean([scores['back_bleu_%s_%s' % (data_type, attr)] for attr in params.attributes])
        scores['back_bleu_%s' % data_type] = bleu
        logger.info(PATTERN1.format(data_type, bleu))