def evaluate_mt()

in XLM/src/evaluation/evaluator.py [0:0]


    def evaluate_mt(self, scores, data_set, lang1, lang2, eval_bleu):
        """
        Evaluate perplexity and next word prediction accuracy, and optionally BLEU,
        back-BLEU, and other statistics on the generated hypotheses.
        """
        params = self.params
        assert data_set in ['valid', 'test']
        assert lang1 in params.langs
        assert lang2 in params.langs

        self.encoder.eval()
        self.decoder.eval()
        encoder = self.encoder.module if params.multi_gpu else self.encoder
        decoder = self.decoder.module if params.multi_gpu else self.decoder

        lang1_id = params.lang2id[lang1]
        lang2_id = params.lang2id[lang2]

        n_words = 0
        xe_loss = 0
        n_valid = 0

        # only save states / evaluate usage on the validation set
        eval_memory = params.use_memory and data_set == 'valid' and self.params.is_master
        HashingMemory.EVAL_MEMORY = eval_memory
        if eval_memory:
            all_mem_att = {k: [] for k, _ in self.memory_list}

        # store hypothesis to compute BLEU score
        if eval_bleu:
            hypothesis = []
            back_hypothesis = []

        for batch in self.get_iterator(data_set, lang1, lang2):

            # generate batch
            (x1, len1), (x2, len2) = batch
            langs1 = x1.clone().fill_(lang1_id)
            langs2 = x2.clone().fill_(lang2_id)

            # target words to predict
            alen = torch.arange(len2.max(), dtype=torch.long, device=len2.device)
            pred_mask = alen[:, None] < len2[None] - 1  # do not predict anything given the last target word
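            # y collects the next-token targets: x2 shifted up by one position, flattened over the masked positions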
            y = x2[1:].masked_select(pred_mask[:-1])
            assert len(y) == (len2 - 1).sum().item()

            # cuda
            x1, len1, langs1, x2, len2, langs2, y = to_cuda(x1, len1, langs1, x2, len2, langs2, y)

            # encode source sentence
            enc1 = encoder('fwd', x=x1, lengths=len1, langs=langs1, causal=False)
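            # (slen, bs, dim) -> (bs, slen, dim): the layout the decoder expects for its source attention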
            enc1 = enc1.transpose(0, 1)
            enc1 = enc1.half() if params.fp16 else enc1

            # decode target sentence
            dec2 = decoder('fwd', x=x2, lengths=len2, langs=langs2, causal=True, src_enc=enc1, src_len=len1)

            # loss
            word_scores, loss = decoder('predict', tensor=dec2, pred_mask=pred_mask, y=y, get_scores=True)

            # update stats
            n_words += y.size(0)
            xe_loss += loss.item() * len(y)
            n_valid += (word_scores.max(1)[1] == y).sum().item()
            if eval_memory:
                for k, v in self.memory_list:
                    all_mem_att[k].append((v.last_indices, v.last_scores))

            # generate translation - translate / convert to text
            if eval_bleu:
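                # cap the generated length relative to the source: at most 1.5x the longest source sentence plus 10 tokens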
                max_len = int(1.5 * len1.max().item() + 10)
                if params.beam_size == 1:
                    generated, lengths = decoder.generate(enc1, len1, lang2_id, max_len=max_len)
                else:
                    generated, lengths = decoder.generate_beam(
                        enc1, len1, lang2_id, beam_size=params.beam_size,
                        length_penalty=params.length_penalty,
                        early_stopping=params.early_stopping,
                        max_len=max_len
                    )
                hypothesis.extend(convert_to_text(generated, lengths, self.dico, params))

                # Back-bleu: encode generated sentence
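                # the lang2 hypotheses are re-encoded and translated back into lang1, so a back-BLEU
                # against the original inputs can be computed below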
                langs2_generated = generated.clone().fill_(lang2_id)
                enc2 = encoder('fwd', x=generated, lengths=lengths, langs=langs2_generated, causal=False)
                enc2 = enc2.transpose(0, 1)
                enc2 = enc2.half() if params.fp16 else enc2

                if params.beam_size == 1:
                    back_generated, back_lengths = decoder.generate(enc2, lengths, lang1_id, max_len=max_len)
                else:
                    back_generated, back_lengths = decoder.generate_beam(
                        enc2, lengths, lang1_id, beam_size=params.beam_size,
                        length_penalty=params.length_penalty,
                        early_stopping=params.early_stopping,
                        max_len=max_len
                    )
                back_hypothesis.extend(convert_to_text(back_generated, back_lengths, self.dico, params))

        # compute perplexity and prediction accuracy
        scores['%s_%s-%s_mt_ppl' % (data_set, lang1, lang2)] = np.exp(xe_loss / n_words)
        scores['%s_%s-%s_mt_acc' % (data_set, lang1, lang2)] = 100. * n_valid / n_words

        # compute memory usage
        if eval_memory:
            for mem_name, mem_att in all_mem_att.items():
                eval_memory_usage(scores, '%s_%s-%s_%s' % (data_set, lang1, lang2, mem_name), mem_att, params.mem_size)

        # compute BLEU
        if eval_bleu:

            # hypothesis / reference paths
            hyp_name = 'hyp{0}.{1}-{2}.{3}.txt'.format(scores['epoch'], lang1, lang2, data_set)
            hyp_path = os.path.join(params.hyp_path, hyp_name)
            back_hyp_name = 'hyp{0}.{1}-{2}-{3}.{4}.txt'.format(scores['epoch'], lang1, lang2, lang1, data_set)
            back_hyp_path = os.path.join(params.hyp_path, back_hyp_name)
            ref_path = params.ref_paths[(lang1, lang2, data_set)]
            input_path = params.ref_paths[(lang2, lang1, data_set)]

            # export sentences to hypothesis file / restore BPE segmentation
            with open(hyp_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(hypothesis) + '\n')
            restore_segmentation(hyp_path)

            with open(back_hyp_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(back_hypothesis) + '\n')
            restore_segmentation(back_hyp_path)

            # evaluate BLEU score
            bleu = eval_moses_bleu(ref_path, hyp_path)
            logger.info("BLEU %s %s : %f" % (hyp_path, ref_path, bleu))
            scores['%s_%s-%s_mt_bleu' % (data_set, lang1, lang2)] = bleu

            # evaluate Back-BLEU score
            back_bleu = eval_moses_bleu(input_path, back_hyp_path)
            logger.info("Back-BLEU %s %s : %f" % (back_hyp_path, input_path, back_bleu))
            scores['%s_%s-%s-%s_mt_back_bleu' % (data_set, lang1, lang2, lang1)] = back_bleu

            # calculate ratio of generation length to training distribution length (1 is ideal)
            hyp_mean_num_words = mean_num_words(hyp_path)
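            # the tokenized training target file is expected in the parent directory of params.data_path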
            train_tgt_path = f"{params.data_path.rstrip('/').rsplit('/', 1)[0]}/train.{lang2}.tok"
            if os.path.exists(train_tgt_path):
                train_tgt_mean_num_words = mean_num_words(train_tgt_path)
                scores['%s_%s-%s_mt_hyp2train_num_words_ratio' % (data_set, lang1, lang2)] = hyp_mean_num_words / train_tgt_mean_num_words

            # BLEU with input (shouldn't be too high or low)
            input_bleu = eval_moses_bleu(input_path, hyp_path)
            logger.info("Input BLEU %s %s : %f" % (hyp_path, input_path, input_bleu))
            scores['%s_%s-%s_mt_input_bleu' % (data_set, lang1, lang2)] = input_bleu

            # Calculate other unsupervised stats (against input or just on hyp)
            hyp_lines = read_lines_from_path(hyp_path)
            input_lines = read_lines_from_path(input_path)
            back_hyp_lines = read_lines_from_path(back_hyp_path)

            doubles, contains, unchanged, too_few_qs, too_many_qs, all_q_words_in_subq, subq_longer_than_q, bads = 0, 0, 0, 0, 0, 0, 0, 0
            good_inps, good_hyps, good_back_hyps = [], [], []
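            # the heuristics below assume each hypothesis decomposes the input question into exactly
            # two sub-questions, each ending in '?'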
            for inp, hyp, back_hyp in zip(input_lines, hyp_lines, back_hyp_lines):
                bad = False
                if hyp.count('?') == 2:
                    l, r, _ = hyp.split('?')
                    l = l + '?'
                    r = r + '?'
                    if l.strip() == r.strip():  # compare ignoring surrounding whitespace
                        doubles += 1
                        bad = True  # counting doubles toward the "bad" criterion is arguably unnecessary
                    l_toks = l.split()
                    r_toks = r.split()
                    inp_toks = inp.split()
                    for subq_toks in [l_toks, r_toks]:
                        if set(inp_toks).issubset(set(subq_toks)):
                            all_q_words_in_subq += 1
                            bad = True
                            break
                    for subq_toks in [l_toks, r_toks]:
                        if len(subq_toks) >= len(inp_toks):
                            subq_longer_than_q += 1
                            bad = True
                            break
                elif hyp.count('?') < 2:
                    too_few_qs += 1
                    bad = True
                else:
                    too_many_qs += 1
                    if not self.params.one_to_variable:
                        bad = True
                if inp in hyp:
                    contains += 1
                    bad = True
                    if inp == hyp:
                        unchanged += 1
                bads += bad
                if not bad:
                    good_inps.append(inp)
                    good_hyps.append(hyp)
                    good_back_hyps.append(back_hyp)
            scores['%s_%s-%s_mt_doubles' % (data_set, lang1, lang2)] = 100. * doubles / len(hyp_lines)
            scores['%s_%s-%s_mt_contains' % (data_set, lang1, lang2)] = 100. * contains / len(hyp_lines)
            scores['%s_%s-%s_mt_unchanged' % (data_set, lang1, lang2)] = 100. * unchanged / len(hyp_lines)
            scores['%s_%s-%s_mt_too_few_qs' % (data_set, lang1, lang2)] = 100. * too_few_qs / len(hyp_lines)
            scores['%s_%s-%s_mt_too_many_qs' % (data_set, lang1, lang2)] = 100. * too_many_qs / len(hyp_lines)
            scores['%s_%s-%s_mt_all_q_words_in_subq' % (data_set, lang1, lang2)] = 100. * all_q_words_in_subq / len(hyp_lines)
            scores['%s_%s-%s_mt_subq_longer_than_q' % (data_set, lang1, lang2)] = 100. * subq_longer_than_q / len(hyp_lines)
            scores['%s_%s-%s_mt_bads' % (data_set, lang1, lang2)] = 100. * bads / len(hyp_lines)

            # evaluate BLEU score on good generations
            good_hyp_path = hyp_path.replace('.txt', '.good.txt')
            with open(good_hyp_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(good_hyps) + '\n')

            good_inp_path = good_hyp_path.replace(f'/hyp{scores["epoch"]}', f'/ref{scores["epoch"]}')
            with open(good_inp_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(good_inps) + '\n')

            good_back_hyp_path = back_hyp_path.replace('.txt', '.good.txt')
            with open(good_back_hyp_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(good_back_hyps) + '\n')

            goods_frac = 1. - (bads / len(hyp_lines))
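            # goods_frac = fraction of hypotheses that passed every heuristic; it discounts the 'effective' BLEU scores below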

            goods_input_bleu = eval_moses_bleu(good_inp_path, good_hyp_path)
            logger.info("Input BLEU on Good Hyps %s %s : %f" % (good_hyp_path, good_inp_path, goods_input_bleu))
            scores['%s_%s-%s_mt_goods_input_bleu' % (data_set, lang1, lang2)] = goods_input_bleu
            scores['%s_%s-%s_mt_effective_goods_input_bleu' % (data_set, lang1, lang2)] = goods_input_bleu * goods_frac

            goods_back_bleu = eval_moses_bleu(good_inp_path, good_back_hyp_path)
            logger.info("Input BLEU on Good Hyps %s %s : %f" % (good_back_hyp_path, good_inp_path, goods_back_bleu))
            scores['%s_%s-%s-%s_mt_goods_back_bleu' % (data_set, lang1, lang2, lang1)] = goods_back_bleu
            scores['%s_%s-%s-%s_mt_effective_goods_back_bleu' % (data_set, lang1, lang2, lang1)] = goods_back_bleu * goods_frac
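
The target-word selection above (alen, pred_mask, masked_select) can be checked on a toy batch. The sketch below is illustrative only; it assumes the (slen, bs) batch layout used in this file, with sentences stored column-wise, and the token ids are made up:

    import torch

    # toy target batch: two sentences of lengths 4 and 3, shape (slen, bs) = (4, 2)
    len2 = torch.tensor([4, 3])
    x2 = torch.tensor([[5,  5],    # position 0: sentence-start symbol in both columns
                       [11, 21],
                       [12, 22],
                       [13,  0]])  # 0 stands in for padding of the shorter sentence
    alen = torch.arange(len2.max(), dtype=torch.long)
    pred_mask = alen[:, None] < len2[None] - 1   # True at positions 0 .. len-2 of each column
    y = x2[1:].masked_select(pred_mask[:-1])     # next-token targets, taken row-major
    assert len(y) == (len2 - 1).sum().item()     # 3 + 2 = 5 targets
    print(y)                                     # tensor([11, 21, 12, 22, 13])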