def evaluate_classif()

in codegen_sources/model/src/evaluation/evaluator.py
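Context for the listing below: the method lives on the repository's Evaluator class, so several names it uses are bound at module level rather than locally. A minimal sketch of the imports the body relies on (the `to_cuda` path is an assumption based on the repo layout; the sklearn imports are standard):

    from sklearn.metrics import average_precision_score, roc_auc_score

    from codegen_sources.model.src.utils import to_cuda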


    def evaluate_classif(self, scores, data_set, lang1, lang2):
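        """
        Evaluate the token-level classifier on one split of `data_set`,
        filling `scores` with accuracy, per-class recall/precision, class
        proportions, and one-vs-rest ROC/PR AUC metrics for (lang1, lang2).
        """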
        params = self.params
        assert data_set in EVAL_DATASET_SPLITS
        assert lang1 in params.langs
        lang1_id = params.lang2id[lang1]

        # pick the encoder (the full model when encoder-only), unwrap the
        # multi-GPU wrapper, and put both it and the classifier head in eval mode
        model = self.model[0] if params.encoder_only else self.encoder[0]
        model.eval()
        model = model.module if params.multi_gpu else model
        assert self.classifier is not None
        classifier = self.classifier[0].eval()

        n_words = 0
        n_valid = 0
        labels = []
        word_probas = []
        # per-class counters: ground-truth tokens, correct predictions,
        # and tokens attributed to the class by the classifier
        n_words_by_cl = [0 for _ in range(self.params.n_classes_classif)]
        n_valid_by_cl = [0 for _ in range(self.params.n_classes_classif)]
        n_attribution_by_cl = [0 for _ in range(self.params.n_classes_classif)]

        for batch in self.get_iterator(data_set, lang1, lang2, stream=False):
            (x1, len1, _, _), (y, len2, _, _) = batch
            # score every real token (mask out EOS and padding)
            pred_mask = (x1 != self.params.eos_index) * (x1 != self.params.pad_index)
            assert len1.equal(len2)
            langs1 = x1.clone().fill_(lang1_id)

            # cuda
            x1, len1, langs1, y = to_cuda(x1, len1, langs1, y)

            # encode source sentence
            enc1 = model("fwd", x=x1, lengths=len1, langs=langs1, causal=False)
            if self.params.fp16:
                enc1 = enc1.half()

            # classification + loss
            word_scores, loss = classifier(enc1, y, pred_mask)

            # update stats
            y_ = y[pred_mask].view(-1)
            predictions = word_scores.max(1)[1]
            n_words += len(y_)
            n_valid += (predictions == y_).sum().item()
            labels.extend(y_.cpu().numpy())
            word_probas.extend(word_scores.cpu().numpy())

            for cl in range(self.params.n_classes_classif):
                n_words_by_cl[cl] += (y_ == cl).sum().item()
                n_valid_by_cl[cl] += ((predictions == y_) * (y_ == cl)).sum().item()
                n_attribution_by_cl[cl] += (predictions == cl).sum().item()

        # AUC metrics are only defined when at least two classes appear in the split
        if len(set(labels)) > 1:
            for target_label in range(self.params.n_classes_classif):
                roc_auc_name = "%s_%s-%s_roc_auc_label_cl%i" % (
                    data_set,
                    lang1,
                    lang2,
                    target_label,
                )
                # one-vs-rest: class `target_label` against all other classes
                new_labels = [1 if lab == target_label else 0 for lab in labels]
                word_level_scores = [wp[target_label] for wp in word_probas]
                scores[roc_auc_name] = roc_auc_score(new_labels, word_level_scores)

                pr_auc_name = "%s_%s-%s_pr_auc_cl%i" % (
                    data_set,
                    lang1,
                    lang2,
                    target_label,
                )
                scores[pr_auc_name] = average_precision_score(
                    new_labels, word_level_scores
                )

            # binary "any change" task: classes > 0 vs class 0, scored by the
            # probability mass assigned away from class 0
            roc_auc_name = "%s_%s-%s_roc_auc_label_all_changes" % (
                data_set,
                lang1,
                lang2,
            )
            new_labels = [1 if lab > 0 else 0 for lab in labels]
            word_level_scores = [1 - s[0] for s in word_probas]
            scores[roc_auc_name] = roc_auc_score(new_labels, word_level_scores)

            pr_auc_name = "%s_%s-%s_pr_auc_label_all_changes" % (data_set, lang1, lang2)
            scores[pr_auc_name] = average_precision_score(new_labels, word_level_scores)

        # compute class proportions and prediction accuracy
        class_proportion_name = "%s_%s-%s_class_proportion" % (data_set, lang1, lang2)
        acc_name = "%s_%s-%s_classif_acc" % (data_set, lang1, lang2)
        recall_name = "%s_%s-%s_classif_recall" % (data_set, lang1, lang2)
        precision_name = "%s_%s-%s_classif_precision" % (data_set, lang1, lang2)

        scores[class_proportion_name] = [
            (100.0 * x / n_words) if n_words > 0 else 0.0 for x in n_words_by_cl
        ]
        scores[acc_name] = (100.0 * n_valid / n_words) if n_words > 0 else 0.0

        # per-class recall: correct predictions over ground-truth tokens of the class
        for cl in range(params.n_classes_classif):
            scores[f"{recall_name}_{cl}"] = (
                100.0 * n_valid_by_cl[cl] / n_words_by_cl[cl]
                if n_words_by_cl[cl] > 0
                else 0.0
            )
        # per-class precision: correct predictions over tokens predicted as the class
        for cl in range(params.n_classes_classif):
            scores[f"{precision_name}_{cl}"] = (
                100.0 * n_valid_by_cl[cl] / n_attribution_by_cl[cl]
                if n_attribution_by_cl[cl] > 0
                else 0.0
            )
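
For reference, here is a minimal, self-contained sketch of the one-vs-rest AUC bookkeeping the method performs, on toy data (all names and values below are illustrative, not taken from the repository):

    import numpy as np
    from sklearn.metrics import average_precision_score, roc_auc_score

    # Toy stand-ins for the accumulated token labels and per-class probabilities.
    labels = np.array([0, 1, 2, 1, 0, 2])
    word_probas = np.array([
        [0.7, 0.2, 0.1],
        [0.1, 0.8, 0.1],
        [0.2, 0.2, 0.6],
        [0.3, 0.5, 0.2],
        [0.6, 0.3, 0.1],
        [0.1, 0.4, 0.5],
    ])

    # One-vs-rest AUCs, one pair of metrics per class.
    for target_label in range(word_probas.shape[1]):
        binary = (labels == target_label).astype(int)
        class_scores = word_probas[:, target_label]
        print(target_label,
              roc_auc_score(binary, class_scores),
              average_precision_score(binary, class_scores))

    # "All changes" variant: any non-zero class vs class 0,
    # scored by the probability mass assigned away from class 0.
    any_change = (labels > 0).astype(int)
    change_scores = 1.0 - word_probas[:, 0]
    print(roc_auc_score(any_change, change_scores),
          average_precision_score(any_change, change_scores))

The method itself additionally guards against degenerate inputs: the AUC metrics are skipped when only one class appears in the split, and the `n_words > 0` checks avoid division by zero on empty sets.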