# codegen_sources/model/src/evaluation/evaluator.py
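# NOTE: `evaluate_classif` is a method of the evaluator class in this file and
# relies on module-level names from the surrounding module -- presumably:
#   from sklearn.metrics import average_precision_score, roc_auc_score
#   from ..utils import to_cuda
# plus the EVAL_DATASET_SPLITS constant. The exact import paths above are an
# assumption based on the repository layout, not part of this excerpt.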
def evaluate_classif(self, scores, data_set, lang1, lang2):
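    """
    Evaluate the token-level classifier on `data_set` for the (lang1, lang2)
    pair, and store accuracy, class proportions, per-class recall/precision,
    and ROC / PR AUC metrics into the `scores` dictionary.
    """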
    params = self.params
    assert data_set in EVAL_DATASET_SPLITS
    assert lang1 in params.langs
    lang1_id = params.lang2id[lang1]
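    # pick the network that produces token representations (the full model in
    # encoder-only mode, otherwise the encoder) and unwrap the multi-GPU
    # wrapper's `.module` if needed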
    model = self.model[0] if params.encoder_only else self.encoder[0]
    model.eval()
    model = model.module if params.multi_gpu else model
    assert self.classifier is not None
    classifier = self.classifier[0].eval()
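    # running statistics: token counts, correct predictions, and per-class
    # counts of gold labels, correct predictions, and predicted labels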
    n_words = 0
    n_valid = 0
    labels = []
    word_probas = []
    n_words_by_cl = [0 for c in range(self.params.n_classes_classif)]
    n_valid_by_cl = [0 for c in range(self.params.n_classes_classif)]
    n_attribution_by_cl = [0 for c in range(self.params.n_classes_classif)]
    for batch in self.get_iterator(data_set, lang1, lang2, stream=False):
        (x1, len1, _, _), (y, len2, _, _) = batch
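        # score every real token: mask out EOS and padding positions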
        pred_mask = (x1 != self.params.eos_index) * (x1 != self.params.pad_index)
        assert len1.equal(len2)
        langs1 = x1.clone().fill_(lang1_id)
        # cuda
        x1, len1, langs1, y = to_cuda(x1, len1, langs1, y)

        # encode source sentence
        enc1 = model("fwd", x=x1, lengths=len1, langs=langs1, causal=False)
        if self.params.fp16:
            enc1 = enc1.half()
        # classification + loss
        word_scores, loss = classifier(enc1, y, pred_mask)

        # update stats
        y_ = y[pred_mask].view(-1,)
        n_words += len(y_)
        n_valid += (word_scores.max(1)[1] == y_).sum().item()
        labels.extend(y_.cpu().numpy())
        word_probas.extend(word_scores.cpu().numpy())
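        # per-class tallies: gold occurrences (recall denominator), correct
        # predictions (numerator), predicted occurrences (precision denominator)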
        for cl in range(self.params.n_classes_classif):
            n_words_by_cl[cl] += (y_ == cl).sum().item()
            n_valid_by_cl[cl] += (
                ((word_scores.max(1)[1] == y_) * (y_ == cl)).sum().item()
            )
            n_attribution_by_cl[cl] += (word_scores.max(1)[1] == cl).sum().item()
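    # ROC / PR AUC are only defined when the split contains more than one label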
    if len(set(labels)) > 1:
        for target_label in range(self.params.n_classes_classif):
            roc_auc_name = "%s_%s-%s_roc_auc_label_cl%i" % (
                data_set,
                lang1,
                lang2,
                target_label,
            )
            new_labels = [1 if l == target_label else 0 for l in labels]
            word_level_scores = [wp[target_label] for wp in word_probas]
            scores[roc_auc_name] = roc_auc_score(new_labels, word_level_scores)
            pr_auc_name = "%s_%s-%s_pr_auc_cl%i" % (
                data_set,
                lang1,
                lang2,
                target_label,
            )
            scores[pr_auc_name] = average_precision_score(
                new_labels, word_level_scores
            )
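        # binary "any change" view: class 0 is the negative class and every
        # other class counts as positive; 1 - score(class 0) serves as the
        # positive score, which assumes `word_scores` are (softmax) probabilities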
        roc_auc_name = "%s_%s-%s_roc_auc_label_all_changes" % (
            data_set,
            lang1,
            lang2,
        )
        new_labels = [1 if l > 0 else 0 for l in labels]
        word_level_scores = [1 - s[0] for s in word_probas]
        scores[roc_auc_name] = roc_auc_score(new_labels, word_level_scores)
        pr_auc_name = "%s_%s-%s_pr_auc_label_all_changes" % (data_set, lang1, lang2)
        scores[pr_auc_name] = average_precision_score(new_labels, word_level_scores)
    # compute class proportions and prediction accuracy
    class_proportion_name = "%s_%s-%s_class_proportion" % (data_set, lang1, lang2)
    acc_name = "%s_%s-%s_classif_acc" % (data_set, lang1, lang2)
    recall_name = "%s_%s-%s_classif_recall" % (data_set, lang1, lang2)
    precision_name = "%s_%s-%s_classif_precision" % (data_set, lang1, lang2)
    scores[class_proportion_name] = [
        (100.0 * x / n_words) if n_words > 0 else 0.0 for x in n_words_by_cl
    ]
    scores[acc_name] = (100.0 * n_valid / n_words) if n_words > 0 else 0.0
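    # per-class recall: correct predictions of class cl / gold occurrences of cl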
    for cl in range(params.n_classes_classif):
        scores[f"{recall_name}_{cl}"] = (
            100.0 * n_valid_by_cl[cl] / n_words_by_cl[cl]
            if n_words_by_cl[cl] > 0
            else 0
        )
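    # per-class precision: correct predictions of class cl / times cl was predicted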
    for cl in range(params.n_classes_classif):
        scores[f"{precision_name}_{cl}"] = (
            100.0 * n_valid_by_cl[cl] / n_attribution_by_cl[cl]
            if n_attribution_by_cl[cl] > 0
            else 0
        )