in src/run_sentiment.py [0:0]
def evaluate(gold_labels, pred_scores, threshold=0.0):
    """Collect evaluation metrics for scored predictions into one dict.

    Args:
        gold_labels: Reference labels, forwarded unchanged to every metric
            helper.
        pred_scores: Iterable of per-example scores, aligned with
            ``gold_labels``. Presumably real-valued; verify against caller.
        threshold: Cut-off used to binarize ``pred_scores``; a score strictly
            greater than it becomes label ``1``, anything else ``0``. Pass
            ``None`` to skip the fixed-threshold metrics.

    Returns:
        A dict with keys ``'ap'`` and ``'auroc'``; ``'acc'``, ``'f1'``,
        ``'precision'`` and ``'recall'`` when ``threshold`` is not ``None``;
        and ``'acc_best'`` and ``'f1_best'``.
    """
    # Score-based metrics work on the raw scores and need no threshold.
    results = {}
    results['ap'] = average_precision_score(gold_labels, pred_scores)
    results['auroc'] = roc_auc_score(gold_labels, pred_scores)

    if threshold is not None:
        # Strict comparison: a score exactly equal to the threshold maps to 0.
        hard_preds = [int(score > threshold) for score in pred_scores]
        acc, f1_score_, prec, rec = get_accuracy_f1_precision_recall(
            gold_labels, hard_preds
        )
        results.update(acc=acc, f1=f1_score_, precision=prec, recall=rec)

    # NOTE(review): the best-threshold metrics are taken to be independent of
    # ``threshold`` and are therefore computed unconditionally -- confirm.
    top_acc, top_f1 = accuracy_f1_at_best_thresh(gold_labels, pred_scores)
    results['acc_best'] = top_acc
    results['f1_best'] = top_f1
    return results