def evaluate()

in src/run_sentiment.py [0:0]


def evaluate(gold_labels, pred_scores, threshold=0.0):
    """Compute ranking and classification metrics for binary predictions.

    Args:
        gold_labels: ground-truth binary labels (0/1).
        pred_scores: real-valued scores, higher meaning more positive.
        threshold: score cutoff used to binarize ``pred_scores`` for the
            fixed-threshold metrics; pass ``None`` to skip those metrics.

    Returns:
        dict with 'ap' and 'auroc' (threshold-free), 'acc_best' and
        'f1_best' (at the best threshold found by
        ``accuracy_f1_at_best_thresh``), plus 'acc', 'f1', 'precision',
        and 'recall' when ``threshold`` is not ``None``.
    """
    metrics = {
        'ap': average_precision_score(gold_labels, pred_scores),
        'auroc': roc_auc_score(gold_labels, pred_scores),
    }
    if threshold is not None:
        # Binarize scores with a strict > comparison, matching the helpers.
        hard_preds = [int(score > threshold) for score in pred_scores]
        acc, f1, prec, rec = get_accuracy_f1_precision_recall(gold_labels, hard_preds)
        metrics.update(acc=acc, f1=f1, precision=prec, recall=rec)
    # Threshold-sweep metrics are reported regardless of the fixed threshold.
    acc_best, f1_best = accuracy_f1_at_best_thresh(gold_labels, pred_scores)
    metrics.update(acc_best=acc_best, f1_best=f1_best)
    return metrics