def evaluate()

in train.py [0:0]
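
The function below assumes these module-level imports in train.py; they are inferred from the function body, so the exact module paths (in particular deepfrier_utils) are assumptions:

from collections.abc import Sequence

import numpy as np
import torch
from scipy import stats
from sklearn import metrics

import deepfrier_utils  # assumed project-local helper providing evaluate_multilabel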


def evaluate(model, data_loader, task):
    """Evaluate model on dataset and return metrics.

    Args:
        datum: a Data object to determine input shapes for GVP-based models.
        model_name: choose from ['bert', 'gvp', 'bert_gvp', 'gat', 'bert_gat']
        num_outputs: number of output units
        weights: label weights for multi-output models

    Returns:
        model object (One of: bert, gat, bert_gat, gvp or bert_gvp)
    """
    # make predictions on the evaluation set
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    y_preds = []
    y_true = []
    with torch.no_grad():
        for batch in data_loader:
            if isinstance(batch, Sequence):
                y_true.append(batch[-1])
                batch = [b.to(device) for b in batch]
            else:
                y_true.append(batch["labels"])
                batch = {key: val.to(device) for key, val in batch.items()}
            y_pred = model(batch)
            if y_pred.ndim == 1:
                y_pred = y_pred.unsqueeze(1)
            y_preds.append(y_pred.cpu())
    y_preds = torch.vstack(y_preds).numpy()
    y_true = torch.vstack(y_true).numpy()
    # sanity check: prediction and label arrays should have matching first dimension
    print(y_preds.shape, y_true.shape)
    if task in ("cc", "bp", "mf"):
        # multi-label classification
        # y_true and y_preds are already numpy arrays at this point
        f_max, micro_aupr = deepfrier_utils.evaluate_multilabel(y_true, y_preds)
        scores = {"f_max": f_max, "aupr": micro_aupr}
        print("F_max = {:1.3f}".format(scores["f_max"]))
        print("AUPR = {:1.3f}".format(scores["aupr"]))
    else:
        # single task regression
        mse = metrics.mean_squared_error(y_true, y_preds)
        rmse = np.sqrt(mse)
        r2 = metrics.r2_score(y_true, y_preds)
        rho, _ = stats.spearmanr(y_true, y_preds)
        scores = {"mse": float(mse), "rmse": float(rmse), "r2": r2, "rho": rho}
        for key, score in scores.items():
            print("{} = {:1.3f}".format(key, score))
    return scores
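
A minimal usage sketch; trained_model and test_loader are hypothetical placeholders standing in for objects produced by the rest of train.py:

# Hypothetical call: evaluate a trained model on a held-out GO "mf" split.
# trained_model and test_loader are assumed to come from earlier training code.
scores = evaluate(trained_model, test_loader, task="mf")
print(scores)  # e.g. {"f_max": ..., "aupr": ...}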