def wbless_setup()

in hypernymysuite/evaluation.py [0:0]


def wbless_setup(model):
    """
    Thresholded accuracy on the WBLESS dataset (which explicitly contains
    reverse pairs), averaged over many random validation/test splits.

    For each trial a small random validation subset is drawn, the decision
    threshold with the highest validation accuracy is selected, and that
    threshold is then applied to the remaining (test) examples.

    Returns a dict with the mean validation accuracy under "acc_val_inv" and
    the mean test accuracy under "acc_test_inv".
    """
    VAL_PROB = .02
    NUM_TRIALS = 1000

    ds = Dataset(os.path.join(DATA_DIR, "wbless.tsv"), model.vocab)

    # Fixed seed so repeated runs produce identical splits
    rng = np.random.RandomState(42)

    # Out-of-vocabulary pairs cannot be scored, so only in-vocab rows are kept
    known = ds.invocab_mask
    scores = model.predict_many(ds.hypos[known], ds.hypers[known])
    labels = ds.y[known]

    val_accs = []
    test_accs = []

    for _ in range(NUM_TRIALS):
        # Fresh random split for this trial; test is the complement of val
        in_val = rng.rand(len(labels)) < VAL_PROB
        in_test = ~in_val

        val_scores = scores[in_val]
        val_labels = labels[in_val]

        # Candidate thresholds come from the validation PR curve
        _, _, thresholds = precision_recall_curve(val_labels, val_scores)

        # Accuracy of every candidate threshold on the validation subset
        # (rows: validation examples, columns: thresholds)
        agreement = (scores[in_val, np.newaxis] >= thresholds) == labels[in_val, np.newaxis]
        acc_per_threshold = np.mean(agreement, axis=0)
        cutoff = thresholds[acc_per_threshold.argmax()]

        # Score both subsets with the selected threshold
        val_acc = np.mean((val_scores >= cutoff) == val_labels)
        test_acc = np.mean((scores[in_test] >= cutoff) == labels[in_test])
        val_accs.append(val_acc)
        test_accs.append(test_acc)

        # Sanity check: the recomputed validation accuracy must equal the best
        # accuracy found during the threshold search
        assert np.allclose(val_acc, acc_per_threshold.max())

    # Report the average across all random splits
    return {"acc_val_inv": np.mean(val_accs), "acc_test_inv": np.mean(test_accs)}