in hypernymysuite/evaluation.py [0:0]
def wbless_setup(model):
    """
    Thresholded accuracy on a dataset that explicitly contains reverse pairs.

    Pairs are read from ``wbless.tsv`` and scored with
    ``model.predict_many``; only rows selected by the dataset's
    ``invocab_mask`` are used. Over 1000 random splits, each pair is placed
    in the validation part with probability 0.02 and in the test part
    otherwise. For every split, the decision threshold is the candidate
    returned by ``precision_recall_curve`` that gives the best validation
    accuracy; that threshold is then applied to both parts.

    Returns a dict holding the accuracy averaged over all splits:
    ``"acc_val_inv"`` for validation and ``"acc_test_inv"`` for test.
    """
    dataset = Dataset(os.path.join(DATA_DIR, "wbless.tsv"), model.vocab)
    # Fixed seed so that repeated runs draw exactly the same splits
    rng = np.random.RandomState(42)
    val_fraction = .02
    num_trials = 1000

    # OOV pairs cannot be scored, so everything is restricted to in-vocab rows
    hypos = dataset.hypos[dataset.invocab_mask]
    hypers = dataset.hypers[dataset.invocab_mask]
    scores = model.predict_many(hypos, hypers)
    labels = dataset.y[dataset.invocab_mask]

    val_accs = []
    test_accs = []
    for _ in range(num_trials):
        # Draw a fresh split each trial; test is the complement of validation
        in_val = rng.rand(len(labels)) < val_fraction
        in_test = ~in_val

        val_scores = scores[in_val]
        val_labels = labels[in_val]
        _, _, thresholds = precision_recall_curve(val_labels, val_scores)

        # One column per candidate threshold: validation accuracy of each
        hits = (val_scores[:, np.newaxis] >= thresholds) == val_labels[:, np.newaxis]
        acc_per_threshold = hits.mean(axis=0)
        chosen = thresholds[acc_per_threshold.argmax()]

        # Score both parts with the threshold tuned on validation
        val_acc = np.mean((val_scores >= chosen) == val_labels)
        test_acc = np.mean((scores[in_test] >= chosen) == labels[in_test])
        val_accs.append(val_acc)
        test_accs.append(test_acc)

        # Sanity check: re-applying the chosen threshold must reproduce the
        # best accuracy found during the threshold sweep
        assert np.allclose(val_acc, acc_per_threshold.max())

    # Report the average across all random splits
    return {"acc_val_inv": np.mean(val_accs), "acc_test_inv": np.mean(test_accs)}