in src/weakly_sup.py [0:0]
import collections  # module-level dependencies of this function
from tqdm import tqdm


# extract_probs, to_simplified, and to_traditional are assumed to be defined
# elsewhere in this module.
def get_optimal_parameters(
    pos_training_set, neg_training_set, train_lexicon, criss,
    lexicon_inducer, info, configs,
):
    # Score every (source, target) training pair with the lexicon inducer and,
    # for each source word, keep the highest probability seen per target word.
    pred_train_lexicon = collections.defaultdict(collections.Counter)
    probs = extract_probs(
        pos_training_set + neg_training_set, criss, lexicon_inducer, info, configs
    )
    for i, (x, y) in enumerate(pos_training_set + neg_training_set):
        pred_train_lexicon[x][y] = max(pred_train_lexicon[x][y], probs[i].item())
    # Build the candidate list: look up predictions via the simplified form of
    # each source word in the training lexicon, map predicted targets back to
    # traditional script, and mark a pair as positive if it is in the lexicon.
    possible_predictions = list()
    for tsw in {x[0] for x in train_lexicon}:
        ssw = to_simplified(tsw)
        for stw in pred_train_lexicon[ssw]:
            ttw = to_traditional(stw)
            pos = 1 if (tsw, ttw) in train_lexicon else 0
            possible_predictions.append(
                [tsw, ttw, pred_train_lexicon[ssw][stw], pos]
            )
    # Sort by predicted probability, highest first (x[-2] is the probability).
    possible_predictions = sorted(possible_predictions, key=lambda x: -x[-2])
    # Sweep the number of candidates kept per source word (1..5); walking down
    # the probability-sorted list implicitly sweeps the threshold as well. Keep
    # the (threshold, n_cand) setting that maximizes F1 on the training lexicon.
    best_f1 = -1e10
    best_threshold = best_n_cand = 0
    for n_cand in range(1, 6):
        word_cnt = collections.Counter()
        correct_predictions = 0
        bar = tqdm(possible_predictions)
        for i, item in enumerate(bar):
            if word_cnt[item[0]] == n_cand:
                continue
            word_cnt[item[0]] += 1
            if item[-1] == 1:
                correct_predictions += 1
            prec = correct_predictions / (sum(word_cnt.values()) + 1) * 100.0
            rec = correct_predictions / len(train_lexicon) * 100.0
            # Guard against division by zero before the first correct prediction.
            f1 = 2 * prec * rec / (rec + prec) if prec + rec > 0 else 0.0
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = item[-2]
                best_n_cand = n_cand
                bar.set_description(
                    f'Best F1={f1:.1f}, Prec={prec:.1f}, Rec={rec:.1f}, '
                    f'NCand={n_cand}, Threshold={item[-2]}'
                )
    return best_threshold, best_n_cand
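
# Usage sketch (not from the original file): one way the returned
# (best_threshold, best_n_cand) pair could be applied to a prediction table of
# the same shape as pred_train_lexicon. The helper name apply_parameters and
# its pred_lexicon / source_words arguments are hypothetical; only the
# filtering rule (keep at most n_cand candidates per source word, each with a
# probability of at least the threshold) is implied by the selection loop
# above. to_simplified and to_traditional are reused from this module.
def apply_parameters(pred_lexicon, source_words, threshold, n_cand):
    induced = []
    for tsw in source_words:
        ssw = to_simplified(tsw)
        kept = 0
        # Counter.most_common() yields candidates sorted by probability, highest first.
        for stw, prob in pred_lexicon[ssw].most_common():
            if kept == n_cand or prob < threshold:
                break
            induced.append((tsw, to_traditional(stw), prob))
            kept += 1
    return induced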