in src/weakly_sup.py [0:0]
def extract_dataset(train_lexicon, test_lexicon, coocc, configs):
    """Build positive/negative training pairs and candidate test pairs.

    For every distinct source word (first element of a lexicon entry), each
    word found by iterating ``coocc[source_word]`` is paired with it.  The
    pair of the source word with itself is always added as well.

    Args:
        train_lexicon: Iterable of entries whose first element is the source
            word.  It is also used for ``(source, target) in train_lexicon``
            membership tests, so entries are presumably 2-tuples -- confirm
            against the loader.
        test_lexicon: Iterable of entries whose first element is the source
            word; only the source words are used.
        coocc: Container indexed by source word; iterating ``coocc[word]``
            yields the candidate target words.
        configs: Object exposing ``src_lang`` and ``trg_lang``.  When a side
            equals ``'zh_CN'`` the module-level ``to_simplified`` /
            ``to_traditional`` helpers are applied to that side.

    Returns:
        A tuple ``(pos_training_set, neg_training_set, test_set)`` of lists of
        ``(source_word, candidate_word)`` tuples.  Each list is built from a
        set, so it has no duplicates and no guaranteed order.
    """
    # The membership test below runs once per co-occurring candidate, so use
    # a hashed view of the lexicon when possible.  If the entries are not
    # hashable, keep the original container so results stay identical.
    if isinstance(train_lexicon, (set, frozenset)):
        known_pairs = train_lexicon
    else:
        try:
            known_pairs = frozenset(train_lexicon)
        except TypeError:
            known_pairs = train_lexicon

    pos_training_set = set()
    neg_training_set = set()
    for tsw in {entry[0] for entry in train_lexicon}:
        ssw = to_simplified(tsw) if configs.src_lang == 'zh_CN' else tsw
        for stw in coocc[ssw]:
            ttw = to_traditional(stw) if configs.trg_lang == 'zh_CN' else stw
            if (tsw, ttw) in known_pairs:
                pos_training_set.add((ssw, stw))
            else:
                neg_training_set.add((ssw, stw))
        # The self pair is always labelled, whether or not the word
        # co-occurs with itself.
        # NOTE(review): this looks up the converted form (ssw, ssw) while the
        # candidate check above uses (tsw, ttw) -- confirm this is intended.
        if (ssw, ssw) in known_pairs:
            pos_training_set.add((ssw, ssw))
        else:
            neg_training_set.add((ssw, ssw))

    test_set = set()
    for tsw in {entry[0] for entry in test_lexicon}:
        ssw = to_simplified(tsw) if configs.src_lang == 'zh_CN' else tsw
        for stw in coocc[ssw]:
            test_set.add((ssw, stw))
        # Always offer the source word itself as a candidate.
        test_set.add((ssw, ssw))

    return list(pos_training_set), list(neg_training_set), list(test_set)