def extract_dataset()

in src/weakly_sup.py [0:0]


def extract_dataset(train_lexicon, test_lexicon, coocc, configs):
    cooccs = [coocc]
    test_set = set()
    pos_training_set = set()
    neg_training_set = set()
    for tsw in set([x[0] for x in train_lexicon]):
        for coocc in cooccs:
            ssw = to_simplified(tsw) if configs.src_lang == 'zh_CN' else tsw
            for stw in coocc[ssw]:
                if stw == ssw:
                    added_self = True
                ttw = to_traditional(stw) if configs.trg_lang == 'zh_CN' else stw
                if (tsw, ttw) in train_lexicon:
                    pos_training_set.add((ssw, stw))
                else:
                    neg_training_set.add((ssw, stw))
            if (ssw, ssw) in train_lexicon:
                pos_training_set.add((ssw, ssw))
            else:
                neg_training_set.add((ssw, ssw))
    for tsw in set([x[0] for x in test_lexicon]):
        for coocc in cooccs:
            ssw = to_simplified(tsw) if configs.src_lang == 'zh_CN' else tsw
            added_self = False
            for stw in coocc[ssw]:
                if stw == ssw:
                    added_self = True
                test_set.add((ssw, stw))
            test_set.add((ssw, ssw))
    pos_training_set = list(pos_training_set)
    neg_training_set = list(neg_training_set)
    test_set = list(test_set)
    return pos_training_set, neg_training_set, test_set