def main()

in preprocessing/preprocess_i2b2_2014_ner.py [0:0]


def main(gold_set_1_dir, gold_set_2_dir, test_gold_set_dir, task_dir):
    """Generate the train/dev/test ``.conll`` files for the NER task.

    The two training gold-set directories are handed together to
    ``reprocess_PHI_labels`` (with ``dev_set_size=0.1`` and
    ``match_text=True``), which returns the train and dev texts. The test
    gold-set directory is processed in a separate call (with
    ``dev_set_size=None`` and ``match_text=False``); the second value
    returned by that call is discarded.

    Before anything is written, the last whitespace-separated token of every
    non-empty line in all three texts is checked: it must be ``O`` or start
    with ``B-`` or ``I-``.

    Args:
        gold_set_1_dir: First training gold-set location, passed through to
            ``reprocess_PHI_labels``.
        gold_set_2_dir: Second training gold-set location, passed through to
            ``reprocess_PHI_labels``.
        test_gold_set_dir: Test gold-set location, passed through to
            ``reprocess_PHI_labels``.
        task_dir: Directory that receives ``train.txt.conll``,
            ``dev.txt.conll`` and ``test.txt.conll``. It is not created
            here, so it must already exist.

    Raises:
        AssertionError: If a line's final token is not a valid label. The
            exception is raised explicitly (rather than via an ``assert``
            statement) so the check still runs under ``python -O``.
    """
    final_train_text, final_dev_text = reprocess_PHI_labels(
        [gold_set_1_dir, gold_set_2_dir], PHI_tag_type='ALL_CHILDREN',
        dev_set_size=0.1, match_text=True
    )
    test_text, _ = reprocess_PHI_labels(
        [test_gold_set_dir], PHI_tag_type='ALL_CHILDREN', match_text=False, dev_set_size=None
    )

    # Per-label occurrence counts across all three splits.
    # NOTE(review): nothing in the visible part of this function reads the
    # tally afterwards; it is kept in case later code or a debugger relies
    # on it.
    labels = {}
    for split_text in (final_train_text, final_dev_text, test_text):
        for line in split_text.split('\n'):
            if line == '':
                continue
            label = line.split()[-1]
            # Explicit raise instead of `assert`: assert statements are
            # removed when Python runs with -O, which would let malformed
            # labels reach the output files unnoticed.
            if not (label == 'O' or label.startswith(('B-', 'I-'))):
                raise AssertionError("label wrong! %s" % label)
            labels[label] = labels.get(label, 0) + 1

    # Written only after every split has passed validation, in the same
    # order as before: train, dev, test.
    outputs = (
        ('train.txt.conll', final_train_text),
        ('dev.txt.conll', final_dev_text),
        ('test.txt.conll', test_text),
    )
    for filename, text in outputs:
        with open(os.path.join(task_dir, filename), mode='w') as f:
            f.write(text)