def main()

in preprocessing/preprocess_i2b2_2012_ner.py [0:0]


def main(raw_data_dir, task_dir):
    """Write ``train.txt.conll``, ``dev.txt.conll`` and ``test.txt.conll`` into ``task_dir``.

    The train and dev texts come from a single ``reprocess_event_labels`` call
    on the ``2012-07-15.original-annotation.release`` directory under
    ``raw_data_dir`` (``dev_set_size=0.1``, ``match_text=True``). The test text
    comes from a second call on the
    ``2012-08-08.test-data.event-timex-groundtruth/xml`` directory
    (``match_text=False``, ``dev_set_size=None``); the second value returned
    by that call is discarded.

    Before anything is written, the last whitespace-separated token of every
    non-empty line in all three texts is checked to be ``O`` or to start with
    ``B-`` / ``I-``.

    Args:
        raw_data_dir: Directory containing the two raw data folders named above.
        task_dir: Directory the three ``.conll`` files are written to.

    Raises:
        AssertionError: If a line ends in a token that is not a valid label.
    """
    train_sources = [
        os.path.join(raw_data_dir, '2012-07-15.original-annotation.release'),
    ]
    final_train_text, final_dev_text = reprocess_event_labels(
        train_sources, dev_set_size=0.1, match_text=True
    )

    test_sources = [
        os.path.join(raw_data_dir, '2012-08-08.test-data.event-timex-groundtruth/xml'),
    ]
    test_text, _ = reprocess_event_labels(
        test_sources, match_text=False, dev_set_size=None
    )

    # Sanity-check every label and tally how often each one occurs.
    # The tally is not read again in the visible part of this function.
    labels = {}
    for split_text in (final_train_text, final_dev_text, test_text):
        for line in split_text.split('\n'):
            if not line:
                continue
            label = line.split()[-1]
            assert label == 'O' or label.startswith(('B-', 'I-')), "label wrong! %s" % label
            labels[label] = labels.get(label, 0) + 1

    # Written in the same order as before: train, then dev, then test.
    outputs = (
        ('train.txt.conll', final_train_text),
        ('dev.txt.conll', final_dev_text),
        ('test.txt.conll', test_text),
    )
    for filename, text in outputs:
        with open(os.path.join(task_dir, filename), mode='w') as f:
            f.write(text)