in preprocessing/preprocess_i2b2_2012_ner.py [0:0]
def main(raw_data_dir, task_dir):
    """Build the train/dev/test CoNLL texts, validate their labels, and write them out.

    The training release is processed once to produce both the train and the
    dev text; the test release is processed separately. Every non-blank line
    of all three texts is checked before anything is written, so no output
    file is created when a label is malformed.

    Args:
        raw_data_dir: Directory containing the
            ``2012-07-15.original-annotation.release`` and
            ``2012-08-08.test-data.event-timex-groundtruth/xml`` sub-paths.
        task_dir: Directory that receives ``train.txt.conll``,
            ``dev.txt.conll`` and ``test.txt.conll``. It is not created here,
            so it must already exist.

    Raises:
        AssertionError: If the last whitespace-separated field of a non-blank
            line is neither ``O`` nor prefixed with ``B-`` / ``I-``.
    """
    # NOTE(review): dev_set_size=0.1 presumably holds out 10% of the training
    # release for the dev split -- confirm against reprocess_event_labels.
    final_train_text, final_dev_text = reprocess_event_labels(
        [os.path.join(raw_data_dir, '2012-07-15.original-annotation.release')],
        dev_set_size=0.1,
        match_text=True,
    )
    # No dev split is requested for the test release; the second return value
    # is discarded.
    test_text, _ = reprocess_event_labels(
        [os.path.join(raw_data_dir, '2012-08-08.test-data.event-timex-groundtruth/xml')],
        match_text=False,
        dev_set_size=None,
    )

    # Tally how often each label occurs while validating its shape.
    labels = {}
    for s in (final_train_text, final_dev_text, test_text):
        for line in s.split('\n'):
            fields = line.split()
            # Skip blank lines. Whitespace-only lines are skipped as well;
            # indexing their (empty) field list would raise IndexError.
            if not fields:
                continue
            label = fields[-1]
            # Raise explicitly rather than via an ``assert`` statement so the
            # check is still enforced when Python runs with -O.
            if label != 'O' and not label.startswith(('B-', 'I-')):
                raise AssertionError("label wrong! %s" % label)
            labels[label] = labels.get(label, 0) + 1

    with open(os.path.join(task_dir, 'train.txt.conll'), mode='w') as f:
        f.write(final_train_text)
    with open(os.path.join(task_dir, 'dev.txt.conll'), mode='w') as f:
        f.write(final_dev_text)
    with open(os.path.join(task_dir, 'test.txt.conll'), mode='w') as f:
        f.write(test_text)