in preprocessing/preprocess_i2b2_2014_ner.py [0:0]
def _tally_conll_labels(texts):
    """Validate and count the label column of CoNLL-style text blobs.

    The label of a line is its last whitespace-separated token. Lines that
    are empty or contain only whitespace are skipped.

    Args:
        texts: iterable of strings, each holding newline-separated lines.

    Returns:
        dict mapping every label seen to the number of lines carrying it.

    Raises:
        AssertionError: if a label is neither 'O' nor prefixed with
            'B-' or 'I-'.
    """
    counts = {}
    for text in texts:
        for line in text.split('\n'):
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line (e.g. a stray '\r'): there is
                # no label to check, and indexing tokens[-1] would fail.
                continue
            label = tokens[-1]
            if label != 'O' and not label.startswith(('B-', 'I-')):
                # Raised explicitly instead of via `assert` so the check is
                # not stripped when Python runs with -O; the exception type
                # and message match the previous assert.
                raise AssertionError("label wrong! %s" % label)
            counts[label] = counts.get(label, 0) + 1
    return counts


def main(gold_set_1_dir, gold_set_2_dir, test_gold_set_dir, task_dir):
    """Build the train/dev/test CoNLL files for the task and write them out.

    The two gold-set directories are passed together to
    ``reprocess_PHI_labels`` to obtain the train and dev texts; the test
    gold-set directory is processed on its own and only the first returned
    text is kept. Every label in the three texts is validated before any
    file is written, so a malformed label aborts the run without leaving
    partial output.

    Args:
        gold_set_1_dir: first gold-set directory used for train/dev.
        gold_set_2_dir: second gold-set directory used for train/dev.
        test_gold_set_dir: gold-set directory used for the test split.
        task_dir: directory receiving ``train.txt.conll``,
            ``dev.txt.conll`` and ``test.txt.conll``.

    Raises:
        AssertionError: if any line's label is not 'O' and does not start
            with 'B-' or 'I-'.
    """
    # NOTE(review): dev_set_size=0.1 presumably holds out 10% of the combined
    # gold sets as the dev split -- confirm against reprocess_PHI_labels.
    final_train_text, final_dev_text = reprocess_PHI_labels(
        [gold_set_1_dir, gold_set_2_dir], PHI_tag_type='ALL_CHILDREN',
        dev_set_size=0.1, match_text=True
    )
    test_text, _ = reprocess_PHI_labels(
        [test_gold_set_dir], PHI_tag_type='ALL_CHILDREN', match_text=False, dev_set_size=None
    )

    # Validate first, write second. The per-label counts are kept in `labels`
    # as in the original code, although nothing below reads them.
    labels = _tally_conll_labels((final_train_text, final_dev_text, test_text))

    # NOTE(review): files are written with the platform default encoding;
    # consider passing an explicit encoding if the text can be non-ASCII.
    outputs = (
        ('train.txt.conll', final_train_text),
        ('dev.txt.conll', final_dev_text),
        ('test.txt.conll', test_text),
    )
    for filename, text in outputs:
        with open(os.path.join(task_dir, filename), mode='w') as f:
            f.write(text)