in preprocessing/preprocess_i2b2_2012_ner.py [0:0]
def reprocess_event_labels(folders, base_path='.', event_tag_type='event', match_text=True, dev_set_size=None):
    """Build "word label" training/dev text from per-patient XML annotation files.

    Every file whose name ends in ``xml`` inside ``base_path/<folder>`` is
    parsed with ``read_xml_file``; the per-character output is collapsed with
    ``merge_into_words`` and accumulated per patient. Patients (not individual
    sentences) are then split into a train and a dev group.

    Args:
        folders: Iterable of folder names, each joined onto ``base_path``.
        base_path: Directory the folder names are relative to.
        event_tag_type: Forwarded unchanged to ``read_xml_file``.
        match_text: Forwarded unchanged to ``read_xml_file``.
        dev_set_size: Fraction of patients to hold out for the dev set.
            ``None`` puts every patient in the train set.

    Returns:
        A ``(train_text, dev_text)`` tuple of strings. Inside each string a
        sequence is rendered as one ``"<word> <label>"`` line per item, and
        consecutive sequences are separated by a blank line.

    Note:
        The split relies on ``np.random.permutation`` and this function does
        not seed the generator, so the caller must seed NumPy beforehand if a
        reproducible split is needed.
    """

    def _flatten(per_patient, patient_ids):
        # Concatenate the per-patient lists in the order the ids are given.
        return [item for pid in patient_ids for item in per_patient[pid]]

    def _render(sequences, tag_sequences):
        # One "<word> <label>" line per item; a blank line between sequences.
        rendered = [
            '\n'.join('%s %s' % pair for pair in zip(words, tags))
            for words, tags in zip(sequences, tag_sequences)
        ]
        return '\n\n'.join(rendered)

    words_by_patient, tags_by_patient = {}, {}
    for folder in folders:
        directory = os.path.join(base_path, folder)
        for entry in os.listdir(directory):
            if not entry.endswith('xml'):
                continue
            # The patient id is the file name with its last four characters
            # (the ".xml" suffix) removed.
            patient_id = int(entry[:-4])
            parsed = read_xml_file(
                os.path.join(directory, entry),
                event_tag_type=event_tag_type,
                match_text=match_text,
            )
            if parsed is None:
                # read_xml_file returned nothing for this file; skip it.
                continue
            chars, char_tags = parsed
            words, word_tags = merge_into_words(chars, char_tags)
            # A patient number may show up in more than one folder, so append
            # to whatever has already been collected for it.
            words_by_patient.setdefault(patient_id, []).extend(words)
            tags_by_patient.setdefault(patient_id, []).extend(word_tags)

    patient_ids = set(words_by_patient)
    if dev_set_size is None:
        train_ids, dev_ids = list(patient_ids), []
    else:
        # Split at patient level so a patient never straddles both sets.
        n_train = int(len(patient_ids) * (1 - dev_set_size))
        shuffled = np.random.permutation(list(patient_ids))
        train_ids, dev_ids = list(shuffled[:n_train]), list(shuffled[n_train:])

    train_output = _render(
        _flatten(words_by_patient, train_ids),
        _flatten(tags_by_patient, train_ids),
    )
    dev_output = _render(
        _flatten(words_by_patient, dev_ids),
        _flatten(tags_by_patient, dev_ids),
    )
    return train_output, dev_output