in preprocessing/preprocess_i2b2_2010_ner.py [0:0]
import os

import numpy as np


def main(beth_dir, partners_dir, test_dir, test_txt_dir, task_dir):
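    """Preprocess the i2b2 2010 concepts (NER) data into train/dev/test files.

    Builds a label vocabulary from the Beth and Partners training sites,
    splits each site 90/10 into train/dev at the patient level, and writes
    both a merged (Beth + Partners) and a Partners-only version of the task.
    """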
    label_vocab, label_vocab_size = build_label_vocab([beth_dir, partners_dir])
    reprocessed_texts = {
        'beth': reformatter(beth_dir, label_vocab),
        'partners': reformatter(partners_dir, label_vocab),
        'test': reformatter(
            test_dir, label_vocab,
            txt_dir=test_txt_dir,
            concept_dir=os.path.join(test_dir, 'concepts')
        ),
    }
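
    # Split each training site 90/10 into train/dev at the patient level;
    # the fixed seed makes the patient shuffles reproducible.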
    np.random.seed(1)
    all_partners_train_ids = np.random.permutation(list(reprocessed_texts['partners'].keys()))
    N = len(all_partners_train_ids)
    N_train = int(0.9 * N)
    partners_train_ids = all_partners_train_ids[:N_train]
    partners_dev_ids = all_partners_train_ids[N_train:]
    print("Partners # Patients: Train: %d, Dev: %d" % (len(partners_train_ids), len(partners_dev_ids)))

    all_beth_train_ids = np.random.permutation(list(reprocessed_texts['beth'].keys()))
    N = len(all_beth_train_ids)
    N_train = int(0.9 * N)
    beth_train_ids = all_beth_train_ids[:N_train]
    beth_dev_ids = all_beth_train_ids[N_train:]
    print("Beth # Patients: Train: %d, Dev: %d" % (len(beth_train_ids), len(beth_dev_ids)))

    print("Merged # Patients: Train: %d, Dev: %d" % (
        len(partners_train_ids) + len(beth_train_ids), len(beth_dev_ids) + len(partners_dev_ids)
    ))
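
    # Shuffle the documents within each split and join them with blank lines;
    # each blank-line-separated block is one sample in the output file.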
    merged_train_txt = '\n\n'.join(np.random.permutation(
        [reprocessed_texts['partners'][i] for i in partners_train_ids] +
        [reprocessed_texts['beth'][i] for i in beth_train_ids]
    ))
    merged_dev_txt = '\n\n'.join(np.random.permutation(
        [reprocessed_texts['partners'][i] for i in partners_dev_ids] +
        [reprocessed_texts['beth'][i] for i in beth_dev_ids]
    ))
    merged_test_txt = '\n\n'.join(np.random.permutation(list(reprocessed_texts['test'].values())))
    print("Merged # Samples: Train: %d, Dev: %d, Test: %d" % (
        len(merged_train_txt.split('\n\n')),
        len(merged_dev_txt.split('\n\n')),
        len(merged_test_txt.split('\n\n'))
    ))
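
    # Partners-only variant; it reuses the full test set from the merged variant.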
    partners_train_txt = '\n\n'.join(np.random.permutation(
        [reprocessed_texts['partners'][i] for i in partners_train_ids]
    ))
    partners_dev_txt = '\n\n'.join(np.random.permutation(
        [reprocessed_texts['partners'][i] for i in partners_dev_ids]
    ))
    partners_test_txt = '\n\n'.join(np.random.permutation(list(reprocessed_texts['test'].values())))
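
    # Output layout: one subdirectory per variant ('merged', 'partners'), each
    # with train/dev/test files; the label vocabulary lives under 'merged'.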
    OUT_FILES = {
        'merged_train': os.path.join(task_dir, 'merged', 'train.tsv'),
        'merged_dev': os.path.join(task_dir, 'merged', 'dev.tsv'),
        'merged_test': os.path.join(task_dir, 'merged', 'test.tsv'),
        'partners_train': os.path.join(task_dir, 'partners', 'train.tsv'),
        'partners_dev': os.path.join(task_dir, 'partners', 'dev.tsv'),
        'partners_test': os.path.join(task_dir, 'partners', 'test.tsv'),
        'vocab': os.path.join(task_dir, 'merged', 'labels.txt'),
    }
    os.makedirs(os.path.join(task_dir, 'merged'), exist_ok=True)
    os.makedirs(os.path.join(task_dir, 'partners'), exist_ok=True)

    with open(OUT_FILES['merged_train'], mode='w') as f: f.write(merged_train_txt)
    with open(OUT_FILES['merged_dev'], mode='w') as f: f.write(merged_dev_txt)
    with open(OUT_FILES['merged_test'], mode='w') as f: f.write(merged_test_txt)
    with open(OUT_FILES['partners_train'], mode='w') as f: f.write(partners_train_txt)
    with open(OUT_FILES['partners_dev'], mode='w') as f: f.write(partners_dev_txt)
    with open(OUT_FILES['partners_test'], mode='w') as f: f.write(partners_test_txt)
    with open(OUT_FILES['vocab'], mode='w') as f: f.write('\n'.join(label_vocab.keys()))