def main()

in preprocessing/preprocess_i2b2_2010_ner.py


import os

import numpy as np


def main(beth_dir, partners_dir, test_dir, test_txt_dir, task_dir):
    """Preprocess the i2b2 2010 NER corpus into merged and Partners-only
    train/dev/test TSV splits under task_dir, plus a label vocabulary file."""
    # Build the label vocabulary from the two training corpora.
    # (build_label_vocab and reformatter are assumed to be defined elsewhere
    # in this module.)
    label_vocab, label_vocab_size = build_label_vocab([beth_dir, partners_dir])

    # Reformat each corpus into a {record id: reformatted text} mapping.
    reprocessed_texts = {
        'beth':     reformatter(beth_dir, label_vocab),
        'partners': reformatter(partners_dir, label_vocab),
        'test':     reformatter(
            test_dir, label_vocab,
            txt_dir=test_txt_dir,
            concept_dir=os.path.join(test_dir, 'concepts')
        ),
    }
    # Fix the RNG seed so the shuffles and 90/10 splits are reproducible.
    np.random.seed(1)

    # Shuffle Partners record ids and carve out a 90/10 train/dev split.
    all_partners_train_ids = np.random.permutation(list(reprocessed_texts['partners'].keys()))
    N = len(all_partners_train_ids)
    N_train = int(0.9 * N)

    partners_train_ids = all_partners_train_ids[:N_train]
    partners_dev_ids = all_partners_train_ids[N_train:]
    print("Partners # Patients: Train: %d, Dev: %d" % (len(partners_train_ids), len(partners_dev_ids)))

    # Repeat the 90/10 split for the Beth training set.
    all_beth_train_ids = np.random.permutation(list(reprocessed_texts['beth'].keys()))
    N = len(all_beth_train_ids)
    N_train = int(0.9 * N)

    beth_train_ids = all_beth_train_ids[:N_train]
    beth_dev_ids = all_beth_train_ids[N_train:]
    print("Beth # Patients: Train: %d, Dev: %d" % (len(beth_train_ids), len(beth_dev_ids)))

    print("Merged # Patients: Train: %d, Dev: %d" % (
      len(partners_train_ids) + len(beth_train_ids), len(beth_dev_ids) + len(partners_dev_ids)
    ))

    # Shuffle the record texts and join them, blank-line separated, into one blob per split.
    merged_train_txt = '\n\n'.join(np.random.permutation(
        [reprocessed_texts['partners'][i] for i in partners_train_ids] +
        [reprocessed_texts['beth'][i] for i in beth_train_ids]
    ))
    merged_dev_txt = '\n\n'.join(np.random.permutation(
        [reprocessed_texts['partners'][i] for i in partners_dev_ids] +
        [reprocessed_texts['beth'][i] for i in beth_dev_ids]
    ))
    merged_test_txt = '\n\n'.join(np.random.permutation(list(reprocessed_texts['test'].values())))

    print("Merged # Samples: Train: %d, Dev: %d, Test: %d" % (
        len(merged_train_txt.split('\n\n')),
        len(merged_dev_txt.split('\n\n')),
        len(merged_test_txt.split('\n\n'))
    ))

    # Partners-only splits; the test set is the same one used for the merged task.
    partners_train_txt = '\n\n'.join(np.random.permutation(
        [reprocessed_texts['partners'][i] for i in partners_train_ids]
    ))
    partners_dev_txt = '\n\n'.join(np.random.permutation(
        [reprocessed_texts['partners'][i] for i in partners_dev_ids]
    ))
    partners_test_txt = '\n\n'.join(np.random.permutation(list(reprocessed_texts['test'].values())))

    OUT_FILES = {
        'merged_train':   os.path.join(task_dir, 'merged', 'train.tsv'),
        'merged_dev':     os.path.join(task_dir, 'merged', 'dev.tsv'),
        'merged_test':    os.path.join(task_dir, 'merged', 'test.tsv'),
        'partners_train': os.path.join(task_dir, 'partners', 'train.tsv'),
        'partners_dev':   os.path.join(task_dir, 'partners', 'dev.tsv'),
        'partners_test':  os.path.join(task_dir, 'partners', 'test.tsv'),
        'vocab':          os.path.join(task_dir, 'merged', 'labels.txt'),
    }
    os.makedirs(os.path.join(task_dir, 'merged'), exist_ok=True)
    os.makedirs(os.path.join(task_dir, 'partners'), exist_ok=True)

    # Write out every split, plus the label vocabulary (one label per line).
    with open(OUT_FILES['merged_train'], mode='w') as f: f.write(merged_train_txt)
    with open(OUT_FILES['merged_dev'], mode='w') as f: f.write(merged_dev_txt)
    with open(OUT_FILES['merged_test'], mode='w') as f: f.write(merged_test_txt)
    with open(OUT_FILES['partners_train'], mode='w') as f: f.write(partners_train_txt)
    with open(OUT_FILES['partners_dev'], mode='w') as f: f.write(partners_dev_txt)
    with open(OUT_FILES['partners_test'], mode='w') as f: f.write(partners_test_txt)
    with open(OUT_FILES['vocab'], mode='w') as f: f.write('\n'.join(label_vocab.keys()))
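
For reference, a minimal sketch of how main() might be driven from the command line, assuming the five directory arguments in its signature. The argparse wrapper and flag names below are hypothetical, not part of the original script:

if __name__ == '__main__':
    import argparse

    # Hypothetical CLI wrapper; flag names are assumptions, not the script's own.
    parser = argparse.ArgumentParser(
        description='Preprocess the i2b2 2010 NER corpus into TSV splits.')
    parser.add_argument('--beth_dir', required=True)
    parser.add_argument('--partners_dir', required=True)
    parser.add_argument('--test_dir', required=True)
    parser.add_argument('--test_txt_dir', required=True)
    parser.add_argument('--task_dir', required=True)
    args = parser.parse_args()

    main(args.beth_dir, args.partners_dir, args.test_dir,
         args.test_txt_dir, args.task_dir)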