def build_label_vocab()

in preprocessing/preprocess_i2b2_2010_ner.py [0:0]


def build_label_vocab(base_dirs):
    """Collect the BIO label vocabulary from the ``.con`` files under *base_dirs*.

    Each entry of *base_dirs* must contain a ``concept`` sub-directory. Every
    ``*.con`` file in it is read line by line and each stripped line is passed
    to ``process_concept`` (defined elsewhere in this module). The ``'t'``
    entry of the result is used as the concept type -- presumably the entity
    type string; confirm against ``process_concept``.

    Args:
        base_dirs: Iterable of directory paths to scan.

    Returns:
        A ``(label_vocab, label_vocab_size)`` tuple. ``label_vocab`` maps each
        label string to itself and always contains ``'O'``; every new concept
        type ``T`` adds ``'B-T'`` and ``'I-T'``. ``label_vocab_size`` counts
        only the ``B-``/``I-`` labels added, i.e. it does not include ``'O'``.

    Raises:
        AssertionError: If a base directory has no ``concept`` sub-directory.
    """
    # 'O' is pre-seeded so a concept whose type is literally 'O' never
    # produces 'B-O' / 'I-O' labels.
    seen = {'O'}
    label_vocab = {'O': 'O'}
    label_vocab_size = 0

    for base_dir in base_dirs:
        concept_dir = os.path.join(base_dir, 'concept')

        # Raised explicitly rather than via `assert` so the check still runs
        # when Python is started with -O; the exception type is unchanged.
        if not os.path.isdir(concept_dir):
            raise AssertionError("Directory structure doesn't match!")

        # os.listdir returns names in arbitrary order; sorting makes the
        # insertion order of the labels reproducible across runs and machines.
        con_files = sorted(
            name for name in os.listdir(concept_dir) if name.endswith('.con')
        )

        for con_file in con_files:
            with open(os.path.join(concept_dir, con_file)) as f:
                concepts = [process_concept(line.strip()) for line in f]

            for concept in concepts:
                concept_type = concept['t']
                if concept_type in seen:
                    continue
                seen.add(concept_type)
                # Register the begin/inside tags for the new type, in that order.
                for prefix in ('B', 'I'):
                    tag = '%s-%s' % (prefix, concept_type)
                    label_vocab[tag] = tag
                    label_vocab_size += 1

    return label_vocab, label_vocab_size