in preprocessing/preprocess_i2b2_2010_ner.py [0:0]
def build_label_vocab(base_dirs):
    """Collect the BIO label vocabulary from the ``.con`` files of each base dir.

    Every ``<base_dir>/concept/*.con`` file is read line by line; each stripped
    line is passed to ``process_concept`` and the ``'t'`` entry of the result
    (presumably the concept type -- confirm against ``process_concept``) is
    used as the label name. The first time a label is encountered, the tags
    ``'B-<label>'`` and ``'I-<label>'`` are added to the vocabulary.

    Args:
        base_dirs: Iterable of directory paths, each expected to contain a
            ``concept`` sub-directory.

    Returns:
        A ``(label_vocab, label_vocab_size)`` tuple. ``label_vocab`` maps every
        tag to itself and always contains ``'O'``. ``label_vocab_size`` counts
        only the ``B-``/``I-`` tags that were added, i.e. it does NOT include
        the pre-seeded ``'O'`` entry.

    Raises:
        AssertionError: If ``<base_dir>/concept`` is not a directory.
    """
    # 'O' is pre-seeded in `seen`, so a concept whose type is literally 'O'
    # never produces 'B-O' / 'I-O' tags.
    seen = {'O'}
    label_vocab = {'O': 'O'}
    label_vocab_size = 0

    for base_dir in base_dirs:
        concept_dir = os.path.join(base_dir, 'concept')

        # Raised explicitly (instead of using `assert`) so the check survives
        # `python -O`; the exception type and message are kept unchanged.
        if not os.path.isdir(concept_dir):
            raise AssertionError("Directory structure doesn't match!")

        # Sorted so that the insertion order of `label_vocab` is reproducible
        # across runs instead of depending on set/hash iteration order.
        ids = sorted(
            name[:-4] for name in os.listdir(concept_dir) if name.endswith('.con')
        )

        for i in ids:
            with open(os.path.join(concept_dir, '%s.con' % i)) as f:
                concepts = [process_concept(line.strip()) for line in f]

            for c in concepts:
                label = c['t']
                if label in seen:
                    continue
                seen.add(label)
                for prefix in ('B', 'I'):
                    tag = '%s-%s' % (prefix, label)
                    label_vocab[tag] = tag  # tag maps to itself, not to an index
                    label_vocab_size += 1

    return label_vocab, label_vocab_size