def reformatter()

in preprocessing/preprocess_i2b2_2010_ner.py [0:0]


def reformatter(base, label_vocab, txt_dir=None, concept_dir=None):
    """Rewrite paired text/concept files as token-per-line labelled text.

    For every document id that has both ``<id>.txt`` in ``txt_dir`` and
    ``<id>.con`` in ``concept_dir``, the text is split into space-separated
    tokens and every token is tagged. Tokens covered by a concept receive
    ``label_vocab['B-<type>']`` (first token of the concept) or
    ``label_vocab['I-<type>']`` (all remaining tokens); every other token
    keeps the literal tag ``'O'``.

    Args:
        base: Directory used to derive the default ``txt`` and ``concept``
            sub-directories.
        label_vocab: Mapping indexed with ``'B-<type>'`` / ``'I-<type>'``
            keys; the looked-up values are written into the output.
        txt_dir: Directory with the ``.txt`` files. Defaults to
            ``os.path.join(base, 'txt')``.
        concept_dir: Directory with the ``.con`` files. Defaults to
            ``os.path.join(base, 'concept')``.

    Returns:
        dict mapping each document id to a string in which every token is
        written as ``"<token> <label>"`` on its own line and the tokens of
        different source lines are separated by a blank line.

    Raises:
        AssertionError: If either directory is missing, if the id sets of
            the two directories differ, or if the tokens selected by a
            single-line concept do not match the concept's text.
    """
    if txt_dir is None:
        txt_dir = os.path.join(base, 'txt')
    if concept_dir is None:
        concept_dir = os.path.join(base, 'concept')
    assert os.path.isdir(txt_dir) and os.path.isdir(concept_dir), "Directory structure doesn't match!"

    # Strip the 4-character extension to obtain the document ids.
    txt_ids = {x[:-4] for x in os.listdir(txt_dir) if x.endswith('.txt')}
    concept_ids = {x[:-4] for x in os.listdir(concept_dir) if x.endswith('.con')}

    assert txt_ids == concept_ids, (
        "id set doesn't match: txt - concept = %s, concept - txt = %s"
        "" % (str(txt_ids - concept_ids), str(concept_ids - txt_ids))
    )

    reprocessed_texts = {}
    # Sorted so the returned dict has a reproducible insertion order
    # (plain set iteration order over strings varies between runs).
    for i in sorted(txt_ids):
        with open(os.path.join(txt_dir, '%s.txt' % i), mode='r') as f:
            lines = f.readlines()
        txt = [[tok for tok in raw.strip().split(' ') if tok.strip() != ''] for raw in lines]
        # Concept positions are shifted by -1 on lines that begin with a
        # space -- presumably because the annotation counts the empty token
        # in front of that space (TODO confirm against the annotation spec).
        line_starts_with_space = [raw.startswith(' ') for raw in lines]

        with open(os.path.join(concept_dir, '%s.con' % i), mode='r') as f:
            # process_concept is expected to return a dict with the keys
            # 'start_line', 'end_line', 'start_pos', 'end_pos', 'c' and 't'.
            concepts = [process_concept(x.strip()) for x in f.readlines()]

        # NOTE(review): the default tag is the literal 'O', not
        # label_vocab['O'] -- confirm this matches the vocabulary in use.
        labels = [['O'] * len(tokens) for tokens in txt]
        for c in concepts:
            # Line numbers in the concept are 1-based; positions are
            # inclusive token indices.
            first_line = c['start_line'] - 1
            last_line = c['end_line'] - 1

            if first_line == last_line:
                # Sanity check: the selected tokens must spell the concept.
                p_modifier = -1 if line_starts_with_space[first_line] else 0
                line = first_line
                text = (' '.join(
                    txt[line][c['start_pos'] + p_modifier:c['end_pos'] + 1 + p_modifier]
                )).lower()
                assert text == c['c'], (
                    "Text mismatch! %s vs. %s (id: %s, line: %d)\nFull line: %s"
                    "" % (c['c'], text, i, line, txt[line])
                )

            begin_label = label_vocab['B-%s' % c['t']]
            inside_label = label_vocab['I-%s' % c['t']]

            for line in range(first_line, last_line + 1):
                p_modifier = -1 if line_starts_with_space[line] else 0
                # The concept covers [start_pos, end_pos) on this line: it
                # starts at token 0 on continuation lines and runs to the end
                # of the line on every line but the last.
                start_pos = c['start_pos'] + p_modifier if line == first_line else 0
                end_pos = c['end_pos'] + 1 + p_modifier if line == last_line else len(txt[line])

                # Tag the whole covered range as "inside" first so that the
                # edge tokens of continuation lines are not left as 'O' ...
                for j in range(start_pos, end_pos):
                    labels[line][j] = inside_label
                # ... then mark the very first token of the concept.
                if line == first_line:
                    labels[line][start_pos] = begin_label

        # One "<token> <label>" per line; source lines separated by a blank line.
        out_str = '\n\n'.join(
            '\n'.join('%s %s' % pair for pair in zip(tokens, tags))
            for tokens, tags in zip(txt, labels)
        )

        reprocessed_texts[i] = out_str

    return reprocessed_texts