# def read_xml_file()
#
# in preprocessing/preprocess_i2b2_2014_ner.py [0:0]


def read_xml_file(xml_path, PHI_tag_type='ALL_CHILDREN', match_text=True):
    """Read an annotated XML note and label every character of its text.

    The note text is taken from the raw file: everything between the
    module-level ``START_CDATA`` and ``END_CDATA`` markers, split into one
    list of characters per physical line. The annotations are taken from the
    single ``TAGS`` element of the parsed XML; each tag's ``start``/``end``
    offsets are mapped back onto (line, character) coordinates of that text.

    Args:
        xml_path: Path (string) of the XML file to read.
        PHI_tag_type: ``'ALL_CHILDREN'`` to use every direct child of the
            ``TAGS`` element; any other value restricts the annotations to
            direct children named ``PHI``.
        match_text: When true, assert that the whitespace-normalised text
            covered by each tag equals the tag's (whitespace-normalised)
            ``text`` attribute.

    Returns:
        A ``(text, PHI_labels)`` pair. ``text`` is a list of lines, each a
        list of single characters. ``PHI_labels`` has the same shape and
        holds ``'O'`` for unannotated characters, ``'B-<TYPE>'`` for the
        first character of a tag and ``'I-<TYPE>'`` for its remaining
        characters, where ``<TYPE>`` is the tag's ``TYPE`` attribute.

    Raises:
        AssertionError: If the document does not contain exactly one ``TAGS``
            element, or if ``match_text`` is true and a tag's text does not
            match the text found at its offsets.
        KeyError: If a tag lacks one of the ``TYPE``/``start``/``end``/
            ``text`` attributes or its offsets fall outside the extracted
            text.
    """
    text, in_text = [], False
    # NOTE(review): the file is opened with the platform default encoding,
    # while ET.parse below honours the XML declaration; confirm the two agree
    # for notes containing non-ASCII characters.
    with open(xml_path, mode='r') as f:
        for raw_line in f:
            if START_CDATA in raw_line:
                remainder = raw_line[raw_line.find(START_CDATA) + len(START_CDATA):]
                if END_CDATA in remainder:
                    # Whole text sits on one line: stop at the closing marker
                    # instead of swallowing it and everything after it.
                    text.append(list(remainder[:remainder.find(END_CDATA)]))
                    break
                text.append(list(remainder))
                in_text = True
            elif END_CDATA in raw_line:
                text.append(list(raw_line[:raw_line.find(END_CDATA)]))
                break
            elif in_text:
                if xml_path.endswith('180-03.xml') and '0808' in raw_line and 'Effingham' in raw_line:
                    # Hard-coded repair for one specific file: pad the line
                    # with four spaces at column 9 so the tag offsets line up.
                    print("Adjusting known error")
                    raw_line = raw_line[:9] + ' ' * 4 + raw_line[9:]

                text.append(list(raw_line))

    # Map 1-based linear character offsets (counted across all extracted
    # lines, newlines included) to (line index, index within line).
    pos_transformer = {}
    linear_pos = 1
    for line_idx, chars in enumerate(text):
        for char_idx in range(len(chars)):
            pos_transformer[linear_pos] = (line_idx, char_idx)
            linear_pos += 1

    xml_parsed = ET.parse(xml_path)
    tag_containers = xml_parsed.findall('TAGS')
    # Kept as an assert (rather than a raise) so the exception type seen by
    # existing callers does not change.
    assert len(tag_containers) == 1, (
        "Expected exactly one TAGS element, found %d" % len(tag_containers)
    )
    tag_container = tag_containers[0]

    # Element.getchildren() was removed in Python 3.9; iterating the element
    # yields the same direct children.
    if PHI_tag_type == 'ALL_CHILDREN':
        PHI_tags = list(tag_container)
    else:
        PHI_tags = tag_container.findall('PHI')

    PHI_labels = [['O'] * len(chars) for chars in text]
    for PHI_tag in PHI_tags:
        base_label = PHI_tag.attrib['TYPE']
        PHI_text = PHI_tag.attrib['text']
        # Shift into the 1-based, end-inclusive positions used by
        # pos_transformer (presumably the attributes are 0-based with an
        # exclusive end -- TODO confirm against the corpus format).
        start_pos = int(PHI_tag.attrib['start']) + 1
        end_pos = int(PHI_tag.attrib['end'])
        PHI_text = ' '.join(PHI_text.split())

        if PHI_text == 'Johnson and Johnson' and xml_path.endswith('188-05.xml'):
            # Hard-coded repair for one specific file whose tag text differs
            # from the note text.
            print("Adjusting known error")
            PHI_text = 'Johnson & Johnson'

        start_line, start_char = pos_transformer[start_pos]
        end_line, end_char = pos_transformer[end_pos]

        if match_text:
            # Rebuild the text covered by the tag, line by line, and compare
            # it with the tag's own text after collapsing all whitespace.
            line = s = e = t = None
            segments = []
            for line in range(start_line, end_line + 1):
                t = text[line]
                s = start_char if line == start_line else 0
                e = end_char if line == end_line else len(t)
                segments.append(''.join(t[s:e + 1]).strip())
            obs_text = ' '.join(' '.join(segments).split())

            assert obs_text == PHI_text, (
                ("Texts don't match! %s v %s" % (PHI_text, obs_text)) + '\n' + str((
                    start_pos, end_pos, line, s, e, t, xml_path
                ))
            )

        inside_label = 'I-%s' % base_label
        # Order matters for one-character tags: 'B-' must overwrite 'I-'.
        PHI_labels[end_line][end_char] = inside_label
        PHI_labels[start_line][start_char] = 'B-%s' % base_label

        # Every character strictly between the first and last one is
        # "inside"; walking linear offsets handles multi-line tags uniformly.
        for pos in range(start_pos + 1, end_pos):
            inner_line, inner_char = pos_transformer[pos]
            PHI_labels[inner_line][inner_char] = inside_label

    return text, PHI_labels