def read_xml_file()

in preprocessing/preprocess_i2b2_2012_ner.py [0:0]


def read_xml_file(xml_path, event_tag_type='ALL_CHILDREN', match_text=True):
    """Read an annotated XML file and build character-level BIO event labels.

    The document body is taken from the lines between ``START_CDATA`` and
    ``END_CDATA`` and stored as one list of characters per line.  Each
    ``EVENT`` element found under the single ``TAGS`` element is then mapped
    onto those characters: the first character of the span is labelled
    ``B-<type>`` and every other character of the span ``I-<type>``; all
    remaining characters keep the label ``'O'``.

    Args:
        xml_path: Path of the XML file to read.
        event_tag_type: Accepted for interface compatibility; not used by
            this function.
        match_text: When true, assert that the text found at each event's
            offsets equals the event's ``text`` attribute (after whitespace
            normalisation).

    Returns:
        A ``(text, event_labels)`` tuple.  ``text`` is a list of lines, each
        a list of single characters; ``event_labels`` has the same shape and
        holds the label string for each character.

    Raises:
        AssertionError: If the file does not contain exactly one ``TAGS``
            element, or if ``match_text`` is true and an event's text does
            not match the document text at its offsets.
    """
    # Local imports: only needed by the malformed-XML fallback below.
    import os
    import tempfile

    # Collect the document body, one list of characters per source line.
    text, in_text = [], False
    with open(xml_path, mode='r') as f:
        for raw_line in f:
            if START_CDATA in raw_line:
                body = raw_line[raw_line.find(START_CDATA) + len(START_CDATA):]
                if END_CDATA in body:
                    # Start and end markers share a line: the body is only
                    # what lies between them.
                    text.append(list(body[:body.find(END_CDATA)]))
                    break
                text.append(list(body))
                in_text = True
            elif END_CDATA in raw_line:
                text.append(list(raw_line[:raw_line.find(END_CDATA)]))
                break
            elif in_text:
                text.append(list(raw_line))

    # Map a 1-based linear character offset to its (line, column) position.
    pos_transformer = {}
    linear_pos = 1
    for line_idx, sentence in enumerate(text):
        for char_idx in range(len(sentence)):
            pos_transformer[linear_pos] = (line_idx, char_idx)
            linear_pos += 1

    try:
        xml_parsed = ET.parse(xml_path)
    except ET.ParseError:
        # The file is not well-formed XML.  Drop the <TEXT> section, mask
        # every '&' so the remainder parses, and parse that copy instead.
        with open(xml_path) as f:
            txt = f.read()
        before = txt.split('<TEXT>')[0]
        after = txt.split('</TEXT>')[1]
        # A uniquely named scratch file avoids clobbering a fixed path in the
        # working directory and is safe when several files are processed at
        # once; it is always removed afterwards.
        scratch = tempfile.NamedTemporaryFile(mode='w', suffix='.xml', delete=False)
        try:
            with scratch:
                scratch.write((before + after).replace('&', 'AMPERSAND'))
            xml_parsed = ET.parse(scratch.name)
        finally:
            os.remove(scratch.name)

    tag_containers = xml_parsed.findall('TAGS')
    assert len(tag_containers) == 1, "Found multiple tag sets!"
    tag_container = tag_containers[0]

    event_tags = tag_container.findall('EVENT')
    event_labels = [['O'] * len(sentence) for sentence in text]
    for event_tag in event_tags:
        attrs = event_tag.attrib
        base_label = attrs['type']
        # 'start' is shifted by one to match the 1-based position map; 'end'
        # is used as-is and treated as the last character of the span.
        start_pos, end_pos = int(attrs['start']) + 1, int(attrs['end'])
        # Collapse whitespace runs and undo the '&' masking applied by the
        # malformed-XML fallback.
        event_text = ' '.join(attrs['text'].split())
        event_text = event_text.replace('AMPERSAND', '&')

        (start_line, start_char), (end_line, end_char) = pos_transformer[start_pos], pos_transformer[end_pos]

        # Re-read the span from the document text so it can be compared with
        # the annotation's own text.
        obs_text = []
        for line in range(start_line, end_line + 1):
            t = text[line]
            s = start_char if line == start_line else 0
            e = end_char if line == end_line else len(t)
            obs_text.append(''.join(t[s:e + 1]).strip())
        obs_text = ' '.join(obs_text)
        obs_text = ' '.join(obs_text.split())

        # The document text may hold escaped quotes where the parsed
        # attribute holds the literal character; align them before comparing.
        if '&apos;' in obs_text and '&apos;' not in event_text:
            event_text = event_text.replace("'", "&apos;")
        if '&quot;' in obs_text and '&quot;' not in event_text:
            event_text = event_text.replace('"', '&quot;')

        if match_text:
            assert obs_text == event_text, (
                ("Texts don't match! %s v %s" % (event_text, obs_text)) + '\n' + str((
                    start_pos, end_pos, line, s, e, t, xml_path
                ))
            )

        if base_label.strip() == '':
            continue

        inside_label = 'I-%s' % base_label
        # Order matters: for a one-character span the B- label must win.
        event_labels[end_line][end_char] = inside_label
        event_labels[start_line][start_char] = 'B-%s' % base_label

        # Fill everything strictly between the first and last character.
        for line in range(start_line, end_line + 1):
            t = text[line]
            s = start_char + 1 if line == start_line else 0
            e = end_char - 1 if line == end_line else len(t) - 1
            for i in range(s, e + 1):
                event_labels[line][i] = inside_label

    return text, event_labels