in preprocessing/preprocess_i2b2_2012_ner.py [0:0]
def read_xml_file(xml_path, event_tag_type='ALL_CHILDREN', match_text=True):
    """Read one annotated XML record; return its text and per-character EVENT labels.

    The narrative is taken from the raw file lines lying between
    ``START_CDATA`` and ``END_CDATA``.  Every ``EVENT`` element found under
    the single ``TAGS`` container is then projected onto that text as
    character-level ``B-<type>`` / ``I-<type>`` labels.

    Args:
        xml_path: Path of the XML file to read.
        event_tag_type: Not used by this function; kept so that existing
            callers passing it keep working.
        match_text: When true, check that the characters spanned by each
            EVENT's ``start``/``end`` offsets equal its ``text`` attribute
            (both compared after collapsing runs of whitespace).

    Returns:
        A ``(text, event_labels)`` pair.  ``text`` is a list of lines, each a
        list of single characters.  ``event_labels`` has the same shape and
        holds ``'O'``, ``'B-<type>'`` or ``'I-<type>'`` for every character.

    Raises:
        AssertionError: If the document does not contain exactly one ``TAGS``
            element, or if ``match_text`` is set and an EVENT's text differs
            from the characters found at its offsets.
        KeyError: If an EVENT offset does not map to a character of the
            extracted text, or a required attribute is missing.
    """
    with open(xml_path, mode='r') as f:
        lines = f.readlines()

    # Collect the narrative as lists of characters, one list per file line:
    # the tail of the opening line, every line in between, and the head of
    # the closing line.
    text, in_text = [], False
    for l in lines:
        if START_CDATA in l:
            text.append(list(l[l.find(START_CDATA) + len(START_CDATA):]))
            in_text = True
        elif END_CDATA in l:
            text.append(list(l[:l.find(END_CDATA)]))
            break
        elif in_text:
            text.append(list(l))

    # Map each 1-based linear character offset to its (line, column) in `text`.
    pos_transformer = {}
    linear_pos = 1
    for line, sentence in enumerate(text):
        for char_pos in range(len(sentence)):
            pos_transformer[linear_pos] = (line, char_pos)
            linear_pos += 1

    try:
        root = ET.parse(xml_path).getroot()
    except ET.ParseError:
        # The file is not well-formed XML.  Drop everything from '<TEXT>' up
        # to '</TEXT>' and neutralise every '&' (presumably the cause of the
        # parse failure -- TODO confirm), then parse what is left.  This is
        # done on the file's bytes in memory, so no scratch file is written
        # to the working directory and concurrent runs cannot collide.
        with open(xml_path, mode='rb') as f:
            raw = f.read()
        before = raw.split(b'<TEXT>')[0]
        after = raw.split(b'</TEXT>')[1]
        root = ET.fromstring((before + after).replace(b'&', b'AMPERSAND'))

    tag_containers = root.findall('TAGS')
    # Raised explicitly (rather than via `assert`) so the check still runs
    # under `python -O`.
    if len(tag_containers) != 1:
        raise AssertionError("Found multiple tag sets!")
    tag_container = tag_containers[0]

    event_tags = tag_container.findall('EVENT')
    event_labels = [['O'] * len(sentence) for sentence in text]

    for event_tag in event_tags:
        base_label = event_tag.attrib['type']
        start_pos, end_pos, event_text = (
            event_tag.attrib['start'], event_tag.attrib['end'], event_tag.attrib['text']
        )
        # `pos_transformer` is 1-based and the span is used inclusively on
        # both ends, hence the +1 on the start offset only.
        start_pos, end_pos = int(start_pos) + 1, int(end_pos)
        event_text = ' '.join(event_text.split())
        # Undo the placeholder introduced by the fallback parse above.
        event_text = event_text.replace('AMPERSAND', '&')

        (start_line, start_char), (end_line, end_char) = (
            pos_transformer[start_pos], pos_transformer[end_pos]
        )

        # Rebuild the text actually covered by the span, line by line.
        obs_text = []
        for line in range(start_line, end_line + 1):
            t = text[line]
            s = start_char if line == start_line else 0
            e = end_char if line == end_line else len(t)
            obs_text.append(''.join(t[s:e + 1]).strip())
        obs_text = ' '.join(obs_text)
        obs_text = ' '.join(obs_text.split())

        # The raw narrative can still carry XML entities while the parsed
        # attribute value has them decoded; re-encode the attribute so the
        # two sides are comparable.
        # NOTE(review): the entity literals here had been reduced to plain
        # quote characters (making both replacements no-ops); restored to
        # '&apos;' / '&quot;' -- confirm against the upstream file.
        if '&apos;' in obs_text and '&apos;' not in event_text:
            event_text = event_text.replace("'", '&apos;')
        if '&quot;' in obs_text and '&quot;' not in event_text:
            event_text = event_text.replace('"', '&quot;')

        if match_text and obs_text != event_text:
            raise AssertionError(
                ("Texts don't match! %s v %s" % (event_text, obs_text)) + '\n' + str((
                    start_pos, end_pos, line, s, e, t, xml_path
                ))
            )

        # Events without a type are checked above but not labelled.
        if base_label.strip() == '':
            continue

        # Order matters: for a one-character span the 'B-' written second
        # must win over the 'I-' written first.
        event_labels[end_line][end_char] = 'I-%s' % base_label
        event_labels[start_line][start_char] = 'B-%s' % base_label

        # Fill everything strictly between the two end points with 'I-'.
        for line in range(start_line, end_line + 1):
            t = text[line]
            s = start_char + 1 if line == start_line else 0
            e = end_char - 1 if line == end_line else len(t) - 1
            for i in range(s, e + 1):
                event_labels[line][i] = 'I-%s' % base_label

    return text, event_labels