in preprocessing/preprocess_i2b2_2014_ner.py [0:0]
def read_xml_file(xml_path, PHI_tag_type='ALL_CHILDREN', match_text=True):
    """Read one annotated XML record and build character-level BIO labels.

    The text between the ``START_CDATA`` and ``END_CDATA`` markers is read
    line by line, and every line is stored as a list of single characters.
    The annotation elements below the record's single ``TAGS`` element are
    then projected onto those characters using their ``start``/``end``
    offsets.

    Args:
        xml_path: Path of the XML file to read.
        PHI_tag_type: ``'ALL_CHILDREN'`` uses every direct child of ``TAGS``;
            any other value restricts the annotations to ``PHI`` elements.
        match_text: When true, verify that the characters selected by each
            annotation's offsets equal its ``text`` attribute (compared
            after collapsing runs of whitespace).

    Returns:
        A ``(text, PHI_labels)`` tuple. ``text`` is a list with one entry per
        line of the CDATA section, each a list of characters. ``PHI_labels``
        has the same shape and holds ``'O'``, ``'B-<TYPE>'`` or
        ``'I-<TYPE>'`` for every character.

    Raises:
        AssertionError: If the file does not contain exactly one ``TAGS``
            element, or if ``match_text`` is true and an annotation's text
            does not match the characters at its offsets.
        KeyError: If an annotation lacks a required attribute or its offsets
            fall outside the extracted text.
    """
    text, in_text = [], False
    # NOTE(review): the file is decoded with the platform default encoding,
    # while ET.parse below honours the XML declaration -- confirm both agree
    # for this corpus, since the offsets are counted in decoded characters.
    with open(xml_path, mode='r') as f:
        for l in f:
            if START_CDATA in l:
                # Keep only what follows the opening marker on this line.
                text.append(list(l[l.find(START_CDATA) + len(START_CDATA):]))
                in_text = True
            elif END_CDATA in l:
                # Keep only what precedes the closing marker, then stop.
                text.append(list(l[:l.find(END_CDATA)]))
                break
            elif in_text:
                if xml_path.endswith('180-03.xml') and '0808' in l and 'Effingham' in l:
                    # Hand-made correction for one specific record: pad the
                    # line with four spaces after its ninth character.
                    print("Adjusting known error")
                    l = l[:9] + ' ' * 4 + l[9:]
                text.append(list(l))

    # Map a 1-based offset into the flattened text to (line index, column).
    pos_transformer = {}
    linear_pos = 1
    for line_idx, sentence in enumerate(text):
        for char_pos in range(len(sentence)):
            pos_transformer[linear_pos] = (line_idx, char_pos)
            linear_pos += 1

    xml_parsed = ET.parse(xml_path)
    tag_containers = xml_parsed.findall('TAGS')
    # Raised explicitly (rather than via ``assert``) so that the check is
    # still performed when Python runs with -O.
    if len(tag_containers) != 1:
        raise AssertionError("Found multiple tag sets!")
    tag_container = tag_containers[0]

    # ``Element.getchildren()`` was removed in Python 3.9; iterating the
    # element yields the same direct children.
    if PHI_tag_type == 'ALL_CHILDREN':
        PHI_tags = list(tag_container)
    else:
        PHI_tags = tag_container.findall('PHI')

    PHI_labels = [['O'] * len(sentence) for sentence in text]
    for PHI_tag in PHI_tags:
        base_label = PHI_tag.attrib['TYPE']
        start_pos, end_pos, PHI_text = PHI_tag.attrib['start'], PHI_tag.attrib['end'], PHI_tag.attrib['text']
        # Convert to 1-based offsets of the first and the last character
        # covered by the annotation (both inclusive).
        start_pos, end_pos = int(start_pos) + 1, int(end_pos)
        PHI_text = ' '.join(PHI_text.split())
        if PHI_text == 'Johnson and Johnson' and xml_path.endswith('188-05.xml'):
            # Hand-made correction for one specific record.
            print("Adjusting known error")
            PHI_text = 'Johnson & Johnson'

        (start_line, start_char), (end_line, end_char) = pos_transformer[start_pos], pos_transformer[end_pos]

        # Re-assemble the characters covered by the annotation, line by line.
        obs_text = []
        for line in range(start_line, end_line + 1):
            t = text[line]
            s = start_char if line == start_line else 0
            e = end_char if line == end_line else len(t)
            obs_text.append(''.join(t[s:e + 1]).strip())
        obs_text = ' '.join(obs_text)
        obs_text = ' '.join(obs_text.split())

        if match_text and obs_text != PHI_text:
            # ``line``, ``s``, ``e`` and ``t`` carry the values of the last
            # iteration of the loop above and are reported for debugging.
            raise AssertionError(
                ("Texts don't match! %s v %s" % (PHI_text, obs_text)) + '\n' + str((
                    start_pos, end_pos, line, s, e, t, xml_path
                ))
            )

        # Label the last character first so that a one-character annotation
        # ends up as 'B-' rather than 'I-'.
        PHI_labels[end_line][end_char] = 'I-%s' % base_label
        PHI_labels[start_line][start_char] = 'B-%s' % base_label

        # Every character strictly between the first and the last is 'I-'.
        for line in range(start_line, end_line + 1):
            t = text[line]
            s = start_char + 1 if line == start_line else 0
            e = end_char - 1 if line == end_line else len(t) - 1
            for i in range(s, e + 1):
                PHI_labels[line][i] = 'I-%s' % base_label

    return text, PHI_labels