in modeling/coval/conll/reader.py [0:0]
def _attach_child(parent, child):
    """Add ``child`` to ``parent.children``, tolerating an empty/None value."""
    if parent.children:
        parent.children.append(child)
    else:
        parent.children = [child]


def _open_phrase(open_nodes, node):
    """Link ``node`` under the innermost open phrase, then open ``node``."""
    if open_nodes:
        _attach_child(open_nodes[-1], node)
    open_nodes.append(node)


def _add_terminal(open_nodes, node):
    """Link ``node`` under the innermost open phrase.

    When no phrase is open the terminal is pushed on the stack itself, so
    that it is later collected as a top-level node.
    """
    if open_nodes:
        _attach_child(open_nodes[-1], node)
    else:
        open_nodes.append(node)


def extract_annotated_parse(mention_lines, start_index,
                            parse_column=5, word_column=3, POS_column=4):
    """Extract the gold parse annotation of a mention (CoNLL format).

    The parse column of every line is scanned character by character:
    ``(`` starts a phrase tag, alphabetic characters spell the tag name,
    ``*`` stands for the word of the current line and ``)`` closes the
    innermost open phrase.

    Args:
        mention_lines: Whitespace-separated CoNLL lines covering the mention.
        start_index: Offset added to the position of a line within
            ``mention_lines`` to form the index stored on each node
            (presumably the position of the mention's first token --
            confirm against the caller).
        parse_column: Column holding the bracketed parse fragment.
        word_column: Column holding the word form.
        POS_column: Column holding the part-of-speech tag.

    Returns:
        The ``mention.TreeNode`` at the top of the extracted tree.  When
        several top-level nodes are found they are grouped under a new
        ``'NP'`` node.  ``None`` is returned if no node was built at all.

    Raises:
        IndexError: If a line has fewer columns than a requested column.
    """
    open_nodes = []        # stack of phrases that are not closed yet
    tag_started = False    # True between a '(' and the end of its tag name
    tag_name = []          # characters of the tag currently being read
    terminal_nodes = []    # words waiting to be emitted as one terminal
    pos_tags = []          # POS tags of the waiting words
    root = None
    roots = []             # completed top-level nodes, in order
    last_line = len(mention_lines) - 1

    for i, line in enumerate(mention_lines):
        columns = line.split()  # split once per line, not once per lookup
        parse = columns[parse_column]
        last_char = len(parse) - 1
        for j, c in enumerate(parse):
            if c == '(':
                if tag_started:
                    # A second '(' ends the previous tag name: open it.
                    # NOTE(review): ``pos_tags`` is passed by reference and
                    # is only rebound below when words are pending, so later
                    # POS tags can show up on this phrase node -- confirm
                    # this sharing is intended.
                    _open_phrase(open_nodes, mention.TreeNode(
                        ''.join(tag_name), pos_tags, start_index + i, False))
                    tag_name = []
                    if terminal_nodes:
                        # Skip words like commas, quotations and
                        # parentheses.
                        # NOTE(review): the test is on whole words, so a
                        # token such as "U.S." is skipped too -- confirm.
                        if any(word.isalpha() or word.isdigit()
                               for word in terminal_nodes):
                            _add_terminal(open_nodes, mention.TreeNode(
                                ' '.join(terminal_nodes), pos_tags,
                                start_index + i, True))
                        terminal_nodes = []
                        pos_tags = []
                tag_started = True
            elif c == '*':
                terminal_nodes.append(columns[word_column])
                pos_tags.append(columns[POS_column])
                if tag_started:
                    # The '*' ends the tag name: open the phrase it names.
                    _open_phrase(open_nodes, mention.TreeNode(
                        ''.join(tag_name), None, start_index + i, False))
                    tag_name = []
                    tag_started = False
                elif tag_name:
                    # Tag characters seen without a pending '(': the node is
                    # recorded directly as a top-level node.
                    roots.append(mention.TreeNode(
                        ''.join(tag_name), None, start_index + i, False))
            elif c == ')':
                if terminal_nodes:
                    _add_terminal(open_nodes, mention.TreeNode(
                        ' '.join(terminal_nodes), pos_tags,
                        start_index + i, True))
                    terminal_nodes = []
                    pos_tags = []
                if open_nodes:
                    root = open_nodes.pop()
                    if not open_nodes:
                        # Stack emptied: ``root`` is a complete top-level
                        # node.
                        roots.append(root)
                tag_started = False
            elif c.isalpha():
                tag_name.append(c)

            # Emit words still pending at the very last parse character.
            if i == last_line and j == last_char and terminal_nodes:
                _add_terminal(open_nodes, mention.TreeNode(
                    ' '.join(terminal_nodes), pos_tags,
                    start_index + i, True))
                terminal_nodes = []
                pos_tags = []

    # Parsing errors can leave phrases open at the end of the detected
    # mention boundaries; unwind them so the outermost one becomes a root.
    while open_nodes:
        root = open_nodes.pop()
        if not open_nodes:
            roots.append(root)

    if len(roots) > 1:
        # Several top-level nodes: group them under one synthetic NP, using
        # the same guarded attach as everywhere else in this function.
        new_root = mention.TreeNode('NP', None, start_index, False)
        for node in roots:
            _attach_child(new_root, node)
        return new_root
    return root