def extract_annotated_parse()

in modeling/coval/conll/reader.py [0:0]


def extract_annotated_parse(mention_lines, start_index,
        parse_column=5, word_column=3, POS_column=4):
    """Build a parse tree for a mention from CoNLL-style parse bits.

    Every line is split on whitespace.  The field at ``parse_column`` is
    scanned one character at a time:

    * ``(`` announces a phrase label; alphabetic characters that follow
      are collected as the label.
    * ``*`` stands for the token of the line; its word (``word_column``)
      and POS tag (``POS_column``) are buffered, and a pending label is
      turned into an open phrase node.
    * ``)`` turns the buffered words into one leaf (words joined by a
      space) and closes the innermost open phrase.

    Nodes are created through ``mention.TreeNode``; the arguments passed
    are a label/text, a POS-tag list (or None), ``start_index`` plus the
    line offset, and a leaf flag.

    Returns:
        A new ``'NP'`` node holding all collected top-level nodes when
        more than one was collected; otherwise the most recently closed
        node, or None if no node was ever closed.
    """

    def attach(parent, child):
        # A falsy child container is replaced instead of being extended.
        if parent.children:
            parent.children.append(child)
        else:
            parent.children = [child]

    def push_phrase(stack, phrase):
        # A phrase hangs under the innermost open phrase (if any) and
        # then becomes the innermost open phrase itself.
        if stack:
            attach(stack[-1], phrase)
        stack.append(phrase)

    def place_leaf(stack, leaf_words, leaf_tags, index):
        # A leaf hangs under the innermost open phrase; with nothing
        # open it is put on the stack on its own.
        leaf = mention.TreeNode(' '.join(leaf_words), leaf_tags, index, True)
        if stack:
            attach(stack[-1], leaf)
        else:
            stack.append(leaf)

    stack = []            # phrases opened but not yet closed
    label_chars = []      # characters of the label being read
    label_pending = False  # a '(' was seen and its node is not built yet
    words = []            # tokens waiting to become a leaf
    tags = []             # POS tags of those tokens
    root = None
    roots = []

    for offset, line in enumerate(mention_lines):
        columns = line.split()
        for symbol in columns[parse_column]:
            if symbol == '(':
                if label_pending:
                    # The previous label is complete.  The current tag
                    # list object itself is handed over, not a copy.
                    push_phrase(stack, mention.TreeNode(
                        ''.join(label_chars), tags,
                        start_index + offset, False))
                    label_chars = []
                if words:
                    # Buffered tokens are kept only if at least one of
                    # them is purely alphabetic or purely numeric; this
                    # drops commas, quotes, brackets and the like.
                    if any(w.isalpha() or w.isdigit() for w in words):
                        place_leaf(stack, words, tags, start_index + offset)
                    words = []
                    tags = []
                label_pending = True

            elif symbol == '*':
                words.append(columns[word_column])
                tags.append(columns[POS_column])
                phrase = mention.TreeNode(''.join(label_chars), None,
                                          start_index + offset, False)
                if label_pending:
                    push_phrase(stack, phrase)
                    label_chars = []
                    label_pending = False
                elif label_chars:
                    # Label characters seen without an opening bracket.
                    roots.append(phrase)

            elif symbol == ')':
                if words:
                    place_leaf(stack, words, tags, start_index + offset)
                    words = []
                    tags = []
                if stack:
                    root = stack.pop()
                    if not stack:
                        roots.append(root)
                label_pending = False

            elif symbol.isalpha():
                label_chars.append(symbol)

    # Tokens still buffered after the last character of the last line
    # become a final leaf indexed at that last line.
    if words:
        place_leaf(stack, words, tags, start_index + offset)

    # Phrases left open at the end of the mention (e.g. because of parse
    # errors) are unwound; only the outermost one counts as a root.
    if stack:
        root = stack[0]
        roots.append(root)

    if len(roots) <= 1:
        return root

    # Several top-level nodes: group them under one synthetic NP.
    merged = mention.TreeNode('NP', None, start_index, False)
    for top in roots:
        merged.children.append(top)
    return merged