def get_doc_mentions()

in modeling/coval/conll/reader.py [0:0]


def get_doc_mentions(doc_name, doc_lines, keep_singletons,
        print_debug=False, word_column=3):
    """Collect the coreference clusters annotated in one document.

    Each token line is handed to ``extract_coref_annotation``, which yields
    three collections of cluster ids: ids of mentions that consist of this
    single token, ids of mentions opened on this token, and ids of mentions
    closed on this token. Opened mentions are kept on a per-cluster stack so
    that a closing id is always paired with the most recently opened mention
    of the same cluster.

    Args:
        doc_name: Value passed through as the first argument of every
            ``mention.Mention`` that is created.
        doc_lines: Iterable of sentences; each sentence is an iterable of
            whitespace-separated token lines.
        keep_singletons: When false, clusters holding exactly one mention are
            removed from the result and counted.
        print_debug: When true, print a diagnostic for nested mentions of the
            same cluster, for closings without a matching opening, and for
            mentions whose opening lies in a different sentence.
        word_column: Zero-based index of the field that holds the token text.

    Returns:
        A tuple ``(clusters, singletons_num)``. ``clusters`` is a list of
        clusters, each a non-empty list of ``mention.Mention`` objects.
        ``singletons_num`` is the number of singleton clusters that were
        removed; it is always 0 when ``keep_singletons`` is true.
    """
    clusters = {}
    # Cluster id -> stack of [sent_num, word_index] records, one per mention
    # that has been opened but not yet closed.
    open_mentions = {}
    singletons_num = 0

    for sent_num, sent_line in enumerate(doc_lines):
        sent_words = []
        for word_index, line in enumerate(sent_line):
            fields = line.split()
            # The token is read only when at least one further field follows
            # word_column; shorter lines contribute '' so that sent_words
            # stays index-aligned with word_index.
            # NOTE(review): presumably the trailing field is the coreference
            # column -- confirm before relaxing this bound.
            sent_words.append(
                    fields[word_column] if len(fields) > word_column + 1
                    else '')

            single_token_coref, open_corefs, end_corefs = (
                    extract_coref_annotation(line))

            if single_token_coref:
                # One Mention object is shared by every cluster listed for
                # this token.
                m = mention.Mention(doc_name, sent_num, word_index, word_index,
                        [sent_words[word_index]])
                for c in single_token_coref:
                    clusters.setdefault(c, []).append(m)

            for c in open_corefs:
                if c in open_mentions:
                    if print_debug:
                        print('Nested coreferring mentions.\n' + str(line))
                    open_mentions[c].append([sent_num, word_index])
                else:
                    open_mentions[c] = [[sent_num, word_index]]

            for c in end_corefs:
                if c not in open_mentions:
                    # Closing without an opening: report it, and do not
                    # create a cluster entry, so that no empty cluster can
                    # end up in the result.
                    if print_debug:
                        print('Problem in the coreference annotation:\n', line)
                    continue

                # Pair the closing with the innermost (most recently opened)
                # mention of this cluster.
                pending = open_mentions[c]
                start_sent, start_index = pending.pop()
                if not pending:
                    del open_mentions[c]

                # Check the mention that is actually being closed.
                if start_sent != sent_num:
                    if print_debug:
                        print('A mention span should be in a single sentence:')
                        print(line)

                # The mention is recorded in the sentence where it closes and
                # its words are sliced from that sentence.
                m = mention.Mention(
                        doc_name, sent_num, start_index, word_index,
                        sent_words[start_index:word_index + 1])
                clusters.setdefault(c, []).append(m)

    if not keep_singletons:
        singletons = [c for c, members in clusters.items()
                      if len(members) == 1]
        singletons_num = len(singletons)
        for c in singletons:
            del clusters[c]

    return list(clusters.values()), singletons_num