in modeling/coval/conll/reader.py [0:0]
def get_doc_mentions(doc_name, doc_lines, keep_singletons,
                     print_debug=False, word_column=3):
    """Collect the coreference clusters annotated in one document.

    Every token line is passed to ``extract_coref_annotation``, which yields
    three collections of cluster ids: ids of mentions that start and end on
    this token, ids of mentions that open here, and ids of mentions that
    close here.  A ``mention.Mention`` is built for each completed mention
    and stored under its cluster id.

    Args:
        doc_name: Document identifier, forwarded to ``mention.Mention``.
        doc_lines: Sequence of sentences; each sentence is a sequence of
            whitespace-separated token lines (strings).
        keep_singletons: When false, clusters holding exactly one mention
            are dropped from the result and counted.
        print_debug: When true, print a diagnostic for nested mentions of
            the same cluster, closing markers without an opening marker and
            mentions that span more than one sentence.
        word_column: Index of the field holding the token text.  The field
            is only used when the line has more than ``word_column + 1``
            fields; otherwise the token text is the empty string.

    Returns:
        A ``(clusters, singletons_num)`` tuple.  ``clusters`` is a list of
        clusters, each a list of mentions, in order of first appearance of
        the cluster id.  ``singletons_num`` is the number of clusters that
        were removed because they were singletons (always 0 when
        ``keep_singletons`` is true).

    Note:
        A closing marker whose cluster id was never opened still registers
        the id, so the result can contain an empty cluster.  Mentions that
        are opened but never closed are silently discarded.
    """
    clusters = {}
    # Cluster id -> stack of [sent_num, word_index] start positions of the
    # mentions that have been opened but not closed yet.
    open_mentions = {}
    singletons_num = 0

    for sent_num, sent_line in enumerate(doc_lines):
        sent_words = []
        for word_index, line in enumerate(sent_line):
            columns = line.split()
            sent_words.append(
                columns[word_column] if len(columns) > word_column + 1 else '')

            single_token_coref, open_corefs, end_corefs = (
                extract_coref_annotation(line))

            if single_token_coref:
                # One mention object is shared by every cluster id that
                # marks this token as a single-token mention.
                m = mention.Mention(doc_name, sent_num, word_index, word_index,
                                    [sent_words[word_index]])
                for c in single_token_coref:
                    clusters.setdefault(c, []).append(m)

            for c in open_corefs:
                if c in open_mentions and print_debug:
                    print('Nested coreferring mentions.\n' + str(line))
                open_mentions.setdefault(c, []).append([sent_num, word_index])

            for c in end_corefs:
                # Register the id even when the annotation turns out to be
                # malformed; callers then see an empty cluster for it.
                clusters.setdefault(c, [])

                if c not in open_mentions:
                    if print_debug:
                        print('Problem in the coreference annotation:\n', line)
                    continue

                # The closing marker matches the most recently opened
                # mention of this cluster, so that is the entry whose
                # sentence must be compared with the current one.
                start_sent, start_index = open_mentions[c][-1]
                if start_sent != sent_num and print_debug:
                    print('A mention span should be in a single sentence:')
                    print(line)

                m = mention.Mention(
                    doc_name, sent_num, start_index, word_index,
                    sent_words[start_index:word_index + 1])
                clusters[c].append(m)

                # Drop the key once its stack is empty so that
                # ``c in open_mentions`` always means "has an open mention".
                if len(open_mentions[c]) == 1:
                    open_mentions.pop(c)
                else:
                    open_mentions[c].pop()

    if not keep_singletons:
        singletons = [c for c, members in clusters.items()
                      if len(members) == 1]
        singletons_num += len(singletons)
        for c in singletons:
            del clusters[c]

    return list(clusters.values()), singletons_num