in modeling/coval/conll/reader.py [0:0]
def extract_coref_annotation(line):
    """Parse the coreference annotation in the last column of ``line``.

    The last whitespace-separated field of ``line`` is scanned character by
    character: ``(`` opens a mention, digits accumulate into a chain number,
    ``)`` closes a mention and ``|`` separates annotations. Any other
    character (e.g. ``-``) is ignored.

    Args:
        line: A whitespace-separated line whose last field holds the
            coreference annotation, e.g. ``"(12"``, ``"12)"``, ``"(12)"``,
            ``"(3|(4)"`` or ``"-"``.

    Returns:
        A tuple ``(single_token_coref, open_corefs, ending_corefs)`` of lists
        of ints: chain numbers that are both opened and closed in this field,
        chain numbers that are opened but not closed here, and chain numbers
        that are closed here without having been opened in this field.

    Raises:
        IndexError: If ``line`` contains no whitespace-separated fields.
        SystemExit: If a chain number is followed by ``|`` without being
            opened by ``(`` or closed by ``)``.
    """
    single_token_coref = []
    open_corefs = []
    ending_corefs = []
    last_num = []
    coref_opened = False

    coref_column = line.split()[-1]

    for c in coref_column:
        if c.isdigit():
            last_num.append(c)
        elif c == '(':
            last_num = []
            coref_opened = True
        elif c == ')':
            # Coreference annotations that are marked without specifying
            # the chain number will be skipped
            if last_num:
                chain = int(''.join(last_num))
                if coref_opened:
                    single_token_coref.append(chain)
                else:
                    ending_corefs.append(chain)
            coref_opened = False
            last_num = []
        elif c == '|':
            if coref_opened:
                # An opening bracket with no chain number is skipped, the
                # same way number-less annotations are skipped elsewhere,
                # instead of failing on int('').
                if last_num:
                    open_corefs.append(int(''.join(last_num)))
                coref_opened = False
                last_num = []
            elif last_num:
                # sys.exit accepts a single argument, so the offending
                # column has to be formatted into the message.
                sys.exit('Incorrect coreference annotation: %s'
                         % coref_column)

    # A mention that is still open at the end of the column continues on
    # later tokens.
    if coref_opened and last_num:
        open_corefs.append(int(''.join(last_num)))

    if len(single_token_coref) > 1:
        print('Warning: A single mention is assigned to more than one cluster: %s'
              % single_token_coref)

    return single_token_coref, open_corefs, ending_corefs