def extract_coref_annotation()

in modeling/coval/conll/reader.py [0:0]


def extract_coref_annotation(line):
    """Parse the coreference column of one whitespace-separated line.

    The last whitespace-separated field of ``line`` is scanned character by
    character. Digits accumulate into a chain number, ``(`` opens a mention,
    ``)`` closes one, and ``|`` separates annotations. Any other character
    (for example ``-``) is ignored.

    Args:
        line: A non-blank line whose last field is the coreference
            annotation, e.g. ``"(12|13)"``, ``"(4)"`` or ``"-"``.

    Returns:
        A tuple ``(single_token_coref, open_corefs, ending_corefs)`` of
        integer lists:

        * ``single_token_coref`` -- chains opened and closed in this field,
          e.g. ``(12)``.
        * ``open_corefs`` -- chains opened but not closed here, e.g. ``(12``.
        * ``ending_corefs`` -- chains closed but not opened here, e.g. ``12)``.

    Raises:
        SystemExit: If a chain number is followed by ``|`` without being
            opened or closed (e.g. ``12|13)``).
        IndexError: If ``line`` contains no fields at all.
    """
    single_token_coref = []
    open_corefs = []
    ending_corefs = []
    digits = []
    coref_opened = False

    coref_column = line.split()[-1]

    for c in coref_column:
        if c.isdigit():
            digits.append(c)
        elif c == '(':
            digits = []
            coref_opened = True
        elif c == ')':
            # Coreference annotations that are marked without specifying
            # the chain number will be skipped
            if digits:
                target = single_token_coref if coref_opened else ending_corefs
                target.append(int(''.join(digits)))
            coref_opened = False
            digits = []
        elif c == '|':
            if coref_opened:
                # As above, an opening bracket with no chain number is
                # skipped rather than crashing on int('').
                if digits:
                    open_corefs.append(int(''.join(digits)))
                coref_opened = False
                digits = []
            elif digits:
                # A bare number followed by '|' is neither an opening nor a
                # closing mention, so the annotation is malformed.
                import sys
                sys.exit("Incorrect coreference annotation: %s" % coref_column)

    # A mention still open at the end of the column continues on later tokens.
    if coref_opened and digits:
        open_corefs.append(int(''.join(digits)))

    if len(single_token_coref) > 1:
        print('Warning: A single mention is assigned to more than one cluster: %s'
                % single_token_coref)

    return single_token_coref, open_corefs, ending_corefs