def get_doc_markables()

in modeling/coval/arrau/reader.py [0:0]


def _parse_MIN_span(MIN_annotation):
    """Return the zero-based ``(start, end)`` word span of a MIN annotation.

    The annotation is either a single position or a ``..``-separated range.
    A 5-character prefix precedes every number (presumably ``word_``, e.g.
    ``word_3`` or ``word_3..word_5`` -- TODO confirm against the corpus).
    """
    range_sep = MIN_annotation.find('..')
    if range_sep == -1:
        # -1 because word_index starts from zero
        MIN_start = int(MIN_annotation[5:]) - 1
        return MIN_start, MIN_start
    MIN_start = int(MIN_annotation[5:range_sep]) - 1
    # Skip the two dots plus the 5-character prefix of the end position.
    MIN_end = int(MIN_annotation[range_sep + 7:]) - 1
    return MIN_start, MIN_end


def get_doc_markables(doc_name, doc_lines, extract_MIN, word_column=0,
        markable_column=1, MIN_column=2, print_debug=False):
    """Collect the markables of one document and group them by cluster id.

    Every line is split on whitespace. The token at ``word_column`` is the
    word itself. When a line has more than one column, the token at
    ``markable_column`` holds one or more ``@``-separated markable
    annotations. Each must start with ``B-markable_`` (the markable begins
    on this word) or ``I-markable_`` (the markable continues on this word),
    followed by an integer markable id, ``=``, a 4-character prefix and an
    integer cluster id (presumably ``B-markable_12=set_3`` -- TODO confirm).

    Args:
        doc_name: Document name; passed to ``markable.Markable`` and used in
            the debug message.
        doc_lines: Iterable of lines, one word per line. Every line must
            contain at least ``word_column + 1`` whitespace-separated tokens.
        extract_MIN: If true, MIN spans are read from ``MIN_column`` on
            lines that have at least four columns.
        word_column: Index of the column holding the word.
        markable_column: Index of the column holding markable annotations.
        MIN_column: Index of the column holding MIN annotations.
        print_debug: If true, print a message when the number of MIN or
            coref annotations on a line differs from the number of markable
            annotations.

    Returns:
        A dict mapping each cluster id to a tuple
        ``(list_of_markables, coref_tag)``, where ``coref_tag`` is the tag
        (``'referring'`` or ``'non_referring'``) of the first markable that
        was added to that cluster.

    Raises:
        IndexError: If a line has no token at ``word_column``.
        ValueError: If the id parts of a ``B-``/``I-`` markable annotation,
            or of a MIN annotation, are not integers.

    Annotations that start with neither ``B-markable_`` nor ``I-markable_``
    are reported on stdout and skipped.
    """
    markables_cluster = {}
    markables_start = {}
    markables_end = {}
    markables_MIN = {}
    markables_coref_tag = {}

    all_words = []

    for word_index, line in enumerate(doc_lines):
        columns = line.split()
        all_words.append(columns[word_column])

        # A line with a single column carries no annotations.
        if len(columns) <= 1:
            continue

        markable_annotations = columns[markable_column].split("@")
        MIN_annotations = (
                columns[MIN_column].split("@")
                if extract_MIN and len(columns) >= 4 else None)
        # The coref annotations live in the last column.
        coref_annotations = (
                columns[-1].split("@") if len(columns) >= 3 else None)

        # MIN/coref annotations are only matched to markables by position
        # when their count agrees with the number of markable annotations.
        MIN_aligned = (
                MIN_annotations is not None
                and len(MIN_annotations) == len(markable_annotations))
        coref_aligned = (
                coref_annotations is not None
                and len(coref_annotations) == len(markable_annotations))

        if print_debug and (
                (MIN_annotations is not None and not MIN_aligned)
                or (coref_annotations is not None and not coref_aligned)):
            print((
                    'There is a problem with the annotation of the '
                    'document %r in line %s\n'
                    'The number of MIN or coref annotations '
                    'for each line should be equal to the the number '
                    'of markable annotations') % (doc_name, line))

        for i, markable_annotation in enumerate(markable_annotations):
            is_begin = markable_annotation.startswith("B-markable_")
            is_inside = markable_annotation.startswith("I-markable_")

            # Validate the prefix before parsing the ids, so that an invalid
            # annotation is reported and skipped instead of failing inside
            # int().
            if not (is_begin or is_inside):
                print((
                        '%r is not a valid annotation for markables.\n'
                        'The annotation of the following markable will be '
                        'skipped then.\n%s') % (markable_annotation, line))
                continue

            # 11 == len("B-markable_") == len("I-markable_")
            equals_pos = markable_annotation.find('=')
            markable_id = int(markable_annotation[11:equals_pos])

            if is_inside:
                # Extend the markable up to the current word.
                markables_end[markable_id] = word_index
                continue

            # Skip '=' plus the 4-character prefix of the cluster id.
            cluster_id = int(markable_annotation[equals_pos + 5:])

            markables_cluster[markable_id] = cluster_id
            markables_start[markable_id] = word_index
            markables_end[markable_id] = word_index

            if MIN_aligned and MIN_annotations[i].strip():
                markables_MIN[markable_id] = _parse_MIN_span(
                        MIN_annotations[i])
            else:
                markables_MIN[markable_id] = None

            if (coref_aligned
                    and coref_annotations[i].strip() == 'non_referring'):
                markables_coref_tag[markable_id] = 'non_referring'
            else:
                markables_coref_tag[markable_id] = 'referring'

    clusters = {}

    for markable_id, cluster_id in markables_cluster.items():
        start = markables_start[markable_id]
        end = markables_end[markable_id]
        coref_tag = markables_coref_tag[markable_id]

        m = markable.Markable(
                doc_name, start, end, markables_MIN[markable_id],
                coref_tag, all_words[start:end + 1])

        # The cluster keeps the coref tag of its first markable.
        if cluster_id not in clusters:
            clusters[cluster_id] = ([], coref_tag)
        clusters[cluster_id][0].append(m)

    return clusters