in modeling/coval/arrau/reader.py [0:0]
def get_doc_markables(doc_name, doc_lines, extract_MIN, word_column=0,
                      markable_column=1, MIN_column=2, print_debug=False):
    """Collect the markables annotated in one document, grouped by cluster.

    Every entry of ``doc_lines`` is split on whitespace. The token in
    ``word_column`` is recorded as the word at that line's zero-based index;
    a line with more than one column is additionally read as annotations.
    Several annotations on the same line are separated by ``@``.

    A markable annotation must start with ``B-markable_`` (the markable
    begins on this line) or ``I-markable_`` (the markable extends to this
    line). The integer between that prefix and ``=`` is the markable id and
    the integer starting five characters after ``=`` is the cluster id
    (presumably the skipped text is ``=set_`` -- confirm against the data).
    Annotations that do not match this shape are reported with ``print`` and
    skipped.

    Args:
        doc_name: Document identifier; passed to ``markable.Markable`` and
            used in the debug message.
        doc_lines: Iterable of annotated lines, one word per line.
        extract_MIN: When true, MIN spans are read from ``MIN_column`` on
            lines that have at least four columns.
        word_column: Index of the column holding the word.
        markable_column: Index of the column holding markable annotations.
        MIN_column: Index of the column holding MIN annotations.
        print_debug: When true, print a warning for lines whose MIN or coref
            annotation count differs from the markable annotation count.

    Returns:
        A dict mapping each cluster id to a tuple ``(markables, coref_tag)``
        where ``markables`` is the list of ``markable.Markable`` objects of
        that cluster and ``coref_tag`` is the tag (``'referring'`` or
        ``'non_referring'``) of the first markable seen for the cluster.

    Raises:
        IndexError: If a line has no token in ``word_column`` (for example
            an empty line).
        ValueError: If a non-empty MIN annotation does not contain the
            expected integers.
    """
    markables_cluster = {}
    markables_start = {}
    markables_end = {}
    markables_MIN = {}
    markables_coref_tag = {}
    all_words = []

    for word_index, line in enumerate(doc_lines):
        columns = line.split()
        all_words.append(columns[word_column])
        # A line holding only the word carries no annotations.
        if len(columns) <= 1:
            continue

        markable_annotations = columns[markable_column].split("@")
        MIN_annotations = (columns[MIN_column].split("@")
                           if extract_MIN and len(columns) >= 4 else None)
        # The coref tags are always read from the last column.
        coref_annotations = (columns[-1].split("@")
                             if len(columns) >= 3 else None)

        # MIN / coref values are only trusted when there is exactly one per
        # markable annotation on the line.
        expected = len(markable_annotations)
        MIN_aligned = (MIN_annotations is not None
                       and len(MIN_annotations) == expected)
        coref_aligned = (coref_annotations is not None
                         and len(coref_annotations) == expected)

        if print_debug and (
                (MIN_annotations is not None and not MIN_aligned)
                or (coref_annotations is not None and not coref_aligned)):
            print((
                'There is a problem with the annotation of the '
                'document %r in line %s\n'
                'The number of MIN or coref annotations '
                'for each line should be equal to the the number '
                'of markable annotations') % (doc_name, line))

        for i, markable_annotation in enumerate(markable_annotations):
            is_begin = markable_annotation.startswith("B-markable_")
            is_inside = markable_annotation.startswith("I-markable_")

            # Validate the prefix before parsing so that a malformed
            # annotation is reported and skipped instead of raising.
            ids = None
            if is_begin or is_inside:
                ids = _parse_markable_ids(markable_annotation)
            if ids is None:
                print((
                    '%r is not a valid annotation for markables.\n'
                    'The annotation of the following markable will be '
                    'skipped then.\n%s') % (markable_annotation, line))
                continue
            markable_id, cluster_id = ids

            if is_inside:
                # Continuation line: only the end boundary moves.
                markables_end[markable_id] = word_index
                continue

            markables_cluster[markable_id] = cluster_id
            markables_start[markable_id] = word_index
            markables_end[markable_id] = word_index

            if MIN_aligned and MIN_annotations[i].strip():
                markables_MIN[markable_id] = _parse_min_span(
                    MIN_annotations[i])
            else:
                markables_MIN[markable_id] = None

            if (coref_aligned
                    and coref_annotations[i].strip() == 'non_referring'):
                markables_coref_tag[markable_id] = 'non_referring'
            else:
                markables_coref_tag[markable_id] = 'referring'

    clusters = {}
    for markable_id, cluster_id in markables_cluster.items():
        start = markables_start[markable_id]
        end = markables_end[markable_id]
        coref_tag = markables_coref_tag[markable_id]
        m = markable.Markable(
            doc_name, start, end, markables_MIN[markable_id],
            coref_tag, all_words[start:end + 1])
        if cluster_id not in clusters:
            # The cluster takes the coref tag of its first markable.
            clusters[cluster_id] = ([], coref_tag)
        clusters[cluster_id][0].append(m)
    return clusters


def _parse_markable_ids(annotation):
    """Return ``(markable_id, cluster_id)`` from *annotation*, or ``None``.

    The markable id is read from after the 11-character ``B-markable_`` /
    ``I-markable_`` prefix up to ``=``; the cluster id starts five
    characters after ``=``. ``None`` is returned when either part is not an
    integer.
    """
    separator = annotation.find('=')
    try:
        return (int(annotation[11:separator]),
                int(annotation[separator + 5:]))
    except ValueError:
        return None


def _parse_min_span(annotation):
    """Return the zero-based ``(start, end)`` word span of a MIN annotation.

    The annotation is either a single position or two positions joined by
    ``..``; each number is preceded by five characters that are skipped
    (presumably ``word_`` -- confirm against the data).
    """
    range_pos = annotation.find('..')
    if range_pos == -1:
        # -1 because word_index starts from zero
        start = int(annotation[5:]) - 1
        return (start, start)
    start = int(annotation[5:range_pos]) - 1
    # Skip the two dots plus the five-character prefix of the second number.
    end = int(annotation[range_pos + 7:]) - 1
    return (start, end)