in modeling/utils/process_data.py [0:0]
# Lower-cased first/second-person words; a link is dropped when either its
# reference or its mention is one of them.
_SPEAKER_WORDS = frozenset({"i", "you", "your", "my", "me"})

# Lower-cased pronouns; a link is dropped when its *reference* is one of them.
_PRONOUN_REFERENCES = frozenset({
    "it", "its", "he", "his", "him", "she", "her", "hers", "they", "them",
    "their", "i", "my", "mine", "that", "you", "your", "one",
})


def filter_links(original_links, turn, ctx, utt):
    """Filter out improper coreference links and sort the rest by mention.

    Each link is indexed as ``link[0]`` (reference) and ``link[1]`` (mention).
    Both carry ``'text'`` and ``'span'`` (with ``'start'``/``'end'``); the
    reference additionally carries ``'turn_id'``.

    A link is dropped when:
      * it has fewer than two elements (incomplete annotation);
      * its reference or mention is one of "i", "you", "your", "my", "me";
      * its reference is a pronoun (see ``_PRONOUN_REFERENCES``) -- this also
        covers pronoun-to-pronoun links such as "she" -> "her";
      * ``turn_id - 1`` of the reference is greater than ``len(ctx)``;
      * the offset-adjusted end of the mention is not inside ``utt``, or the
        offset-adjusted end of the reference is not inside the text it points
        at (``utt`` when ``turn_id - 1 == len(ctx)``, else
        ``ctx[turn_id - 1]``);
      * the mention text is exactly ``'l the reminder'``.

    Args:
        original_links: iterable of annotated links.
        turn: not used by this function.
        ctx: sequence of previous turns; each item must support ``len()``.
        utt: the current utterance; must support ``len()``.

    Returns:
        A new list with the surviving links, sorted by the start index of
        their mention span. Empty list when nothing survives.

    Raises:
        AssertionError: if a mention does not start strictly after the end
            of the previous mention in sorted order.
    """
    links = []
    # iterate all coreference links in annotation
    for link in original_links:
        # incomplete link (empty, or only a mention or a reference)
        if len(link) < 2:
            continue

        raw_mention = link[1]['text']
        reference = link[0]['text'].lower()
        mention = raw_mention.lower()

        # skip links involving the speakers ("I"/"me"/"my", or "you"/"your")
        if reference in _SPEAKER_WORDS or mention in _SPEAKER_WORDS:
            continue
        # remove a link if its reference is a pronoun
        if reference in _PRONOUN_REFERENCES:
            continue

        # deal with some rare cases with invalid links
        r_turn_id = link[0]['turn_id'] - 1
        if r_turn_id > len(ctx):  # reference points past the current turn
            continue

        # NOTE(review): `spk_offset` is not defined in this function; it has
        # to be provided at module level -- confirm where it is set.
        r_end = link[0]['span']['end'] + spk_offset
        m_end = link[1]['span']['end'] + spk_offset
        if m_end >= len(utt):  # mention out of index of utt
            continue

        # the reference lives in the current utterance when its turn index
        # equals len(ctx), otherwise in the indexed context turn
        reference_text = utt if r_turn_id == len(ctx) else ctx[r_turn_id]
        if r_end >= len(reference_text):  # reference out of index
            continue

        # one specific known-bad annotation (compared case-sensitively)
        if raw_mention == 'l the reminder':
            continue

        links.append(link)

    if not links:
        return []

    # sort links by the mention start index
    sorted_links = sorted(links, key=lambda x: x[1]['span']['start'])

    # sanity check: mentions must not overlap
    prev_end = -1
    for link in sorted_links:
        m_start, m_end = link[1]['span']['start'], link[1]['span']['end']
        assert m_start > prev_end  # check no overlapping between mentions
        prev_end = m_end
    return sorted_links