def filter_links()

in modeling/utils/process_data.py [0:0]


# Lower-cased mention texts that disqualify a link (first/second-person pronouns).
_SKIPPED_MENTIONS = frozenset(['i', 'you', 'your', 'my', 'me'])

# Lower-cased reference texts that disqualify a link: a reference must not be a pronoun.
_SKIPPED_REFERENCES = frozenset([
	'it', 'its', 'he', 'his', 'him', 'she', 'her', 'hers', 'they', 'them',
	'their', 'i', 'me', 'my', 'mine', 'that', 'you', 'your', 'one',
])


def filter_links(original_links, turn, ctx, utt):
	'''
	Drop unusable coreference links and sort the rest by mention position.

	For every link, item 0 is read as the reference (keys 'text', 'turn_id',
	'span') and item 1 as the mention (keys 'text', 'span'); each 'span' is
	a dict with 'start' and 'end'.

	A link is discarded when any of the following holds:
		1. it has fewer than two items (incomplete link);
		2. the mention is one of i / you / your / my / me, or the reference
		   is any pronoun listed in _SKIPPED_REFERENCES (case-insensitive);
		   links involving "you" refer to the system;
		3. the reference turn index (turn_id - 1) is negative or greater
		   than len(ctx); an index equal to len(ctx) means the reference
		   sits in the current utterance `utt`;
		4. the mention end, shifted by spk_offset, is >= len(utt);
		5. the reference end, shifted by spk_offset, is >= the length of
		   the turn it points to;
		6. the mention text is exactly 'l the reminder'.

	Args:
		original_links: iterable of annotated links as described above.
		turn: not used by this function; kept for interface compatibility.
		ctx: sequence of previous turns, indexed by turn_id - 1.
		utt: the current utterance; only its length is used.

	Returns:
		A new list holding the kept links, ordered by mention span start.

	Raises:
		AssertionError: if two kept mentions overlap, i.e. a mention does
			not start strictly after the previous mention's end.
	'''
	links = []
	# iterate all coreference links in annotation
	for link in original_links:
		if len(link) < 2: # incomplete (or empty) link: indexing it below would fail
			continue
		reference, mention = link[0]['text'], link[1]['text']
		if mention.lower() in _SKIPPED_MENTIONS or reference.lower() in _SKIPPED_REFERENCES:
			continue

		# deal with some rare cases with invalid links
		r_turn_id = link[0]['turn_id'] - 1
		# a negative index would silently wrap around to the end of ctx
		if r_turn_id < 0 or r_turn_id > len(ctx): # invalid turn / refer to future turns
			continue
		# spk_offset is a module-level name defined outside this function
		# NOTE(review): presumably the length of a speaker prefix -- confirm
		r_end = link[0]['span']['end'] + spk_offset
		m_end = link[1]['span']['end'] + spk_offset
		if m_end >= len(utt): # mention out of index of utt
			continue
		# the reference lives in utt when it points at the current turn
		r_turn = utt if r_turn_id == len(ctx) else ctx[r_turn_id]
		if r_end >= len(r_turn): # reference out of index of its turn
			continue
		# hard-coded exclusion; presumably a known bad annotation -- confirm
		if mention == 'l the reminder':
			continue
		links.append(link)

	# sort links by the mention start index
	sorted_links = sorted(links, key=lambda x: x[1]['span']['start'])

	# sanity check: no overlapping between consecutive mentions
	prev_end = -1
	for link in sorted_links:
		m_span = link[1]['span']
		assert m_span['start'] > prev_end, 'overlapping mentions in coreference links'
		prev_end = m_span['end']

	return sorted_links