def clean_text()

in data_parsing.py [0:0]


def clean_text(text, match_sense=''):
	"""Strip wiki-style markup and HTML entities from ``text``.

	Processing order:

	1. Remove ``[[Category:...]]`` / ``[[File:...]]`` links and a trailing
	   ``'''Usage...`` note.
	2. Decode a fixed set of HTML entities, drop ``<math>``/``<sup>`` tags,
	   map a few LaTeX commands to Unicode and turn ``<br>`` into ``'/ '``.
	3. Replace every ``{{a|b|c}}`` template with ``(c)`` and every
	   ``[[a|b]]`` link with ``b`` (the last ``|``-separated field), then
	   drop any leftover ``[[``, ``]]``, ``{{``, ``}}``.
	4. If ``match_sense`` is given, rewrite each ``'''bold'''`` span: spans
	   that resemble the sense's lemma are wrapped in ``<WSD>...</WSD>``,
	   all others are simply un-bolded.
	5. Normalise quotation marks and trim whitespace.

	Args:
		text: Raw markup string to clean.
		match_sense: Optional sense key. Only the part before the first
			``'.'`` is used, with underscores replaced by spaces. When
			empty (or None), no ``<WSD>`` tagging is attempted and bold
			markers are left in place.

	Returns:
		The cleaned string. When ``match_sense`` is given, the cleaned
		string is returned only if it contains a ``<WSD>`` tag and the
		text outside the tagged spans contains at least one space;
		otherwise ``''`` is returned.
	"""
	# Get rid of metadata.
	text = re.sub(r'\[\[Category:.*?\]\]', '', text)
	text = re.sub(r'\[\[File:.*?\]\]', '', text)
	# NOTE(review): the leading '/ ' looks like the marker the <br> rewrite
	# below produces, but that rewrite runs after this line - confirm whether
	# input is expected to arrive with '/ ' already present.
	text = re.sub(r'/ :*? \'\'\'Usage.*$', '', text)

	# Fix entities and math symbols. Order matters: '&amp;' is decoded after
	# '&lt;'/'&gt;' and before the remaining entities, as in the original.
	simple_rewrites = (
		(r'&lt;', '<'),
		(r'&gt;', '>'),
		(r'&amp;', '&'),
		(r'&nbsp;|&emsp;', ' '),
		(r'&hellip;', '...'),
		(r'<math>|</math>|<sup>|</sup>', ''),
		(r'\\forall', '∀'),
		(r'\\exists', '∃'),
		(r'\\pi', 'π'),
		(r'\\dot', '·'),
		(r'<br/?>', '/ '),
	)
	for pattern, replacement in simple_rewrites:
		text = re.sub(pattern, replacement, text)

	# The label extractors below are passed to re.sub as callables so their
	# return value is inserted verbatim. Passing the label as a replacement
	# *string* would make re interpret backslashes in it (e.g. '\a' becomes
	# BEL, '\d' raises re.error), and replacing "the first remaining match"
	# once per iteration could rewrite the wrong span.
	def _template_label(m):
		# '{{a|b|c}}' -> '(c)'
		inner = re.sub(r'{{|}}', '', m.group(0))
		return '(' + inner.strip().split('|')[-1] + ')'

	def _link_label(m):
		# '[[target|shown]]' -> 'shown'
		inner = re.sub(r'\[\[|\]\]', '', m.group(0))
		return inner.strip().split('|')[-1]

	# Parse the metadata templates out.
	text = re.sub(r'{{.*?}}', _template_label, text)

	# Parse out links, then drop any unbalanced leftovers.
	text = re.sub(r'\[\[.*?\]\]', _link_label, text)
	text = re.sub(r'\[\[|\]\]', '', text)
	text = re.sub(r'{{|}}', '', text)

	if match_sense:
		word = match_sense.split('.')[0].replace('_', ' ')

		def _mark_mention(m):
			# Un-bold the span; tag it when the overlap computed by lcs()
			# (defined elsewhere in this module) covers a large enough
			# share of the span, as set by MIN_MENTION_RATIO.
			value = re.sub(r'\'\'\'', '', m.group(0))
			if value and len(lcs(value.lower(), word)) / len(value) > MIN_MENTION_RATIO:
				return '<WSD>' + value + '</WSD>'
			return value

		text = re.sub(r'\'\'\'.*?\'\'\'', _mark_mention, text)

	# Fix quotation marks.
	text = re.sub(r'’', '\'', text)
	text = re.sub(r'&quot;', '"', text)
	# A lone pair of apostrophes (italic marker) becomes a double quote.
	text = re.sub(r'(?<!\')\'{2}(?!\')', '"', text)
	text = re.sub(r'&ldquo;|&rdquo;', '"', text)

	# Clean whitespace: strips other whitespace (newlines, tabs) touching a
	# space. Runs of plain spaces are NOT collapsed; this is kept as is.
	text = ' '.join([t.strip() for t in text.split(' ')])
	text = text.strip()

	if not match_sense:
		return text

	# Require a tagged mention plus some surrounding context.
	context = re.sub(r'<WSD>.*?</WSD>', '', text)
	if '<WSD>' in text and ' ' in context:
		return text
	return ''