in data_parsing.py [0:0]
def _unwrap_template(match):
    """Return the last '|'-separated field of a ``{{...}}`` span, in parentheses."""
    inner = re.sub(r'{{|}}', '', match.group(0))
    return '(' + inner.strip().split('|')[-1] + ')'


def _unwrap_link(match):
    """Return the last '|'-separated field of a ``[[...]]`` span."""
    inner = re.sub(r'\[\[|\]\]', '', match.group(0))
    return inner.strip().split('|')[-1]


def clean_text(text, match_sense=''):
    """Strip wiki-style markup from *text* and normalise symbols and quotes.

    Processing order:
      1. drop ``[[Category:...]]`` / ``[[File:...]]`` links and a trailing
         ``/ ... '''Usage`` section;
      2. decode a handful of HTML entities, remove ``<math>``/``<sup>`` tags,
         map a few LaTeX commands to Unicode, turn ``<br>`` into ``'/ '``;
      3. replace every ``{{a|b|c}}`` with ``(c)`` and every ``[[a|b]]`` with
         ``b``, then delete any leftover ``{{ }} [[ ]]`` delimiters;
      4. if *match_sense* is given, rewrite each ``'''bold'''`` span: spans
         that resemble the target word are wrapped in ``<WSD>...</WSD>``,
         all others just lose their quote markers;
      5. normalise quotation marks and trim whitespace around single spaces.

    Args:
        text: raw string to clean.
        match_sense: optional sense key. Only the part before the first
            ``'.'`` is used, with underscores turned into spaces, as the word
            to look for in bold spans. With the default ``''`` step 4 is
            skipped and bold markers are left in place.

    Returns:
        The cleaned string. When *match_sense* is non-empty, ``''`` is
        returned instead unless the result contains a ``<WSD>`` span and the
        text outside those spans contains at least one space.

    Raises:
        NameError: if *match_sense* is non-empty and ``lcs`` or
            ``MIN_MENTION_RATIO`` are not defined in the enclosing module.
    """
    # get rid of meta data
    text = re.sub(r'\[\[Category:.*?\]\]', '', text)
    text = re.sub(r'\[\[File:.*?\]\]', '', text)
    text = re.sub(r'/ :*? \'\'\'Usage.*$', '', text)

    # fix math+ symbols
    # Decode HTML entities. '&amp;' is handled after '&lt;'/'&gt;' so that
    # an escaped entity such as '&amp;lt;' comes out as the text '&lt;'
    # rather than being decoded twice.
    text = re.sub(r'&lt;', '<', text)
    text = re.sub(r'&gt;', '>', text)
    text = re.sub(r'&amp;', '&', text)
    # NOTE(review): non-breaking and thin spaces (entity or literal form) are
    # mapped to a plain space; confirm this is the intended pair of entities.
    text = re.sub(r'&nbsp;|&thinsp;|\u00a0|\u2009', ' ', text)
    text = re.sub(r'&hellip;|…', '...', text)
    text = re.sub(r'<math>|</math>|<sup>|</sup>', '', text)
    text = re.sub(r'\\forall', '∀', text)
    text = re.sub(r'\\exists', '∃', text)
    text = re.sub(r'\\pi', 'π', text)
    text = re.sub(r'\\dot', '·', text)
    text = re.sub(r'<br/?>', '/ ', text)

    # parse these metadata links out
    # A callable replacement is used (here and below) so that backslashes in
    # the captured text are inserted literally instead of being interpreted
    # as replacement-template escapes, and so that every match is replaced
    # by its own value in a single left-to-right pass.
    text = re.sub(r'{{.*?}}', _unwrap_template, text)

    # parse out links
    text = re.sub(r'\[\[.*?\]\]', _unwrap_link, text)

    # remove any unbalanced delimiters that survived the two passes above
    text = re.sub(r'\[\[|\]\]', '', text)
    text = re.sub(r'{{|}}', '', text)

    if match_sense != '':
        word = match_sense.split('.')[0].replace('_', ' ')

        def _mark_mention(match):
            # Drop the ''' markers; tag the span only when the share of it
            # covered by lcs() against the target word exceeds the threshold.
            phrase = match.group(0).replace("'''", '')
            if len(phrase) > 0 and len(lcs(phrase.lower(), word)) / len(phrase) > MIN_MENTION_RATIO:
                return '<WSD>' + phrase + '</WSD>'
            return phrase

        text = re.sub(r'\'\'\'.*?\'\'\'', _mark_mention, text)

    # fix quotation marks
    text = re.sub(r'&rsquo;|’', '\'', text)
    text = re.sub(r'&quot;', '"', text)
    # exactly two apostrophes (wiki italics) become a double quote;
    # runs of three or more are left alone
    text = re.sub(r'(?<!\')\'{2}(?!\')', '"', text)
    text = re.sub(r'&ldquo;|&rdquo;|“|”', '"', text)

    # cleaning whitespace in text
    # (strips other whitespace next to single spaces; runs of spaces are kept)
    text = ' '.join([t.strip() for t in text.split(' ')])
    text = text.strip()

    if match_sense == '':
        return text

    # Keep the example only if a mention was tagged and there is surrounding
    # context (at least one space) besides the tagged span(s).
    context = re.sub(r'<WSD>.*?</WSD>', '', text)
    if '<WSD>' in text and ' ' in context:
        return text
    return ''