in data_parsing.py [0:0]
def process_page(lines):
    """Collect the senses found in the English part of a single page.

    The page title is read from the first line starting with ``<title>``.
    Markup is then stripped from every line and blank lines are dropped.
    If the cleaned text contains level-2 headers (``==Language==``), only the
    lines under ``==English==`` are handed to ``process_language``; a section
    ends at a ``----`` line, at the next level-2 header, or at the end of the
    page. If there are no such headers the whole page is treated as English.

    Args:
        lines: Re-iterable sequence of strings making up one page (it is
            traversed more than once).

    Returns:
        A non-empty list of everything ``process_language`` yielded for the
        page, or ``-1`` when the page is skipped: no ``<title>`` line, a
        title with a ``prefix:`` namespace, no text outside of markup, or no
        senses found.
    """
    # Get the title/word for the page. A page without a <title> line is
    # skipped like every other unusable page instead of raising IndexError.
    title = next(
        (line.replace('<title>', '').replace('</title>', '')
         for line in lines if line.startswith('<title>')),
        None,
    )
    if title is None:
        return -1
    # Ignore structural/management pages, i.e. titles of the form "prefix:...".
    if re.match(r'^\w*?:', title):
        return -1

    # Remove html from the text so that only clean, non-empty lines remain.
    clean_lines = []
    for line in lines:
        line = re.sub('<.*?>.*?</.*?>', '', line)  # paired tags with content
        line = re.sub('<.*?>', '', line)  # any leftover single tags
        line = line.strip()
        if line:
            clean_lines.append(line)
    if not clean_lines:
        return -1  # ignore pages with no text outside of html code

    senses = []

    def _collect(section_lines):
        # process_language is defined elsewhere in this module; as used here
        # it returns either -1 or an iterable of senses.
        found = process_language(title, section_lines)
        if found != -1:
            senses.extend(found)

    # Level-2 headers ("==Language==") separate the languages on a page.
    lang_header = re.compile(r'^==[^=]*?==$')
    if any(lang_header.match(line) for line in clean_lines):
        in_english = False
        section = []
        for line in clean_lines:
            if lang_header.match(line):
                # A new language header also closes an open English section;
                # without this flush its lines would be silently discarded
                # on pages that have no '----' separator between languages.
                if in_english:
                    _collect(section)
                section = []
                in_english = line.replace('==', '') == 'English'
            elif in_english:
                if line == '----':
                    # Explicit end-of-language separator.
                    _collect(section)
                    in_english = False
                    section = []
                else:
                    section.append(line)
        # Process the last language if the page ended inside English.
        if in_english:
            _collect(section)
    else:
        # No language headers: assumed to be only English, processed as one.
        _collect(clean_lines)

    return senses if senses else -1