def process_page()

in data_parsing.py [0:0]


def process_page(lines):
	"""Collect the senses of the English entry found on one page.

	The page title is read from the first line starting with ``<title>``.
	Markup tags are stripped from every line and blank lines are dropped.
	If the remaining text contains level-2 headers (``==Name==``), only the
	sections headed ``English`` are handed to ``process_language``; a page
	without any such header is handed over as a single section.

	Args:
		lines: the lines of one page, as strings.

	Returns:
		A list with the results of every ``process_language`` call
		concatenated (each call is expected to return a list, or -1 for
		"nothing found"), or -1 when the page is skipped or yields nothing.
	"""
	# get title/word for page; a page without a title line cannot be attributed
	# to a word, so it is skipped like the other ignored pages
	title = None
	for line in lines:
		if line.startswith('<title>'):
			title = line.replace('<title>', '').replace('</title>', '')
			break
	if title is None:
		return -1

	# ignoring structural, management pages (title starts with "<prefix>:")
	if re.match(r'^\w*?:', title):
		return -1

	# remove html from text to process clean page
	text_lines = []
	for line in lines:
		line = re.sub('<.*?>.*?</.*?>', '', line)  # paired tags with their content
		line = re.sub('<.*?>', '', line)  # any leftover single tag
		line = line.strip()
		if line:
			text_lines.append(line)
	if not text_lines:
		return -1  # ignore pages with no text outside of html code

	header = re.compile(r'^==[^=]*?==$')

	if any(header.match(line) for line in text_lines):
		# split into language sections and keep only the English ones
		sections = []
		current = None  # lines of the open English section, None outside of one
		for line in text_lines:
			if header.match(line):
				# A new header ends the open section even without a '----'
				# separator; keep what was collected instead of dropping it.
				if current is not None:
					sections.append(current)
				# tolerate padding inside the header, e.g. '== English =='
				is_english = line.replace('==', '').strip() == 'English'
				current = [] if is_english else None
			elif current is not None:
				if line == '----':
					# explicit separator closes the section
					sections.append(current)
					current = None
				else:
					current.append(line)
		# process last language
		if current is not None:
			sections.append(current)
	else:
		# otherwise assumed to be only English and processed as one language
		sections = [text_lines]

	senses = []
	for section in sections:
		found = process_language(title, section)
		if found != -1:
			senses.extend(found)

	return senses if senses else -1