in preprocess/WikiExtractor.py [0:0]
def clean(self, text):
"""
Removes irrelevant parts from :param: text.
"""
# Collect spans
spans = []
# Drop HTML comments
for m in comment.finditer(text):
spans.append((m.start(), m.end()))
# Drop self-closing tags
for pattern in selfClosing_tag_patterns:
for m in pattern.finditer(text):
spans.append((m.start(), m.end()))
# Drop ignored tags
for left, right in options.ignored_tag_patterns:
for m in left.finditer(text):
spans.append((m.start(), m.end()))
for m in right.finditer(text):
spans.append((m.start(), m.end()))
# Bulk remove all spans
text = dropSpans(spans, text)
# Drop discarded elements
for tag in options.discardElements:
text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag)
if not options.toHTML:
# Turn into text what is left (&nbsp;) and <syntaxhighlight>
text = unescape(text)
# Expand placeholders
for pattern, placeholder in placeholder_tag_patterns:
index = 1
for match in pattern.finditer(text):
text = text.replace(match.group(), '%s_%d' % (placeholder, index))
index += 1
text = text.replace('<<', '«').replace('>>', '»')
#############################################
# Cleanup text
text = text.replace('\t', ' ')
text = spaces.sub(' ', text)
text = dots.sub('...', text)
text = re.sub(' (,:\.\)\]»)', r'\1', text)
text = re.sub('(\[\(«) ', r'\1', text)
text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations
text = text.replace(',,', ',').replace(',.', '.')
if options.keep_tables:
# the following regular expressions are used to remove the wikiml chartacters around table strucutures
# yet keep the content. The order here is imporant so we remove certain markup like {| and then
# then the future html attributes such as 'style'. Finally we drop the remaining '|-' that delimits cells.
text = re.sub(r'!(?:\s)?style=\"[a-z]+:(?:\d+)%;\"', r'', text)
text = re.sub(r'!(?:\s)?style="[a-z]+:(?:\d+)%;[a-z]+:(?:#)?(?:[0-9a-z]+)?"', r'', text)
text = text.replace('|-', '')
text = text.replace('|', '')
if options.toHTML:
text = cgi.escape(text)
return text