in preprocess/WikiExtractor.py [0:0]
def extract(self, out):
    """
    Convert this page's wiki markup to text and write it to ``out``.

    Fills ``self.magicWords`` with the page-name magic words derived from
    ``self.title`` and with the current date/time magic words, then runs
    the page text through ``transform``, ``wiki2text``, ``clean`` and
    ``compact``. The title line is prepended to the result, which is
    handed to ``write_output``.

    Pages whose total length (title line included) is below
    ``options.min_text_length`` are skipped: nothing is written and the
    template-error summary is not logged for them.

    :param out: a memory file.
    """
    logging.info('%s\t%s', self.id, self.title)

    # Separate header from text with a newline.
    if options.toHTML:
        title_str = '<h1>' + self.title + '</h1>'
    else:
        title_str = self.title + '\n'

    # https://www.mediawiki.org/wiki/Help:Magic_words
    # NOTE(review): any title containing ':' is treated as namespaced, even
    # when the prefix is not a key of options.knownNamespaces -- confirm
    # this is intended for main-namespace titles such as "Foo: Bar".
    colon = self.title.find(':')
    if colon != -1:
        ns = self.title[:colon]
        pagename = self.title[colon + 1:]
    else:
        ns = ''  # Main
        pagename = self.title
    self.magicWords['NAMESPACE'] = ns
    self.magicWords['NAMESPACENUMBER'] = options.knownNamespaces.get(ns, '0')
    self.magicWords['PAGENAME'] = pagename
    self.magicWords['FULLPAGENAME'] = self.title

    # BASEPAGENAME / SUBPAGENAME split on the LAST slash of the page name.
    slash = pagename.rfind('/')
    if slash != -1:
        self.magicWords['BASEPAGENAME'] = pagename[:slash]
        self.magicWords['SUBPAGENAME'] = pagename[slash + 1:]
    else:
        self.magicWords['BASEPAGENAME'] = pagename
        self.magicWords['SUBPAGENAME'] = ''

    # ROOTPAGENAME is everything before the FIRST slash (or the whole name).
    self.magicWords['ROOTPAGENAME'] = pagename.split('/', 1)[0]

    # Take a single clock snapshot so all date/time magic words agree with
    # each other even if the call straddles a second/hour/day boundary.
    now = time.localtime()
    self.magicWords['CURRENTYEAR'] = time.strftime('%Y', now)
    self.magicWords['CURRENTMONTH'] = time.strftime('%m', now)
    self.magicWords['CURRENTDAY'] = time.strftime('%d', now)
    self.magicWords['CURRENTHOUR'] = time.strftime('%H', now)
    self.magicWords['CURRENTTIME'] = time.strftime('%H:%M:%S', now)

    text = self.text
    self.text = ''  # save memory
    #
    # @see https://doc.wikimedia.org/mediawiki-core/master/php/classParser.html
    # This does the equivalent of internalParse():
    #
    # $dom = $this->preprocessToDom( $text, $flag );
    # $text = $frame->expand( $dom );
    #
    text = self.transform(text)
    text = self.wiki2text(text)
    text = compact(self.clean(text))
    # from zwChan
    text = [title_str] + text

    # The length check counts the title line as well as the body lines.
    if sum(len(line) for line in text) < options.min_text_length:
        return

    self.write_output(out, text)

    errs = (self.template_title_errs,
            self.recursion_exceeded_1_errs,
            self.recursion_exceeded_2_errs,
            self.recursion_exceeded_3_errs)
    if any(errs):
        # logging.warn is a deprecated alias (removed in Python 3.13).
        logging.warning("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)",
                        self.title, self.id, *errs)