def extract()

in preprocess/WikiExtractor.py [0:0]


    def extract(self, out):
        """
        Convert this page's wiki markup to text and write it to `out`.

        Populates the page-dependent entries of self.magicWords (namespace,
        page names, current date/time) and then passes the page text through
        self.transform(), self.wiki2text(), self.clean() and compact().  The
        title line is prepended to the resulting list of lines.  Nothing is
        written when the total length of those lines (title included) is
        below options.min_text_length.  After writing, a warning is logged
        if any template error counter on this object is non-zero.

        :param out: a memory file; handed unchanged to self.write_output().
        """
        logging.info('%s\t%s', self.id, self.title)

        # Separate header from text with a newline.
        if options.toHTML:
            title_str = '<h1>' + self.title + '</h1>'
        else:
            title_str = self.title + '\n'

        # https://www.mediawiki.org/wiki/Help:Magic_words
        # Everything before the first ':' is treated as the namespace; a
        # title without ':' belongs to the main namespace ('').
        prefix, colon, remainder = self.title.partition(':')
        if colon:
            ns, pagename = prefix, remainder
        else:
            ns, pagename = '', self.title
        self.magicWords['NAMESPACE'] = ns
        self.magicWords['NAMESPACENUMBER'] = options.knownNamespaces.get(ns, '0')
        self.magicWords['PAGENAME'] = pagename
        self.magicWords['FULLPAGENAME'] = self.title

        # Base/sub page names split at the LAST '/', the root page name at
        # the FIRST '/'.
        last_slash = pagename.rfind('/')
        if last_slash != -1:
            self.magicWords['BASEPAGENAME'] = pagename[:last_slash]
            self.magicWords['SUBPAGENAME'] = pagename[last_slash + 1:]
        else:
            self.magicWords['BASEPAGENAME'] = pagename
            self.magicWords['SUBPAGENAME'] = ''
        # partition() yields the whole string when there is no '/'.
        self.magicWords['ROOTPAGENAME'] = pagename.partition('/')[0]

        # Take one snapshot of the local clock so that all CURRENT* values
        # agree even if extraction straddles a minute/hour/day boundary.
        now = time.localtime()
        self.magicWords['CURRENTYEAR'] = time.strftime('%Y', now)
        self.magicWords['CURRENTMONTH'] = time.strftime('%m', now)
        self.magicWords['CURRENTDAY'] = time.strftime('%d', now)
        self.magicWords['CURRENTHOUR'] = time.strftime('%H', now)
        self.magicWords['CURRENTTIME'] = time.strftime('%H:%M:%S', now)

        text = self.text
        self.text = ''          # save memory
        #
        # @see https://doc.wikimedia.org/mediawiki-core/master/php/classParser.html
        # This does the equivalent of internalParse():
        #
        # $dom = $this->preprocessToDom( $text, $flag );
        # $text = $frame->expand( $dom );
        #
        text = self.transform(text)
        text = self.wiki2text(text)
        text = compact(self.clean(text))
        # from zwChan
        text = [title_str] + text

        # Skip pages that are too short; the title line counts towards the
        # total length.
        if sum(len(line) for line in text) < options.min_text_length:
            return

        self.write_output(out, text)

        errs = (self.template_title_errs,
                self.recursion_exceeded_1_errs,
                self.recursion_exceeded_2_errs,
                self.recursion_exceeded_3_errs)
        if any(errs):
            # logging.warn() is a deprecated alias that newer Python versions
            # no longer provide; logging.warning() is the supported spelling.
            logging.warning("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)",
                            self.title, self.id, *errs)