in preprocess/WikiExtractor.py [0:0]
def load_templates(file, output_file=None):
    """
    Load templates and modules from a dump and register them.

    Every page yielded by ``pages_from(file)`` whose namespace key is in
    ``templateKeys`` is registered via ``define_template``. When
    ``output_file`` is given, each such page is also written to it as a
    small ``<page>`` record.

    :param file: source handed to ``pages_from`` to iterate over pages.
    :param output_file: file where to save templates and modules; when
        falsy nothing is written.

    Side effects: sets ``options.templatePrefix`` and
    ``options.modulePrefix`` and, only when no ``output_file`` is given and
    a namespace name is still empty, fills in
    ``options.templateNamespace`` / ``options.moduleNamespace`` from the
    page titles.
    """
    options.templatePrefix = options.templateNamespace + ':'
    options.modulePrefix = options.moduleNamespace + ':'

    output = codecs.open(output_file, 'wb', 'utf-8') if output_file else None
    try:
        for page_count, page_data in enumerate(pages_from(file)):
            # Only title, ns, id and the page lines are used here.
            page_id, _revid, title, ns, _cat_set, page = page_data
            is_template_page = ns in templateKeys

            namespaces_unknown = (not options.templateNamespace or
                                  not options.moduleNamespace)
            if not output_file and namespaces_unknown and is_template_page:
                # Namespace names are not known yet: reconstruct them from
                # the "<Namespace>:" prefix of the title.
                colon = title.find(':')
                if colon > 1:
                    if ns == '10':
                        options.templateNamespace = title[:colon]
                        options.templatePrefix = title[:colon + 1]
                    elif ns == '828':
                        options.moduleNamespace = title[:colon]
                        options.modulePrefix = title[:colon + 1]

            if is_template_page:
                text = ''.join(page)
                define_template(title, text)
                # save templates and modules to file
                if output is not None:
                    output.write('<page>\n')
                    output.write(' <title>%s</title>\n' % title)
                    output.write(' <ns>%s</ns>\n' % ns)
                    output.write(' <id>%s</id>\n' % page_id)
                    output.write(' <text>')
                    for line in page:
                        output.write(line)
                    output.write(' </text>\n')
                    output.write('</page>\n')

            # Progress report every 100000 pages (skipping page 0).
            if page_count and page_count % 100000 == 0:
                logging.info("Preprocessed %d pages", page_count)
    finally:
        # Close the output even when page iteration or template
        # definition raises, so the handle is never leaked and buffered
        # data is flushed.
        if output is not None:
            output.close()

    if output_file:
        logging.info("Saved %d templates to '%s'", len(options.templates), output_file)