in preprocess/WikiExtractor.py [0:0]
def main():
    """Parse the command line, configure the extractor, and run it.

    Builds the argument parser, copies the parsed values onto the
    module-level ``options`` object, registers the tags to ignore, sets up
    logging, and then either:

    * with ``--article``: optionally loads templates and writes every page
      yielded by ``pages_from`` for the input file to stdout, or
    * otherwise: creates the output directory if needed, loads the optional
      category filter file, and hands the dump over to ``process_dump``.

    Returns early (after logging an error) when the ``--bytes`` value is
    malformed or smaller than ``minFileSize``, or when the output directory
    cannot be created.
    """
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="XML wiki dump file")

    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")
    groupO.add_argument("--json", action="store_true",
                        help="write output in json format instead of the default one")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html", action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l", "--links", action="store_true",
                        help="preserve links")
    groupP.add_argument("-s", "--sections", action="store_true",
                        help="preserve sections")
    groupP.add_argument("--lists", action="store_true",
                        help="preserve lists")
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces in links")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    # store_false: args.no_templates is True unless the flag is given, so it
    # can be assigned directly to options.expand_templates below.
    groupP.add_argument("--no_templates", action="store_false",
                        help="Do not expand templates")
    groupP.add_argument("-r", "--revision", action="store_true", default=options.print_revision,
                        help="Include the document revision id (default=%(default)s)")
    groupP.add_argument("--min_text_length", type=int, default=options.min_text_length,
                        help="Minimum expanded text length required to write document (default=%(default)s)")
    groupP.add_argument("--filter_disambig_pages", action="store_true", default=options.filter_disambig_pages,
                        help="Remove pages from output that contain disabmiguation markup (default=%(default)s)")
    groupP.add_argument("-it", "--ignored_tags", default="", metavar="abbr,b,big",
                        help="comma separated list of tags that will be dropped, keeping their content")
    groupP.add_argument("-de", "--discard_elements", default="", metavar="gallery,timeline,noinclude",
                        help="comma separated list of elements that will be removed from the article text")
    groupP.add_argument("--keep_tables", action="store_true", default=options.keep_tables,
                        help="Preserve tables in the output article text (default=%(default)s)")

    # Leave one CPU free by default, but never go below a single process.
    default_process_count = max(1, cpu_count() - 1)
    parser.add_argument("--processes", type=int, default=default_process_count,
                        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug", action="store_true",
                        help="print debug info")
    groupS.add_argument("-a", "--article", action="store_true",
                        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("--log_file",
                        help="path to save the log info")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")
    groupP.add_argument("--filter_category",
                        help="specify the file that listing the Categories you want to include or exclude. One line for"
                             " one category. starting with: 1) '#' comment, ignored; 2) '^' exclude; Note: excluding has higher priority than including")

    args = parser.parse_args()

    options.keepLinks = args.links
    options.keepSections = args.sections
    options.keepLists = args.lists
    options.toHTML = args.html
    options.write_json = args.json
    options.print_revision = args.revision
    options.min_text_length = args.min_text_length
    if args.html:
        # HTML output subsumes --links.
        options.keepLinks = True

    options.expand_templates = args.no_templates
    options.filter_disambig_pages = args.filter_disambig_pages
    options.keep_tables = args.keep_tables

    # Parse "n[KMG]": an optional K/M/G suffix scales by 1024, 1024**2 or
    # 1024**3. Only strip the last character when it really is a unit
    # suffix, so a bare number such as "1000000" keeps all of its digits,
    # and an unknown suffix or an empty value is reported as invalid.
    size_spec = args.bytes
    try:
        unit = size_spec[-1:].lower()
        if unit and unit in 'kmg':
            power = 'kmg'.index(unit) + 1
            number = size_spec[:-1]
        else:
            power = 0
            number = size_spec
        file_size = int(number) * 1024 ** power
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        options.acceptedNamespaces = set(args.namespaces.split(','))

    # ignored tags and discardElements have default values already supplied;
    # if passed in, the defaults are overwritten
    if args.ignored_tags:
        ignored_tags = set(args.ignored_tags.split(','))
    else:
        ignored_tags = [
            'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
            'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
            'p', 'plaintext', 's', 'span', 'strike', 'strong',
            'tt', 'u', 'var'
        ]

    # 'a' tag is handled separately
    for tag in ignored_tags:
        ignoreTag(tag)

    if args.discard_elements:
        options.discardElements = set(args.discard_elements.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    options.quiet = args.quiet
    options.debug = args.debug
    options.log_file = args.log_file
    createLogger(options.quiet, options.debug, options.log_file)

    input_file = args.input

    if not options.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        if args.templates and os.path.exists(args.templates):
            with open(args.templates) as template_file:
                load_templates(template_file)

        input_stream = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
        try:
            for page_data in pages_from(input_stream):
                # Namespace and category set are not used in this mode.
                page_id, revid, title, _ns, _cat_set, page = page_data
                Extractor(page_id, revid, title, page).extract(sys.stdout)
        finally:
            # Close the input even when extraction raises.
            input_stream.close()
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except OSError:
            logging.error('Could not create: %s', output_path)
            return

    filter_category = args.filter_category
    if filter_category:
        with open(filter_category) as f:
            error_cnt = 0
            for line in f:
                # Best effort: a line that cannot be processed is reported
                # and skipped instead of aborting the whole run.
                try:
                    line = str(line.strip())
                    if not line or line.startswith('#'):
                        continue
                    elif line.startswith('^'):
                        options.filter_category_exclude.add(line.lstrip('^'))
                    else:
                        options.filter_category_include.add(line)
                except Exception as e:
                    error_cnt += 1
                    print(u"Category not in utf8, ignored. error cnt %d:\t%s" % (error_cnt,e))
                    print(line)
        logging.info("Excluding categories:")
        logging.info(str(options.filter_category_exclude))
        logging.info("Including categories:")
        # Only the number of included categories is logged, not the set.
        logging.info(str(len(options.filter_category_include)))

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes)