def main()

in preprocess/WikiExtractor.py [0:0]


def _parse_byte_size(text):
    """Convert a size string such as ``500K``, ``1M`` or ``2048`` to bytes.

    The last character may be K, M or G (case-insensitive), standing for
    powers of 1024. Without such a suffix the whole string is a byte count.

    :param text: the size specification, as given to ``--bytes``.
    :return: the size in bytes, as an int.
    :raises ValueError: if ``text`` is empty or its numeric part is not an
        integer.
    """
    if not text:
        raise ValueError('empty size')
    # find() yields -1 for anything other than k/m/g, which gives power 0.
    power = 'kmg'.find(text[-1].lower()) + 1
    # Only strip the last character when it really is a unit suffix;
    # otherwise a plain number such as "1000000" would lose its last digit.
    number = text[:-1] if power else text
    return int(number) * 1024 ** power


def main():
    """Command-line entry point.

    Parses the command line, copies the chosen settings onto the shared
    ``options`` object (defined outside this block), registers the tags to
    ignore and sets up logging. Then, depending on the arguments:

    * with ``--article``, treats the input as a single-article file and
      writes the extracted text to stdout;
    * otherwise creates the output directory if needed, loads the optional
      category filter file and hands over to ``process_dump``.

    Returns None in every case. An invalid ``--bytes`` value or an output
    directory that cannot be created is reported through ``logging.error``
    and the function returns without processing anything.
    """
    parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description=__doc__)
    parser.add_argument("input",
                        help="XML wiki dump file")
    groupO = parser.add_argument_group('Output')
    groupO.add_argument("-o", "--output", default="text",
                        help="directory for extracted files (or '-' for dumping to stdout)")
    groupO.add_argument("-b", "--bytes", default="1M",
                        help="maximum bytes per output file (default %(default)s)",
                        metavar="n[KMG]")
    groupO.add_argument("-c", "--compress", action="store_true",
                        help="compress output files using bzip")
    groupO.add_argument("--json", action="store_true",
                        help="write output in json format instead of the default one")

    groupP = parser.add_argument_group('Processing')
    groupP.add_argument("--html", action="store_true",
                        help="produce HTML output, subsumes --links")
    groupP.add_argument("-l", "--links", action="store_true",
                        help="preserve links")
    groupP.add_argument("-s", "--sections", action="store_true",
                        help="preserve sections")
    groupP.add_argument("--lists", action="store_true",
                        help="preserve lists")
    groupP.add_argument("-ns", "--namespaces", default="", metavar="ns1,ns2",
                        help="accepted namespaces in links")
    groupP.add_argument("--templates",
                        help="use or create file containing templates")
    # store_false: args.no_templates is True unless the flag is given, so it
    # can be assigned directly to options.expand_templates further down.
    groupP.add_argument("--no_templates", action="store_false",
                        help="Do not expand templates")
    groupP.add_argument("-r", "--revision", action="store_true", default=options.print_revision,
                        help="Include the document revision id (default=%(default)s)")
    groupP.add_argument("--min_text_length", type=int, default=options.min_text_length,
                        help="Minimum expanded text length required to write document (default=%(default)s)")
    groupP.add_argument("--filter_disambig_pages", action="store_true", default=options.filter_disambig_pages,
                        help="Remove pages from output that contain disabmiguation markup (default=%(default)s)")
    groupP.add_argument("-it", "--ignored_tags", default="", metavar="abbr,b,big",
                        help="comma separated list of tags that will be dropped, keeping their content")
    groupP.add_argument("-de", "--discard_elements", default="", metavar="gallery,timeline,noinclude",
                        help="comma separated list of elements that will be removed from the article text")
    groupP.add_argument("--keep_tables", action="store_true", default=options.keep_tables,
                        help="Preserve tables in the output article text (default=%(default)s)")
    # Default to all CPUs but one, and never fewer than one process.
    default_process_count = max(1, cpu_count() - 1)
    parser.add_argument("--processes", type=int, default=default_process_count,
                        help="Number of processes to use (default %(default)s)")

    groupS = parser.add_argument_group('Special')
    groupS.add_argument("-q", "--quiet", action="store_true",
                        help="suppress reporting progress info")
    groupS.add_argument("--debug", action="store_true",
                        help="print debug info")
    groupS.add_argument("-a", "--article", action="store_true",
                        help="analyze a file containing a single article (debug option)")
    groupS.add_argument("--log_file",
                        help="path to save the log info")
    groupS.add_argument("-v", "--version", action="version",
                        version='%(prog)s ' + version,
                        help="print program version")
    groupP.add_argument("--filter_category",
                        help="specify the file that listing the Categories you want to include or exclude. One line for"
                             " one category. starting with: 1) '#' comment, ignored; 2) '^' exclude; Note: excluding has higher priority than including")
    args = parser.parse_args()

    options.keepLinks = args.links
    options.keepSections = args.sections
    options.keepLists = args.lists
    options.toHTML = args.html
    options.write_json = args.json
    options.print_revision = args.revision
    options.min_text_length = args.min_text_length
    if args.html:
        # HTML output subsumes --links.
        options.keepLinks = True

    options.expand_templates = args.no_templates
    options.filter_disambig_pages = args.filter_disambig_pages
    options.keep_tables = args.keep_tables

    try:
        file_size = _parse_byte_size(args.bytes)
        if file_size < minFileSize:
            raise ValueError()
    except ValueError:
        logging.error('Insufficient or invalid size: %s', args.bytes)
        return

    if args.namespaces:
        options.acceptedNamespaces = set(args.namespaces.split(','))

    # A user-supplied --ignored_tags list replaces the built-in default list.
    if args.ignored_tags:
        ignoredTags = set(args.ignored_tags.split(','))
    else:
        ignoredTags = [
            'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em',
            'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd',
            'p', 'plaintext', 's', 'span', 'strike', 'strong',
            'tt', 'u', 'var'
        ]

    # 'a' tag is handled separately
    for tag in ignoredTags:
        ignoreTag(tag)

    # options.discardElements is only overwritten when the flag is given.
    if args.discard_elements:
        options.discardElements = set(args.discard_elements.split(','))

    FORMAT = '%(levelname)s: %(message)s'
    logging.basicConfig(format=FORMAT)

    options.quiet = args.quiet
    options.debug = args.debug
    options.log_file = args.log_file
    createLogger(options.quiet, options.debug, options.log_file)

    input_file = args.input

    if not options.keepLinks:
        ignoreTag('a')

    # sharing cache of parser templates is too slow:
    # manager = Manager()
    # templateCache = manager.dict()

    if args.article:
        # Debug mode: the input holds a single article; extract it to stdout.
        if args.templates and os.path.exists(args.templates):
            with open(args.templates) as template_file:
                load_templates(template_file)

        pages = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed)
        try:
            for page_data in pages_from(pages):
                page_id, revid, title, _ns, _cat_set, page = page_data
                Extractor(page_id, revid, title, page).extract(sys.stdout)
        finally:
            # Close the input even if extraction fails part-way through.
            pages.close()
        return

    output_path = args.output
    if output_path != '-' and not os.path.isdir(output_path):
        try:
            os.makedirs(output_path)
        except OSError:
            logging.error('Could not create: %s', output_path)
            return

    filter_category = args.filter_category
    if filter_category:
        with open(filter_category) as f:
            error_cnt = 0
            for line in f:
                try:
                    line = str(line.strip())
                    if not line or line.startswith('#'):
                        # Blank lines and '#' comments are skipped.
                        continue
                    elif line.startswith('^'):
                        options.filter_category_exclude.add(line.lstrip('^'))
                    else:
                        options.filter_category_include.add(line)
                except Exception as e:
                    # Best effort: report the offending line and carry on.
                    error_cnt += 1
                    print(u"Category not in utf8, ignored. error cnt %d:\t%s" % (error_cnt,e))
                    print(line)
            logging.info("Excluding categories:")
            logging.info(str(options.filter_category_exclude))
            logging.info("Including categories:")
            logging.info(str(len(options.filter_category_include)))

    process_dump(input_file, args.templates, output_path, file_size,
                 args.compress, args.processes)