def main()

in spark_scripts/tokenize_and_ner_inputs.py


import argparse
import logging
import os
from contextlib import ExitStack

# InputParser's import path is not shown in this snippet; adjust to match the repo.
from input_parser import InputParser


def main(sc):
    argp = argparse.ArgumentParser()
    argp.add_argument('--corpus', help='input corpus (*.raw)', required=True)
    argp.add_argument('--output-dir', default="output/", help='directory for the rollup output and metrics')
    argp.add_argument('--num-partitions', type=int, default=1000, help='number of Spark partitions to use')
    argp.add_argument('--debug-save', help='save intermediate results for debugging purposes', action='store_true')
    args = argp.parse_args()

    metric_filename = os.path.join(args.output_dir, 'metric.txt')
    os.makedirs(args.output_dir, exist_ok=True)

    with ExitStack() as stack:
        metric_fptr = stack.enter_context(open(metric_filename, 'w'))

        # validate the input extension before reading the corpus
        assert args.corpus.endswith('.raw'), 'expected a *.raw corpus file'
        data_raw = sc.textFile(args.corpus)

        print('Number of articles in raw: {}'.format(data_raw.count()), file=metric_fptr)
        metric_fptr.flush()

        input_parser = InputParser(
            num_partitions=args.num_partitions,
            debug_save=args.debug_save)

        # tokenize the raw articles and roll them up into combined records
        rollup_rdd = input_parser.tokenize_and_perform_rollup(data_raw)

        # serialize each rollup record as JSON and write it out as text
        rollup_rdd.map(lambda x: x.jsonify()).saveAsTextFile(os.path.join(args.output_dir, 'rollup'))

        # Do not call rollup_rdd.count() here: the RDD is not cached, so it would recompute the entire pipeline.

        logging.info('Output directory: {}'.format(args.output_dir))
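
main() expects an existing SparkContext rather than creating one itself, so the script needs a small driver block to run under spark-submit. Below is a minimal sketch of what that entry point could look like; the application name and the logging.basicConfig call are assumptions, not something this snippet shows.

if __name__ == '__main__':
    from pyspark import SparkContext

    logging.basicConfig(level=logging.INFO)
    # appName is an assumed label; use whatever identifier the job should report to Spark
    sc = SparkContext(appName='tokenize_and_ner_inputs')
    try:
        main(sc)
    finally:
        sc.stop()

With an entry point like this, the job can be launched with, for example, spark-submit spark_scripts/tokenize_and_ner_inputs.py --corpus <corpus.raw> --output-dir output/.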