def main(sc)

in spark_scripts/create_squad_ner_dataset.py


import argparse
import logging
import os
from contextlib import ExitStack


def main(sc):
    argp = argparse.ArgumentParser()
    argp.add_argument('--squad-rc-dir', help='input directory containing the SQuAD RC jsonl files', required=True)
    argp.add_argument('--output-dir', help='output path for the generated dataset', required=True)
    argp.add_argument('--num-partitions', type=int, default=1000, help='number of Spark partitions')
    argp.add_argument('--debug-save', help='debug mode: process only the dev set', action='store_true')
    args = argp.parse_args()

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    input_dct = {
        'train': 'squad_rc_train.jsonl',
        'dev': 'squad_rc_dev.jsonl',
        'test': 'squad_rc_test.jsonl',
    }
    if args.debug_save:
        # debug_save mode only runs for dev set
        input_dct = {'dev': input_dct['dev']}

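    # ExitStack keeps each channel's metric file open until all jobs have run,
    # then closes them together when the block exits.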
    with ExitStack() as stack:
        for channel, jsonl_basename in input_dct.items():
            metric_filename = os.path.join(args.output_dir, 'metric_{}.txt'.format(channel))
            metric_fptr = stack.enter_context(open(metric_filename, 'w'))

            input_jsonl_filepath = os.path.join(args.squad_rc_dir, jsonl_basename)
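            # _run_job (defined elsewhere in this script) is assumed to build
            # the NER examples for this channel and report stats via metric_fptr.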
            _run_job(input_jsonl_filepath, channel, metric_fptr, args)

        logging.info('Output directory: {}'.format(args.output_dir))
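
For context, a minimal sketch of how this entry point might be driven, assuming `sc` is a pyspark SparkContext created by the caller; the app name and invocation paths below are hypothetical, not taken from the script.

from pyspark import SparkContext

if __name__ == '__main__':
    # Hypothetical wiring: create the SparkContext and hand it to main(),
    # which parses its own command-line arguments.
    sc = SparkContext(appName='create_squad_ner_dataset')
    try:
        main(sc)
    finally:
        sc.stop()

# Example invocation (paths are placeholders):
#   spark-submit spark_scripts/create_squad_ner_dataset.py \
#       --squad-rc-dir data/squad_rc --output-dir output/ --num-partitions 1000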