in spark_scripts/create_squad_ner_dataset.py [0:0]
import argparse
import logging
import os
from contextlib import ExitStack


def main(sc):
    argp = argparse.ArgumentParser()
    argp.add_argument('--squad-rc-dir', help='input path of squad data', required=True)
    argp.add_argument('--output-dir', default='output/', help='output path for generated dataset and metric files')
    argp.add_argument('--num-partitions', type=int, default=1000, help='number of Spark partitions to use')
    argp.add_argument('--debug-save', help='for debugging purposes', action='store_true')
    args = argp.parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # One input JSONL file per channel (train/dev/test).
    input_dct = {
        'train': 'squad_rc_train.jsonl',
        'dev': 'squad_rc_dev.jsonl',
        'test': 'squad_rc_test.jsonl',
    }
    if args.debug_save:
        # debug_save mode only runs for the dev set
        input_dct = {'dev': input_dct['dev']}

    # Keep one metric file open per channel for the duration of the loop.
    with ExitStack() as stack:
        for channel, jsonl_basename in input_dct.items():
            metric_filename = os.path.join(args.output_dir, 'metric_{}.txt'.format(channel))
            metric_fptr = stack.enter_context(open(metric_filename, 'w'))
            input_jsonl_filepath = os.path.join(args.squad_rc_dir, jsonl_basename)
            _run_job(input_jsonl_filepath, channel, metric_fptr, args)

    logging.info('Output directory: {}'.format(args.output_dir))
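
# For context, a minimal hypothetical entry point is sketched below,
# assuming the script is launched via spark-submit and that main()
# expects a live SparkContext; the appName string is illustrative,
# not taken from the source.
if __name__ == '__main__':
    from pyspark import SparkContext

    logging.basicConfig(level=logging.INFO)
    sc = SparkContext(appName='create_squad_ner_dataset')
    try:
        main(sc)
    finally:
        sc.stop()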