in spark_scripts/stat_for_ner_category_to_wh_words.py [0:0]
import argparse
import logging
import os
from contextlib import ExitStack

# RcQuestion and _run_stat are project-level helpers defined elsewhere
# in this repository; only main() appears in this excerpt.


def main(sc):
    argp = argparse.ArgumentParser()
    argp.add_argument('--squadner', help='input path of squadner data', required=True)
    argp.add_argument('--output-dir', default='output/', help='directory for output files')
    argp.add_argument('--num-partitions', type=int, default=1000,
                      help='minimum number of partitions for the input RDD')
    argp.add_argument('--debug-save', help='for debugging purposes', action='store_true')
    args = argp.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    # ExitStack keeps all three output files open for the duration of the
    # block and closes them together on exit.
    with ExitStack() as stack:
        metric_filename = os.path.join(args.output_dir, 'metric.txt')
        metric_fptr = stack.enter_context(open(metric_filename, 'w'))
        metric_per_category_fptr = stack.enter_context(open(
            os.path.join(args.output_dir, 'metric_per_category.txt'), 'w'))
        toml_fptr = stack.enter_context(open(
            os.path.join(args.output_dir, 'whxx_ngram_table.toml'), 'w'))

        # One JSON record per input line; each line is deserialized into
        # an RcQuestion before the statistics pass runs over the RDD.
        question_rdd = sc.textFile(
            args.squadner,
            minPartitions=args.num_partitions).map(RcQuestion.deserialize_json)
        _run_stat(question_rdd, metric_fptr, metric_per_category_fptr,
                  toml_fptr, args.output_dir)
    logging.info('Output dir: {}'.format(args.output_dir))
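
Since RcQuestion.deserialize_json is mapped over the raw text lines of the
input, it is presumably a static or class method that parses one JSON object
per line. A hypothetical minimal shape, purely for illustration; the class
body and field names below are assumptions, not the project's actual schema:

    import json

    class RcQuestion(object):
        # Hypothetical stand-in: the real class lives elsewhere in this
        # repository and its fields are not visible in the excerpt above.
        def __init__(self, question, ner_category):
            self.question = question
            self.ner_category = ner_category

        @staticmethod
        def deserialize_json(line):
            # Parse one JSON record per input line (field names assumed).
            obj = json.loads(line)
            return RcQuestion(obj['question'], obj['ner_category'])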
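
The excerpt also omits the driver boilerplate that constructs the
SparkContext passed to main(). A minimal sketch of how such a script is
typically wired up under spark-submit (the app name is illustrative):

    from pyspark import SparkContext

    if __name__ == '__main__':
        # spark-submit runs this file as the driver program; application
        # arguments after the script path stay in sys.argv for argparse.
        sc = SparkContext(appName='stat_for_ner_category_to_wh_words')
        try:
            main(sc)
        finally:
            sc.stop()

It would then be launched with something like
spark-submit spark_scripts/stat_for_ner_category_to_wh_words.py --squadner <input path> --output-dir output/
where the paths are illustrative.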