def main()

in spark_scripts/create_ds_synthetic_dataset.py

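The function relies on module-level imports from the rest of the file; a minimal sketch of the ones it needs is shown below. The standard-library imports follow directly from the code, while the project-local names (utils, Article, PhraseObj, PhraseMode, WhxxNgramTable, SyntheticDataCreator, QUESTION_STYLES_FOR_JSONLINES, _create_jsonl_training_files) are assumed to come from elsewhere in the project and their exact module paths are not confirmed here.

import argparse
import json
import logging
import os
import sys
from contextlib import ExitStack

# Project-local dependencies (module paths are assumptions, not confirmed):
# import utils
# from <project modules> import Article, PhraseObj, PhraseMode, WhxxNgramTable, SyntheticDataCreator
# from <project modules> import QUESTION_STYLES_FOR_JSONLINES, _create_jsonl_training_files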

def main(sc):
    default_config = utils.read_default_config_toml()

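    # CLI options: corpus/output locations, Elasticsearch connection settings,
    # and knobs controlling NER limits and auxiliary entity-match generation.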
    argp = argparse.ArgumentParser()
    argp.add_argument('--corpus', help='input path for corpus data', required=True)
    argp.add_argument('--output-dir', required=True, help='output directory for generated dataset files')
    argp.add_argument('--ulim-count', type=int, default=None,
                      help='approximate output count. Does not change NER ulim')
    argp.add_argument('--es-hosts', help='Elasticsearch hosts (defaults to the AES_HOSTS environment variable)',
                      default=os.getenv('AES_HOSTS'))
    argp.add_argument('--es-index-readonly', help='read-only Elasticsearch index name',
                      default=default_config['es_index_readonly'])
    argp.add_argument('--whxx-ngram-table', help='toml config file', default='resources/whxx_ngram_table.toml')
    argp.add_argument('--num-partitions', type=int, default=1000, help='number of Spark partitions for input RDDs')
    argp.add_argument('--debug-save', help='for debugging purposes', action='store_true')

    argp.add_argument('--ulim-ner', default=None, type=int, help='upper limit of NER')
    argp.add_argument('--ner',
                      help='NER entity2articles folder. If none is given, NER set is computed from corpus.')

    argp.add_argument('--phrase-mode', choices=[e.value for e in PhraseMode], default=PhraseMode.NER_ONLY.value,
                      help='phrase extraction mode; ner_only generates data from NER entities only, skipping noun phrases')

    argp.add_argument('--aux-qs', type=int, dest='nb_aux_qs_matches',
                      help='number of auxiliary entity matches with query sentence', default=0)
    argp.add_argument('--aux-awc', type=int, dest='nb_aux_awc_matches',
                      help='number of additional aux matches anywhere in the context (in addition to aux-qs)',
                      default=0)
    args = argp.parse_args()

    if not (args.es_hosts and args.es_index_readonly):
        argp.error('--es-hosts and --es-index-readonly must be provided')

    logging.info('es index: {}'.format(args.es_index_readonly))

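    # A precomputed NER entity2articles dump can be loaded as an RDD of
    # PhraseObj; when absent, the job derives the NER set from the corpus.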
    if args.ner:
        ner_rdd = sc.textFile(args.ner, minPartitions=args.num_partitions).map(
            lambda x: PhraseObj.import_from(json.loads(x)))
    else:
        ner_rdd = None

    os.makedirs(args.output_dir, exist_ok=True)

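    # Load the whxx ngram table from its TOML config.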
    with open(args.whxx_ngram_table) as fptr:
        whxx_ngram_table = WhxxNgramTable.import_from_toml(fptr)

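    # metric.txt records the invocation command and job metrics alongside the output.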
    with ExitStack() as stack:
        metric_fptr = stack.enter_context(open(os.path.join(args.output_dir, 'metric.txt'), 'w'))
        print('CMD: {}'.format(' '.join(sys.argv)), file=metric_fptr)

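        # The corpus is read as text, one JSON-serialized Article per line.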
        article_rdd = sc.textFile(args.corpus, minPartitions=args.num_partitions).map(Article.deserialize_json)

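        # Configure and run the synthetic dataset creation job on Spark.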
        job = SyntheticDataCreator(
            args.output_dir,
            ulim_count=args.ulim_count,
            es_hosts=args.es_hosts,
            es_index_name=args.es_index_readonly,
            whxx_ngram_table=whxx_ngram_table,
            nb_ner_ulim=args.ulim_ner,
            num_partitions=args.num_partitions,
            nb_aux_qs_matches=args.nb_aux_qs_matches,
            nb_aux_awc_matches=args.nb_aux_awc_matches,
            phrase_mode=PhraseMode(args.phrase_mode),
            debug_save=args.debug_save)
        job.run_job(sc, article_rdd, ner_rdd, metric_fptr)

        logging.info('Output directory: {}'.format(args.output_dir))

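    # Write one JSONL training file per supported question style.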
    for qstyle in QUESTION_STYLES_FOR_JSONLINES:
        _create_jsonl_training_files(args.output_dir, qstyle)