def _create_jsonl_training_files()

in spark_scripts/create_ds_synthetic_dataset.py [0:0]


import logging
import os
import subprocess


def _create_jsonl_training_files(output_dir, qstyle):
    """Sample 200k random JSONL lines from the Spark part files for the given
    question style, then cut 50k- and 100k-line training subsets from them."""
    dest_200k_jsonl_filepath = os.path.join(output_dir, '{}_200k.jsonl'.format(qstyle.value))
    cmd_lst = []

    # Concatenate all Spark part files and draw a random 200k-line sample.
    cmd_lst.append('cat {}/part-* | shuf -n 200000 > {}'.format(
        os.path.join(output_dir, qstyle.value),
        dest_200k_jsonl_filepath))

    # The 200k file is already shuffled, so `head` yields random 50k/100k subsets.
    for kilo_head_count in [50, 100]:
        dest_jsonl_filepath = os.path.join(output_dir, '{}_{}k.jsonl'.format(qstyle.value, kilo_head_count))
        cmd_lst.append('head -n {} {} > {}'.format(
            kilo_head_count * 1000,
            dest_200k_jsonl_filepath,
            dest_jsonl_filepath))

    for cmd in cmd_lst:
        logging.info('# {}'.format(cmd))
        subprocess.run(cmd, shell=True, check=True)  # raise if the shell command exits non-zero
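
For orientation, a minimal usage sketch. The QStyle enum and the /tmp path below are illustrative assumptions, not definitions taken from the script:

import enum


class QStyle(enum.Enum):
    # Hypothetical stand-in; the real script defines its own question styles.
    NATURAL = 'natural'


# Assuming Spark has already written part-* files to /tmp/ds_synth/natural,
# this produces natural_200k.jsonl, natural_50k.jsonl, and natural_100k.jsonl
# under /tmp/ds_synth.
_create_jsonl_training_files('/tmp/ds_synth', QStyle.NATURAL)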