in spark_scripts/create_ds_synthetic_dataset.py
import logging
import os
import subprocess


def _create_jsonl_training_files(output_dir, qstyle):
    # Shuffle all Spark part files for this question style into a single
    # 200k-line JSONL training file.
    dest_200k_jsonl_filepath = os.path.join(output_dir, '{}_200k.jsonl'.format(qstyle.value))
    cmd_lst = []
    cmd_lst.append('cat {}/part-* | shuf -n 200000 > {}'.format(
        os.path.join(output_dir, qstyle.value),
        dest_200k_jsonl_filepath))
    # Derive 50k and 100k subsets by taking prefixes of the already-shuffled
    # 200k file, so the smaller sets are nested within the larger one.
    for kilo_head_count in [50, 100]:
        dest_jsonl_filepath = os.path.join(output_dir, '{}_{}k.jsonl'.format(qstyle.value, kilo_head_count))
        cmd_lst.append('head -n {} {} > {}'.format(
            kilo_head_count * 1000,
            dest_200k_jsonl_filepath,
            dest_jsonl_filepath))
    for cmd in cmd_lst:
        logging.info('# {}'.format(cmd))
        # check=True surfaces a failure in any shell command immediately
        # instead of silently continuing to the next one.
        subprocess.run(cmd, shell=True, check=True)
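
A minimal usage sketch follows, assuming qstyle is an Enum whose .value names the Spark output subdirectory under output_dir. The QuestionStyle enum, its member, and the /data/synthetic path are illustrative assumptions, not taken from the source.

import enum
import logging


class QuestionStyle(enum.Enum):
    # Hypothetical enum; the real script presumably defines its own styles.
    MULTIPLE_CHOICE = 'multiple_choice'


logging.basicConfig(level=logging.INFO)

# Expects Spark part files under <output_dir>/<qstyle.value>/part-*,
# e.g. /data/synthetic/multiple_choice/part-00000. Produces
# multiple_choice_200k.jsonl, _100k.jsonl, and _50k.jsonl in output_dir.
_create_jsonl_training_files('/data/synthetic', QuestionStyle.MULTIPLE_CHOICE)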