misc/reference_datasets/monolingual/te/download_sangraha.py (39 lines of code) (raw):

# Submits two Slurm jobs that read Telugu parquet shards of the
# ai4bharat/sangraha dataset from the Hugging Face hub and re-write them as
# gzipped JSONL files.
#
# NOTE(review): both jobs below use the same output folder, the same
# "${rank}.jsonl.gz" filename template and the same logging directory, and the
# recursive globs of the second job look like they match the same shards as
# the three explicit globs of the first job. Confirm that running both is
# intended -- TODO verify against datatrove's completion/overwrite behaviour.

from datatrove.executor import SlurmPipelineExecutor
from datatrove.io import DataFolder  # not referenced in the visible code; kept as in the original
from datatrove.pipeline.readers import ParquetReader
from datatrove.pipeline.writers import JsonlWriter

SANGRAHA_REPO = "hf://datasets/ai4bharat/sangraha"
OUTPUT_DIR = "/path/to/ref-datasets/monolingual/te/sangraha"
LOGS_DIR = "/path/to/logs/dataset_download_logs/te/sangraha"

# ---------------------------------------------------------------------------
# Job 1: one reader per named subset, each tagging its documents with the
# subset it came from and taking document ids from the "doc_id" column.
# ---------------------------------------------------------------------------
subset_globs = {
    "synthetic": "synthetic/tel_Telu/*.parquet",
    "verified": "verified/tel/*.parquet",
    "unverified": "unverified/tel/*.parquet",
}

per_subset_readers = [
    ParquetReader(
        SANGRAHA_REPO,
        glob_pattern=pattern,
        id_key="doc_id",
        default_metadata={"language": "te", "subset": subset},
    )
    for subset, pattern in subset_globs.items()
]

per_subset_job = SlurmPipelineExecutor(
    pipeline=[
        *per_subset_readers,
        JsonlWriter(
            OUTPUT_DIR,
            output_filename="${rank}.jsonl.gz",
            max_file_size=2 * 2**30,  # presumably bytes, i.e. 2 GiB per file -- TODO confirm
        ),
    ],
    tasks=32,
    randomize_start_duration=3 * 60,  # presumably seconds -- TODO confirm
    time="11:59:59",
    job_name="dl_sangraha",
    cpus_per_task=64,
    mem_per_cpu_gb=1,
    partition="partition",
    srun_args={"environment": "train"},
    logging_dir=LOGS_DIR,
)
per_subset_job.run()

# ---------------------------------------------------------------------------
# Job 2: two readers using recursive globs, with progress reporting enabled
# and the text column named explicitly; no subset tag and no id_key here.
# ---------------------------------------------------------------------------
recursive_globs = ("**/tel/*.parquet", "**/tel_Telu/*.parquet")

recursive_readers = [
    ParquetReader(
        SANGRAHA_REPO,
        glob_pattern=pattern,
        default_metadata={"language": "te"},
        doc_progress=True,
        file_progress=True,
        text_key="text",
    )
    for pattern in recursive_globs
]

recursive_job = SlurmPipelineExecutor(
    job_name="sangraha",
    pipeline=[
        *recursive_readers,
        JsonlWriter(
            OUTPUT_DIR,
            output_filename="${rank}.jsonl.gz",
            max_file_size=2 * 2**30,  # presumably bytes, i.e. 2 GiB per file -- TODO confirm
        ),
    ],
    logging_dir=LOGS_DIR,
    randomize_start_duration=3 * 60,  # presumably seconds -- TODO confirm
    tasks=100,
    mem_per_cpu_gb=4,
    partition="partition",
    time="11:59:59",
)
recursive_job.run()