misc/reference_datasets/multilingual/download_culturax.py:

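# Copies the CulturaX dataset from the Hugging Face Hub into per-language gzipped
# JSONL shards. In uonlp/CulturaX the first path component of each parquet file is
# a language code, so the adapter below records it as metadata["language"], and
# the writer's ${language} filename template splits the output by language.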
from datatrove.executor import SlurmPipelineExecutor
from datatrove.pipeline.readers import ParquetReader
from datatrove.pipeline.writers import JsonlWriter


def adapter(self, data: dict, path: str, id_in_file: int | str):
    """
    Adapt a raw input sample into the datatrove Document format, recording the
    sample's language (the first component of its file path) in the metadata.

    Args:
        data: a dictionary with the "raw" representation of the data
        path: file path or source for this sample
        id_in_file: its id in this particular file or source

    Returns: a dictionary with text, id, media and metadata fields
    """
    return {
        "text": data.pop(self.text_key, ""),
        "id": data.pop(self.id_key, f"{path}/{id_in_file}"),
        "media": data.pop("media", []),
        # remaining data goes into metadata
        "metadata": {"language": path.split("/")[0]} | data.pop("metadata", {}) | data,
    }


SlurmPipelineExecutor(
    job_name="culturax",
    pipeline=[
        ParquetReader("hf://datasets/uonlp/CulturaX", glob_pattern="*/*.parquet", adapter=adapter),
        JsonlWriter("/path/to/ref-datasets/culturax", output_filename="${language}/${rank}.jsonl.gz"),
    ],
    tasks=1000,
    mem_per_cpu_gb=4,
    logging_dir="/path/to/logs/multilingual/copy/culturax",
    partition="partition",
    randomize_start_duration=3 * 60,  # stagger task start times by up to 3 minutes
    time="20:00:00",
).run()
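# To launch, run this script from a node that can submit Slurm jobs: .run() writes
# an sbatch file and submits the pipeline as a job array of 1000 tasks. Note the
# "/path/to/..." paths and the "partition" name are placeholders to fill in for
# your cluster.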