def run()

in misc/reference_datasets/monolingual/zh/download_mapcc.py [0:0]


    def run(self, data, rank: int = 0, world_size: int = 1):
        """Download this rank's share of the MAP-CC ``zh_cc.jsonl.gz*`` files.

        Lists the files matching ``zh_cc.jsonl.gz*`` under
        ``hf://datasets/m-a-p/MAP-CC``, takes every ``world_size``-th file
        starting at index ``rank``, and streams each selected file into the
        local output directory while displaying a per-file progress bar.

        Args:
            data: Not used by this method.
            rank: Index of this worker; used as the start offset into the
                file list.
            world_size: Total number of workers; used as the stride through
                the file list.
        """
        import os

        from datatrove.io import get_datafolder
        from tqdm import tqdm

        # Kept for backward compatibility: earlier behavior always created a
        # "results" directory in the working directory, although nothing in
        # this method writes to it.
        os.makedirs("results", exist_ok=True)

        # The files are actually written here, so this is the directory that
        # must exist before the first open(..., "wb") below.
        output_path = "/path/to/ref-datasets/mapcc"
        os.makedirs(output_path, exist_ok=True)

        # Remote folder holding the dataset on the Hugging Face Hub.
        df = get_datafolder("hf://datasets/m-a-p/MAP-CC")

        # Sort explicitly so that every rank slices the same ordering and the
        # rank::world_size shards neither overlap nor miss files, regardless
        # of the order in which the listing comes back.
        files_to_download = sorted(
            df.list_files(recursive=True, glob_pattern="zh_cc.jsonl.gz*")
        )

        # 1 MiB reads keep the number of remote read calls and progress-bar
        # updates reasonable for multi-gigabyte files.
        chunk_size = 1024 * 1024

        for file in files_to_download[rank::world_size]:
            print(file)
            print(f"Downloading {file}")
            file_name = os.path.basename(file)
            output_file = f"{output_path}/{file_name}"

            # The size is only used as the progress bar total.
            file_size = df.info(file)["size"]

            # Stream the remote file to disk in binary mode, chunk by chunk,
            # so the whole file is never held in memory.
            with df.open(file, "rb") as source, open(output_file, "wb") as dest:
                with tqdm(total=file_size, unit="B", unit_scale=True, desc=file_name) as pbar:
                    # read() returns b"" at end of file, which stops the iterator.
                    for chunk in iter(lambda: source.read(chunk_size), b""):
                        dest.write(chunk)
                        pbar.update(len(chunk))