in misc/reference_datasets/monolingual/zh/download_mapcc.py [0:0]
def run(self, data, rank: int = 0, world_size: int = 1):
    """Download this rank's share of the MAP-CC ``zh_cc.jsonl.gz*`` files.

    Lists the files matching ``zh_cc.jsonl.gz*`` in the
    ``hf://datasets/m-a-p/MAP-CC`` data folder, keeps every
    ``world_size``-th file starting at index ``rank``, and streams each one
    to the local output directory while showing a byte progress bar.

    Args:
        data: Not used by this method.
        rank: Index of this worker; selects the first file of its stride.
        world_size: Total number of workers; the stride between files.

    Returns:
        None. The downloaded files on disk are the only result.
    """
    import os

    from datatrove.io import get_datafolder
    from tqdm import tqdm

    output_path = "/path/to/ref-datasets/mapcc"
    # Read size per request; large enough to keep per-call overhead low on a
    # remote stream while still updating the progress bar regularly.
    chunk_size = 1024 * 1024

    # Create the directory the files are actually written to; without this
    # the first open(..., "wb") below fails when the directory is missing.
    os.makedirs(output_path, exist_ok=True)

    df = get_datafolder("hf://datasets/m-a-p/MAP-CC")

    # Sort so the rank::world_size striding picks the same partition on every
    # worker, whatever order the listing comes back in.
    files_to_download = sorted(
        df.list_files(recursive=True, glob_pattern="zh_cc.jsonl.gz*")
    )

    for file in files_to_download[rank::world_size]:
        print(file)
        print(f"Downloading {file}")
        file_name = os.path.basename(file)
        output_file = os.path.join(output_path, file_name)
        # Total size in bytes, used only to scale the progress bar.
        file_size = df.info(file)['size']
        # Copy in binary mode, chunk by chunk, so a file is never held in
        # memory as a whole.
        with df.open(file, "rb") as source, open(output_file, "wb") as dest:
            with tqdm(total=file_size, unit='B', unit_scale=True, desc=file_name) as pbar:
                while True:
                    chunk = source.read(chunk_size)
                    if not chunk:
                        break
                    dest.write(chunk)
                    pbar.update(len(chunk))