filtering/deduplication/save_dataset.py (9 lines of code) (raw):

# Download the English ("en") configuration of the OSCAR-2201 corpus from the
# Hugging Face Hub and write it to a local directory with `save_to_disk`.
#
# The Hub access token is read from the HUGGINGFACE_TOKEN environment variable.

import os
from multiprocessing import cpu_count

from datasets import load_dataset

# Access token for the Hub; None when the environment variable is unset.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

# Directory the loaded dataset is saved to.
OUTPUT_DIR = "/home/piktus_huggingface_co/lumi/oscar/"


def main():
    """Load OSCAR-2201 ("en") using every CPU core and save it to OUTPUT_DIR."""
    # Report only whether a token was found. The token is a credential and
    # must never be echoed to stdout, where it would end up in job logs.
    token_status = "set" if HUGGINGFACE_TOKEN else "not set"
    print(f"HUGGINGFACE_TOKEN is {token_status}")

    # NOTE(review): newer `datasets` releases appear to rename
    # `use_auth_token` -> `token` and `ignore_verifications` ->
    # `verification_mode`; confirm against the version pinned for this project
    # before upgrading.
    oscar = load_dataset(
        "oscar-corpus/OSCAR-2201",
        "en",
        use_auth_token=HUGGINGFACE_TOKEN,
        num_proc=cpu_count(),
        ignore_verifications=True,
    )
    oscar.save_to_disk(OUTPUT_DIR)


# The guard keeps worker processes (num_proc > 1) from re-running the download
# when the main module is re-imported under the "spawn" start method.
if __name__ == "__main__":
    main()