filtering/deduplication/download_oscar.py (15 lines of code) (raw):

import os from collections import Counter from datasets import load_dataset HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN") oscar = load_dataset( "oscar-corpus/OSCAR-2201", "en", use_auth_token=HUGGINGFACE_TOKEN, num_proc=128, ignore_verifications=True, ) # oscar.save_to_disk("/home/piktus_huggingface_co/lumi/oscar/") oscar_ids = oscar["train"]["id"] print("Number of Oscar IDs", len(oscar_ids)) unique_ids = Counter(oscar_ids) print(unique_ids.most_common(10))