in filtering/deduplication/save_dataset_sample.py [0:0]
def save_dataset(dataset_name, base_dir, sample_size=100000, token=None):
    """Push a shuffled sample of a locally saved dataset to the Hugging Face Hub.

    Loads the dataset from ``base_dir + dataset_name``, shuffles its
    ``train`` split, shrinks ``sample_size`` by factors of 10 until it fits
    the split, and pushes the sample as a private repo named
    ``ola13/small-<dataset_name>``.

    Args:
        dataset_name: Name of the dataset directory under ``base_dir``.
        base_dir: Path prefix the dataset was saved under. NOTE(review): it
            is concatenated directly with ``dataset_name``, so it must end
            with a path separator — confirm against callers.
        sample_size: Upper bound on the number of rows to sample.
        token: Hugging Face auth token forwarded to ``push_to_hub``.
    """
    print("Processing", dataset_name)
    ds = load_from_disk(base_dir + dataset_name)
    # BUG FIX: ``shuffle`` is not in-place — it returns a new dataset and the
    # original code discarded the result, so the "sample" was just the first
    # ``sample_size`` rows in original order. Keep the shuffled split instead.
    train = ds["train"].shuffle()
    # Shrink the sample until it fits the split. For an empty split this
    # bottoms out at 0 (0 > 0 is False), yielding an empty sample rather
    # than looping forever.
    while sample_size > len(train):
        sample_size //= 10
    small_ds = DatasetDict({"train": train.select(range(sample_size))})
    small_ds.push_to_hub("ola13/small-" + dataset_name, private=True, token=token)
    print("Pushed", dataset_name, "to hub.")