# save_dataset()
#
# From filtering/deduplication/save_dataset_sample.py


def save_dataset(dataset_name, base_dir, sample_size=100000, token=None):
    """Push a shuffled sample of a locally saved dataset to the Hugging Face Hub.

    Loads the dataset from ``base_dir + dataset_name`` (plain string
    concatenation — ``base_dir`` is expected to end with a path separator),
    shuffles its ``train`` split, and pushes the first ``sample_size`` rows of
    the shuffled split to the Hub as a private dataset named
    ``ola13/small-<dataset_name>``.

    Args:
        dataset_name: Name of the dataset directory under ``base_dir``; also
            used to build the Hub repo name.
        base_dir: Directory prefix the dataset was saved under with
            ``save_to_disk``.
        sample_size: Desired number of rows; shrunk by factors of 10 until it
            fits the dataset (so a small dataset yields a smaller sample).
        token: Hugging Face auth token forwarded to ``push_to_hub``.
    """
    print("Processing", dataset_name)
    ds = load_from_disk(base_dir + dataset_name)
    # BUG FIX: Dataset.shuffle() is NOT in-place — it returns a new dataset.
    # The original called ds.shuffle() and discarded the result, so the
    # "sample" was just the first rows in original order.
    shuffled = ds["train"].shuffle()
    # Shrink the sample until it fits the dataset (reaches 0 for an empty split,
    # in which case an empty sample is pushed).
    while sample_size > len(shuffled):
        sample_size //= 10
    small_ds = DatasetDict({"train": shuffled.select(range(sample_size))})
    small_ds.push_to_hub("ola13/small-" + dataset_name, private=True, token=token)
    print("Pushed", dataset_name, "to hub.")