# save_split_sharded_already_splitted_dataset()
# from obelics/processors/web_document_extractor.py

def save_split_sharded_already_splitted_dataset(dataset, path_save_dir_sharded_dataset, shard_size):
    """Save an already-split dataset to disk as contiguous shards per split.

    Expects `dataset` to contain exactly the "train" and "valid" splits
    (the manifest written below hard-codes them — TODO confirm no other
    splits are ever present). Each split is cut into ceil(len/shard_size)
    contiguous shards saved under
    `path_save_dir_sharded_dataset/<split>/shard_<idx>`.

    Args:
        dataset: mapping of split name -> dataset object exposing
            `__len__`, `.shard(num_shards=, index=, contiguous=)` and
            `.save_to_disk(path)` (presumably a HF `DatasetDict` — verify).
        path_save_dir_sharded_dataset: destination directory; created if missing.
        shard_size: maximum number of examples per shard.
    """

    def save_split_ds(split_dataset, split_name):
        # Enough shards so that each holds at most `shard_size` examples.
        num_shards = math.ceil(len(split_dataset) / shard_size)
        for idx in tqdm(range(num_shards)):
            shard = split_dataset.shard(num_shards=num_shards, index=idx, contiguous=True)
            shard.save_to_disk(os.path.join(path_save_dir_sharded_dataset, split_name, f"shard_{idx}"))

    os.makedirs(path_save_dir_sharded_dataset, exist_ok=True)

    # Write the split manifest. Using a context manager guarantees the file
    # handle is closed even if the write raises (the original open/close
    # pair leaked the handle on error).
    with open(os.path.join(path_save_dir_sharded_dataset, "dataset_dict.json"), "w", encoding="utf-8") as f:
        f.write('{"splits": ["train", "valid"]}')

    os.makedirs(os.path.join(path_save_dir_sharded_dataset, "train"), exist_ok=True)
    os.makedirs(os.path.join(path_save_dir_sharded_dataset, "valid"), exist_ok=True)

    logger.info("Starting sharding the dataset")
    train_dataset = dataset["train"]
    valid_dataset = dataset["valid"]

    save_split_ds(train_dataset, "train")
    save_split_ds(valid_dataset, "valid")
    logger.info("Finished sharding the dataset")