# obelics/processors/web_document_extractor.py
def save_split_sharded_already_splitted_dataset(dataset, path_save_dir_sharded_dataset, shard_size):
    """Save an already-split dataset to disk as fixed-size shards.

    Writes a `dataset_dict.json` marker listing the splits, then saves each
    split ("train" and "valid") as contiguous shards of at most `shard_size`
    rows under `<path_save_dir_sharded_dataset>/<split>/shard_<idx>`.

    Args:
        dataset: Mapping-style dataset with "train" and "valid" splits; each
            split must expose `shard(...)` and `save_to_disk(...)` (presumably
            a HuggingFace `DatasetDict` — TODO confirm).
        path_save_dir_sharded_dataset: Root directory for the sharded output
            (created if missing).
        shard_size: Maximum number of rows per shard.
    """

    def save_split_ds(split_dataset, split_name):
        # Enough shards so that each one holds at most `shard_size` rows.
        num_shards = math.ceil(len(split_dataset) / shard_size)
        for idx in tqdm(range(num_shards)):
            # contiguous=True keeps row order: shard i gets the i-th slice.
            shard = split_dataset.shard(num_shards=num_shards, index=idx, contiguous=True)
            shard.save_to_disk(os.path.join(path_save_dir_sharded_dataset, split_name, f"shard_{idx}"))

    os.makedirs(path_save_dir_sharded_dataset, exist_ok=True)
    # Use a context manager so the file is closed even if write() raises
    # (the original open/close pair leaked the handle on error).
    with open(os.path.join(path_save_dir_sharded_dataset, "dataset_dict.json"), "w") as f:
        f.write('{"splits": ["train", "valid"]}')
    os.makedirs(os.path.join(path_save_dir_sharded_dataset, "train"), exist_ok=True)
    os.makedirs(os.path.join(path_save_dir_sharded_dataset, "valid"), exist_ok=True)
    logger.info("Starting sharding the dataset")
    save_split_ds(dataset["train"], "train")
    save_split_ds(dataset["valid"], "valid")
    logger.info("Finished sharding the dataset")