in services/worker/src/worker/job_runners/config/parquet_and_info.py [0:0]
def list_generated_parquet_files(builder: DatasetBuilder, partial: bool = False) -> list[LocalParquetFile]:
    """List the parquet files generated by `builder.download_and_prepare` in the `builder.cache_dir`."""
    if not builder.info.splits:
        raise EmptyDatasetError("No split found after generating parquet files")
    parquet_files: list[LocalParquetFile] = []
    for split_name, split_info in builder.info.splits.items():
        # The `datasets` library names shard files with a fixed template:
        # - {builder.dataset_name}-{split}.parquet when the split fits in a single shard
        # - {builder.dataset_name}-{split}-{shard_idx:05d}-of-{num_shards:05d}.parquet otherwise
        shard_lengths = split_info.shard_lengths
        num_shards = len(shard_lengths) if isinstance(shard_lengths, list) else 1
        if num_shards > 1:
            # Leave {shard_idx} as a placeholder; it is filled per shard below.
            filename_template = f"{builder.dataset_name}-{split_name}" + "-{shard_idx:05d}-of-" + f"{num_shards:05d}.parquet"
        else:
            filename_template = f"{builder.dataset_name}-{split_name}.parquet"
        for shard_idx in range(num_shards):
            parquet_files.append(
                LocalParquetFile(
                    local_file=os.path.join(
                        builder.cache_dir,
                        filename_template.format(shard_idx=shard_idx),
                    ),
                    local_dir=builder.cache_dir,
                    config=builder.config.name,
                    split=split_name,
                    shard_idx=shard_idx,
                    num_shards=num_shards,
                    partial=partial,
                )
            )
    return parquet_files