in services/worker/src/worker/job_runners/config/parquet_and_info.py [0:0]
def copy_parquet_files(builder: DatasetBuilder) -> list[CommitOperationCopy]:
    """Build commit operations that copy existing Hub parquet files via their git LFS pointers.

    Instead of re-uploading the parquet shards, each file is copied inside the Hub
    repository by referencing its LFS pointer at the source revision.

    Args:
        builder: dataset builder whose config's ``data_files`` map splits to parquet
            files already hosted on the Hub, in the form
            ``hf://datasets/{repo_id}@{revision}/{path_in_repo}``.

    Returns:
        One `CommitOperationCopy` per parquet shard, targeting the destination path
        computed by `ParquetFile`.

    Raises:
        EmptyDatasetError: if ``data_files`` is empty or if any split has no files.
    """
    data_files = builder.config.data_files
    if not data_files:
        raise EmptyDatasetError("Empty parquet data_files")
    empty_splits = [split for split in data_files if not data_files[split]]
    if empty_splits:
        raise EmptyDatasetError(f"Empty parquet data_files for splits: {empty_splits}")
    parquet_operations = []
    for split in data_files:
        num_shards = len(data_files[split])  # loop-invariant: compute once per split
        for shard_idx, data_file in enumerate(data_files[split]):
            # data_file format for hub files is hf://datasets/{repo_id}@{revision}/{path_in_repo}.
            # Split on the FIRST "@" only (maxsplit=1): an "@" occurring inside
            # path_in_repo must not truncate the path.
            src_revision, src_path_in_repo = data_file.split("@", 1)[1].split("/", 1)
            # revision and path may be percent-encoded in the hf:// URL
            src_revision = unquote(src_revision)
            src_path_in_repo = unquote(src_path_in_repo)
            parquet_file = ParquetFile(
                config=builder.config.name, split=split, shard_idx=shard_idx, num_shards=num_shards
            )
            parquet_operations.append(
                CommitOperationCopy(
                    src_path_in_repo=src_path_in_repo,
                    path_in_repo=parquet_file.path_in_repo,
                    src_revision=src_revision,
                )
            )
    return parquet_operations