# copy_parquet_files()
#
# From: services/worker/src/worker/job_runners/config/parquet_and_info.py

def copy_parquet_files(builder: DatasetBuilder) -> list[CommitOperationCopy]:
    """Build commit operations that copy existing Hub parquet files via their git LFS pointers.

    Copying the LFS pointer files duplicates the parquet shards on the Hub without
    downloading and re-uploading the underlying data.

    Args:
        builder: dataset builder whose ``config.data_files`` maps each split to a list
            of Hub file URLs of the form
            ``hf://datasets/{repo_id}@{revision}/{path_in_repo}``.

    Returns:
        One ``CommitOperationCopy`` per parquet shard, mapping the source path at its
        source revision to the standardized destination path for this config/split.

    Raises:
        EmptyDatasetError: if ``data_files`` is empty, or if any split has no files.
    """
    data_files = builder.config.data_files
    if not data_files:
        raise EmptyDatasetError("Empty parquet data_files")
    empty_splits = [split for split in data_files if not data_files[split]]
    if empty_splits:
        raise EmptyDatasetError(f"Empty parquet data_files for splits: {empty_splits}")
    parquet_operations = []
    for split, split_data_files in data_files.items():
        # Loop-invariant: total shard count for this split.
        num_shards = len(split_data_files)
        for shard_idx, data_file in enumerate(split_data_files):
            # data_file format for hub files is hf://datasets/{repo_id}@{revision}/{path_in_repo}.
            # Split on the FIRST "@" only: a literal "@" appearing later (e.g. in the
            # path) must not truncate the remainder.
            src_revision, src_path_in_repo = data_file.split("@", 1)[1].split("/", 1)
            # Revision and path are URL-quoted in the hf:// URL.
            src_revision = unquote(src_revision)
            src_path_in_repo = unquote(src_path_in_repo)
            parquet_file = ParquetFile(
                config=builder.config.name, split=split, shard_idx=shard_idx, num_shards=num_shards
            )
            parquet_operations.append(
                CommitOperationCopy(
                    src_path_in_repo=src_path_in_repo,
                    path_in_repo=parquet_file.path_in_repo,
                    src_revision=src_revision,
                )
            )
    return parquet_operations