in services/worker/src/worker/job_runners/config/parquet_and_info.py [0:0]
def get_total_files_size(urlpaths: list[str], storage_options: dict[str, Any]) -> int:
    total_size = 0
    fs = HfFileSystem(**storage_options["hf"])
    # The fastest way to get the sizes of files hosted on the Hub is get_paths_info.
    # split("::") keeps only the innermost segment of chained fsspec URLs before resolving.
    hf_paths = [fs.resolve_path(path.split("::")[-1]) for path in urlpaths if path.startswith("hf://")]
    for repo_id, hf_paths_in_repo in groupby(hf_paths, key=lambda path: path.repo_id):
        batches = list(batched((path.path_in_repo for path in hf_paths_in_repo), 200))  # max is 1k files per request
        paths_info_per_batch = thread_map(
            functools.partial(fs._api.get_paths_info, repo_type="dataset"), [repo_id] * len(batches), batches
        )
        total_size += sum(
            path_info.size
            for paths_info in paths_info_per_batch
            for path_info in paths_info
            if isinstance(path_info, RepoFile)
        )
    # For the other (non-hf://) files we simply query the remote filesystem via fsspec.
    external_paths = [path for path in urlpaths if not path.startswith("hf://")]
    total_size += sum(
        size
        for size in thread_map(
            functools.partial(_fsspec_request_size, storage_options=storage_options), external_paths
        )
        if size
    )
    return total_size
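
For context, here is a minimal sketch of what the `_fsspec_request_size` helper called above could look like, assuming it opens each external URL through fsspec and reads the size reported by the file object; the name, parameters, and Optional return are taken from the call site, but the body and the usage snippet below are illustrative assumptions, not the repository's actual code.

    import fsspec
    from typing import Any, Optional

    def _fsspec_request_size(urlpath: str, storage_options: dict[str, Any]) -> Optional[int]:
        # Hypothetical sketch. Open the (possibly chained, e.g. "zip://data.csv::https://host/archive.zip")
        # URL; fsspec routes the protocol-keyed entries of storage_options to the matching filesystem.
        with fsspec.open(urlpath, **storage_options) as f:
            # Remote file objects usually expose their length as `.size`; servers that omit
            # Content-Length may not, hence the Optional return (filtered out by `if size` above).
            size = getattr(f, "size", None)
            return size if isinstance(size, int) else None

    # Hypothetical usage: a mix of Hub-hosted and external files.
    total = get_total_files_size(
        urlpaths=[
            "hf://datasets/some-user/some-dataset/data/train.csv",
            "https://example.com/extra.csv",
        ],
        storage_options={"hf": {"endpoint": "https://huggingface.co", "token": None}, "https": {}},
    )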