in services/worker/src/worker/job_runners/dataset/info.py [0:0]
def compute_dataset_info_response(dataset: str) -> tuple[DatasetInfoResponse, float]:
    """
    Build the 'dataset-info' response for one dataset hosted on huggingface.co.

    The config names are read from the cached 'dataset-config-names' step, then the cached
    'config-info' response of every config is gathered into a single response.

    Args:
        dataset (`str`):
            A namespace (user or an organization) and a repo name separated by a `/`.

    Raises:
        [~`libcommon.simple_cache.CachedArtifactError`]:
            If the previous step gave an error.
        [~`libcommon.exceptions.PreviousStepFormatError`]:
            If the content of the previous step doesn't have the expected format.

    Returns:
        `tuple[DatasetInfoResponse, float]`: The aggregated dataset_info response, together with a
        progress value between 0. and 1.: the fraction of configs for which a 'config-info' entry
        was found in the cache. Configs with no cached entry are listed in `pending`; configs whose
        cached entry is an error are listed in `failed` (they still count towards the progress).
        The progress is 1.0 when there are no configs at all.
    """
    logging.info(f"compute 'dataset-info' for {dataset=}")

    names_response = get_previous_step_or_raise(kind="dataset-config-names", dataset=dataset)
    names_content = names_response["content"]
    if "config_names" not in names_content:
        raise PreviousStepFormatError("Previous step did not return the expected content: 'config_names'.")

    infos_by_config: dict[str, Any] = {}
    pending_jobs: list[PreviousJob] = []
    failed_jobs: list[PreviousJob] = []
    is_partial = False
    total = 0

    # Anything unexpected while walking the cached entries is reported as a format error
    # of the previous step.
    try:
        for total, entry in enumerate(names_content["config_names"], start=1):
            config = entry["config"]

            try:
                cached = get_response(kind="config-info", dataset=dataset, config=config)
            except CachedArtifactNotFoundError:
                # No cache entry yet: the config is still pending.
                logging.debug(f"No response found in previous step for {dataset=} {config=}: 'config-info'.")
                pending_jobs.append(PreviousJob(kind="config-info", dataset=dataset, config=config, split=None))
                continue

            if cached["http_status"] == HTTPStatus.OK:
                infos_by_config[config] = cached["content"]["dataset_info"]
                # Once one config is partial, the whole dataset is reported as partial.
                if not is_partial:
                    is_partial = cached["content"]["partial"]
            else:
                logging.debug(f"Previous step gave an error: {cached['http_status']}")
                failed_jobs.append(PreviousJob(kind="config-info", dataset=dataset, config=config, split=None))
    except Exception as e:
        raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e

    # Only pending configs lower the progress; an empty config list counts as fully done.
    if total:
        progress = (total - len(pending_jobs)) / total
    else:
        progress = 1.0

    response = DatasetInfoResponse(
        dataset_info=infos_by_config,
        pending=pending_jobs,
        failed=failed_jobs,
        partial=is_partial,
    )
    return response, progress