in services/worker/src/worker/job_runners/dataset/modalities.py [0:0]
def detect_modalities_from_url_columns(dataset: str) -> set[DatasetModality]:
    """
    Detect modalities of a dataset using the type of URL columns.
    E.g. if a column contains URLs of images.

    Only the first 10 entries of the 'dataset-split-names' response are inspected, and
    the function returns as soon as one split reports a non-empty list of image URL columns.

    Args:
        dataset (`str`):
            A namespace (user or an organization) and a repo name separated by a `/`.

    Raises:
        [~`libcommon.simple_cache.CachedArtifactError`]:
            If the previous step gave an error.
        [~`libcommon.exceptions.PreviousStepFormatError`]:
            If the content of the previous step has not the expected format

    Returns:
        `set[DatasetModality]`: A set of modalities.
    """
    split_names_response = get_previous_step_or_raise(kind="dataset-split-names", dataset=dataset)
    content = split_names_response["content"]
    # `or` (not `and`): a missing key must short-circuit before `content["splits"]` is
    # evaluated, and a present-but-non-list value must be rejected as well.
    if "splits" not in content or not isinstance(content["splits"], list):
        raise PreviousStepFormatError("Previous step did not return the expected content: 'splits'.")

    try:
        for split_item in content["splits"][:10]:  # no need to check all the configs
            config = split_item["config"]
            split = split_item["split"]
            try:
                response = get_response(kind="split-image-url-columns", dataset=dataset, config=config, split=split)
            except CachedArtifactNotFoundError:
                # No cache entry for this split: skip it and try the next one.
                logging.debug("No response found in previous step for this dataset: 'split-image-url-columns'.")
                continue
            if response["http_status"] != HTTPStatus.OK:
                # An errored entry for one split is not fatal: other splits may still answer.
                logging.debug(f"Previous step gave an error: {response['http_status']}.")
                continue
            try:
                # A non-empty "columns" value is enough to report the image modality.
                if response["content"]["columns"]:
                    return {"image"}
            except Exception as e:
                raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e
    except Exception as e:
        # Any malformed split item (missing "config"/"split", wrong types, ...) is reported
        # as a format error of the previous step.
        raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e

    return set()