def detect_modalities_from_url_columns()

in services/worker/src/worker/job_runners/dataset/modalities.py [0:0]


def detect_modalities_from_url_columns(dataset: str) -> set[DatasetModality]:
    """
    Detect modalities of a dataset using the type of URL columns.
    E.g. if a column contains URLs of images.

    Only the first 10 entries of the 'dataset-split-names' response are inspected. For each
    of them, the cached 'split-image-url-columns' response is read; entries with no cached
    response or with a non-OK status are skipped. As soon as one response lists at least
    one column, the "image" modality is returned.

    Args:
        dataset (`str`):
            A namespace (user or an organization) and a repo name separated by a `/`.

    Raises:
        [~`libcommon.simple_cache.CachedArtifactError`]:
            If the previous step gave an error.
        [~`libcommon.exceptions.PreviousStepFormatError`]:
            If the content of the previous step has not the expected format

    Returns:
        `set[DatasetModality]`: A set of modalities: `{"image"}` if an image URL column was
        found in one of the inspected splits, otherwise an empty set.
    """
    split_names_response = get_previous_step_or_raise(kind="dataset-split-names", dataset=dataset)
    content = split_names_response["content"]
    # `or`, not `and`: the content is invalid if the key is missing OR if it is not a list.
    # With `and`, a missing key raised a bare KeyError and a non-list value was never detected.
    if "splits" not in content or not isinstance(content["splits"], list):
        raise PreviousStepFormatError("Previous step did not return the expected content: 'splits'.")

    try:
        for split_item in content["splits"][:10]:  # no need to check all the splits
            config = split_item["config"]
            split = split_item["split"]
            try:
                response = get_response(kind="split-image-url-columns", dataset=dataset, config=config, split=split)
            except CachedArtifactNotFoundError:
                logging.debug("No response found in previous step for this dataset: 'split-image-url-columns'.")
                continue
            if response["http_status"] != HTTPStatus.OK:
                logging.debug(f"Previous step gave an error: {response['http_status']}.")
                continue
            # A malformed response (missing "content"/"columns") raises here and is reported
            # once by the handler below, with the original exception as its cause.
            if response["content"]["columns"]:
                return {"image"}
    except Exception as e:
        raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e

    return set()