def detect_modalities_from

def detect_modalities_from_filetypes()

in services/worker/src/worker/job_runners/dataset/modalities.py [0:0]
39 lines of code
20 McCabe index (conditional complexity)

def detect_modalities_from_filetypes(dataset: str) -> set[DatasetModality]:
    """
    Infer the modalities of a dataset from the file extensions present in its repository.

    The per-extension file counts are read from the "dataset-filetypes" previous step.
    An extension contributes a modality only if its file count reaches 10% (rounded) of
    the total count of files whose extension is in `ALL_EXTENSIONS`. Image extensions
    must additionally reach a minimum count, which is higher when the repository also
    contains files with an extension in `MULTI_ROWS_EXTENSIONS`.

    Args:
        dataset (`str`):
            A namespace (user or an organization) and a repo name separated by a `/`.

    Raises:
        [~`libcommon.simple_cache.CachedArtifactError`]:
            If the previous step gave an error.
        [~`libcommon.exceptions.PreviousStepFormatError`]:
            If the content of the previous step does not have the expected format.

    Returns:
        `set[DatasetModality]`: The detected modalities (possibly empty).
    """
    previous_step = get_previous_step_or_raise(kind="dataset-filetypes", dataset=dataset)
    content = previous_step["content"]
    if not ("filetypes" in content and isinstance(content["filetypes"], list)):
        raise PreviousStepFormatError("Previous step did not return the expected content: 'filetypes'.")
    filetypes = content["filetypes"]

    # Any failure while reading the entries (missing key, unexpected type, ...) is reported
    # as a format error of the previous step.
    try:
        detected: set[DatasetModality] = set()

        # Total number of files with a known extension: the reference for the 10% threshold.
        data_files_count = 0
        for entry in filetypes:
            if entry["extension"] in ALL_EXTENSIONS:
                data_files_count += entry["count"]

        # Does the repository hold at least one file with a "multi rows" extension?
        has_multi_rows_files = False
        for entry in filetypes:
            if entry["extension"] in MULTI_ROWS_EXTENSIONS and entry["count"]:
                has_multi_rows_files = True
                break

        # Ignore file types that are <10% of the data files to avoid false positives.
        # For small totals this rounds to 0, so nothing is filtered out by this threshold.
        min_count = round(0.1 * data_files_count)

        # Images are often used as figures in README, so they get an extra threshold
        # when the repository also contains multi-rows files.
        if has_multi_rows_files:
            min_count_for_image = 10
        else:
            min_count_for_image = 1

        # Ambiguous audio/video extensions are grouped with their primary modality.
        # Audio is tested before video below, so the order of the branches matters.
        audio_like_extensions = AUDIO_EXTENSIONS | AUDIO_BUT_COULD_ALSO_BE_VIDEO_EXTENSIONS
        video_like_extensions = VIDEO_EXTENSIONS | VIDEO_BUT_COULD_ALSO_BE_AUDIO_EXTENSIONS

        for entry in filetypes:
            count = entry["count"]
            if count < min_count:
                # too few files of this type compared to the whole repository
                continue
            extension = entry["extension"]
            # An extension is mapped to at most one modality: the first matching category wins.
            if extension in IMAGE_EXTENSIONS:
                if count < min_count_for_image:
                    continue
                detected.add("image")
            elif extension in audio_like_extensions:
                detected.add("audio")
            elif extension in video_like_extensions:
                detected.add("video")
            elif extension in GEOSPATIAL_EXTENSIONS:
                detected.add("geospatial")
            elif extension in _3D_EXTENSIONS:
                detected.add("3d")
            elif extension in TEXT_EXTENSIONS:
                detected.add("text")
            elif extension in DOCUMENT_EXTENSIONS:
                detected.add("document")
    except Exception as err:
        raise PreviousStepFormatError("Previous step did not return the expected content.", err) from err

    return detected