in services/worker/src/worker/job_runners/dataset/modalities.py [0:0]
def detect_modalities_from_filetypes(dataset: str) -> set[DatasetModality]:
    """
    Infer the modalities of a dataset from the file extensions found in its repository.

    The per-extension file counts are read from the cached 'dataset-filetypes' step. An extension
    only contributes its modality when its file count reaches the thresholds computed below.

    Args:
        dataset (`str`):
            A namespace (user or an organization) and a repo name separated by a `/`.

    Raises:
        [~`libcommon.simple_cache.CachedArtifactError`]:
            If the previous step gave an error.
        [~`libcommon.exceptions.PreviousStepFormatError`]:
            If the content of the previous step does not have the expected format.

    Returns:
        `set[DatasetModality]`: A set of modalities.
    """
    previous_step = get_previous_step_or_raise(kind="dataset-filetypes", dataset=dataset)
    content = previous_step["content"]
    if not ("filetypes" in content and isinstance(content["filetypes"], list)):
        raise PreviousStepFormatError("Previous step did not return the expected content: 'filetypes'.")
    filetypes = content["filetypes"]

    detected: set[DatasetModality] = set()
    try:
        # Only files with a known data extension count towards the total.
        total_count = sum(entry["count"] for entry in filetypes if entry["extension"] in ALL_EXTENSIONS)
        has_multi_rows_files = any(
            entry["count"] for entry in filetypes if entry["extension"] in MULTI_ROWS_EXTENSIONS
        )

        # Ignore extensions that represent <10% of the data files, to avoid false positives.
        min_count = round(0.1 * total_count)
        # Images are often used as figures in README, so an extra threshold applies to them
        # when the repository also contains multi-rows files.
        if has_multi_rows_files:
            min_count_for_image = 10
        else:
            min_count_for_image = 1

        # The combined extension sets do not depend on the entry: build them once.
        audio_extensions = AUDIO_EXTENSIONS | AUDIO_BUT_COULD_ALSO_BE_VIDEO_EXTENSIONS
        video_extensions = VIDEO_EXTENSIONS | VIDEO_BUT_COULD_ALSO_BE_AUDIO_EXTENSIONS

        for entry in filetypes:
            count = entry["count"]
            if count < min_count:
                # Too few files of this type to be considered significant.
                continue
            extension = entry["extension"]
            # The first matching family wins: an extension maps to at most one modality.
            if extension in IMAGE_EXTENSIONS:
                if count < min_count_for_image:
                    continue
                detected.add("image")
            elif extension in audio_extensions:
                detected.add("audio")
            elif extension in video_extensions:
                detected.add("video")
            elif extension in GEOSPATIAL_EXTENSIONS:
                detected.add("geospatial")
            elif extension in _3D_EXTENSIONS:
                detected.add("3d")
            elif extension in TEXT_EXTENSIONS:
                detected.add("text")
            elif extension in DOCUMENT_EXTENSIONS:
                detected.add("document")
    except Exception as e:
        # Any failure while reading the entries means the cached content is malformed.
        raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e
    return detected