in services/worker/src/worker/job_runners/dataset/modalities.py [0:0]
def detect_features_modalities(features: Features) -> set[DatasetModality]:
    """
    Infer the modalities of a dataset config from its column types.

    Three heuristics are applied in turn:

    - every feature handed to the callback by `_visit` is mapped to "audio", "image",
      "text" or "document" according to its type;
    - "tabular" is added when no audio/image feature was found and at least two
      top-level columns are integer or float `Value`s;
    - "timeseries" is added when a top-level column whose name does not contain "emb"
      is a list-like of `Value("float32")`.

    Args:
        features (`datasets.Features`):
            The features of a config.

    Returns:
        `set[DatasetModality]`: The set of detected modalities (possibly empty).
    """
    detected: set[DatasetModality] = set()

    def record_modality(node: FeatureType) -> None:
        # Callback passed to `_visit`; it only adds to the enclosing set and returns None.
        if isinstance(node, Audio):
            detected.add("audio")
            return
        if isinstance(node, Image):
            detected.add("image")
            return
        is_string_value = isinstance(node, Value) and node.dtype in ("string", "large_string")
        if is_string_value or isinstance(node, (Translation, TranslationVariableLanguages)):
            detected.add("text")
            return
        if isinstance(node, Pdf):
            detected.add("document")

    _visit(features, record_modality)

    # Tabular data: at least two top-level numerical columns, and no "media" columns.
    # The count is only computed when neither audio nor image was detected.
    if detected.isdisjoint({"audio", "image"}):
        numeric_column_count = sum(
            1
            for column in features.values()
            if isinstance(column, Value) and ("int" in column.dtype or "float" in column.dtype)
        )
        if numeric_column_count >= 2:
            detected.add("tabular")

    # Time series: a top-level list-like column of float32 values.
    float32 = Value("float32")

    def is_float32_list(column: FeatureType) -> bool:
        # Feature-class containers expose their inner type as `.feature`;
        # a plain Python list carries it as its first element.
        if isinstance(column, (LargeList, Sequence)) and column.feature == float32:
            return True
        return isinstance(column, list) and column[0] == float32

    for column_name, column in features.items():
        if "emb" in column_name:
            # ignore lists of floats that may be embeddings
            continue
        if is_float32_list(column):
            detected.add("timeseries")
            break

    # other idea: detect datasets with only numerical columns and one timestamp column
    # (and ideally be able to detect dates/timestamps even from a column with string type)
    return detected