in src/datasets/load.py [0:0]
def get_module(self) -> DatasetModule:
readme_path = os.path.join(self.path, config.REPOCARD_FILENAME)
standalone_yaml_path = os.path.join(self.path, config.REPOYAML_FILENAME)
dataset_card_data = DatasetCard.load(readme_path).data if os.path.isfile(readme_path) else DatasetCardData()
if os.path.exists(standalone_yaml_path):
with open(standalone_yaml_path, encoding="utf-8") as f:
standalone_yaml_data = yaml.safe_load(f.read())
if standalone_yaml_data:
_dataset_card_data_dict = dataset_card_data.to_dict()
_dataset_card_data_dict.update(standalone_yaml_data)
dataset_card_data = DatasetCardData(**_dataset_card_data_dict)
metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data)
# we need a set of data files to find which dataset builder to use
# because we need to infer module name by files extensions
base_path = Path(self.path, self.data_dir or "").expanduser().resolve().as_posix()
if self.data_files is not None:
patterns = sanitize_patterns(self.data_files)
elif metadata_configs and not self.data_dir and "data_files" in next(iter(metadata_configs.values())):
patterns = sanitize_patterns(next(iter(metadata_configs.values()))["data_files"])
else:
patterns = get_data_patterns(base_path)
data_files = DataFilesDict.from_patterns(
patterns,
base_path=base_path,
allowed_extensions=ALL_ALLOWED_EXTENSIONS,
)
module_name, default_builder_kwargs = infer_module_for_data_files(
data_files=data_files,
path=self.path,
)
data_files = data_files.filter(
extensions=_MODULE_TO_EXTENSIONS[module_name], file_names=_MODULE_TO_METADATA_FILE_NAMES[module_name]
)
module_path, _ = _PACKAGED_DATASETS_MODULES[module_name]
if metadata_configs:
builder_configs, default_config_name = create_builder_configs_from_metadata_configs(
module_path,
metadata_configs,
base_path=base_path,
default_builder_kwargs=default_builder_kwargs,
)
else:
builder_configs: list[BuilderConfig] = [
import_main_class(module_path).BUILDER_CONFIG_CLASS(
data_files=data_files,
**default_builder_kwargs,
)
]
default_config_name = None
builder_kwargs = {
"base_path": self.path,
"dataset_name": camelcase_to_snakecase(Path(self.path).name),
}
if self.data_dir:
builder_kwargs["data_files"] = data_files
# this file is deprecated and was created automatically in old versions of push_to_hub
if os.path.isfile(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME)):
with open(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
legacy_dataset_infos = DatasetInfosDict(
{
config_name: DatasetInfo.from_dict(dataset_info_dict)
for config_name, dataset_info_dict in json.load(f).items()
}
)
if len(legacy_dataset_infos) == 1:
# old config e.g. named "username--dataset_name"
legacy_config_name = next(iter(legacy_dataset_infos))
legacy_dataset_infos["default"] = legacy_dataset_infos.pop(legacy_config_name)
legacy_dataset_infos.update(dataset_infos)
dataset_infos = legacy_dataset_infos
if default_config_name is None and len(dataset_infos) == 1:
default_config_name = next(iter(dataset_infos))
hash = Hasher.hash({"dataset_infos": dataset_infos, "builder_configs": builder_configs})
return DatasetModule(
module_path,
hash,
builder_kwargs,
dataset_infos=dataset_infos,
builder_configs_parameters=BuilderConfigsParameters(
metadata_configs=metadata_configs,
builder_configs=builder_configs,
default_config_name=default_config_name,
),
)