in src/datasets/builder.py [0:0]
def _check_legacy_cache2(self, dataset_module: "DatasetModule") -> Optional[str]:
"""Check for the old cache directory template {cache_dir}/{namespace}___{dataset_name}/{config_name}-xxx from 2.14 and 2.15"""
if (
self.__module__.startswith("datasets.")
and not is_remote_url(self._cache_dir_root)
and not (set(self.config_kwargs) - {"data_files", "data_dir"})
):
from .packaged_modules import _PACKAGED_DATASETS_MODULES_2_15_HASHES
from .utils._dill import Pickler
def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> str:
"""
Used to update hash of packaged modules which is used for creating unique cache directories to reflect
different config parameters which are passed in metadata from readme.
"""
params_to_exclude = {"config_name", "version", "description"}
params_to_add_to_hash = {
param: value
for param, value in sorted(config_parameters.items())
if param not in params_to_exclude
}
m = Hasher()
m.update(hash)
m.update(params_to_add_to_hash)
return m.hexdigest()
namespace = self.repo_id.split("/")[0] if self.repo_id and self.repo_id.count("/") > 0 else None
with patch.object(Pickler, "_legacy_no_dict_keys_sorting", True):
config_id = self.config.name + "-" + Hasher.hash({"data_files": self.config.data_files})
hash = _PACKAGED_DATASETS_MODULES_2_15_HASHES.get(self.name, "missing")
if (
dataset_module.builder_configs_parameters.metadata_configs
and self.config.name in dataset_module.builder_configs_parameters.metadata_configs
):
hash = update_hash_with_config_parameters(
hash, dataset_module.builder_configs_parameters.metadata_configs[self.config.name]
)
legacy_relative_data_dir = posixpath.join(
self.dataset_name if namespace is None else f"{namespace}___{self.dataset_name}",
config_id,
"0.0.0",
hash,
)
legacy_cache_dir = posixpath.join(self._cache_dir_root, legacy_relative_data_dir)
if os.path.isdir(legacy_cache_dir):
return legacy_relative_data_dir