in utils/dataset_utils.py
import logging
from os.path import isdir, join as pjoin

from huggingface_hub import Repository, list_datasets

# Assumed setup: the real module defines `logs` and
# `_load_dotenv_for_cache_on_hub()` elsewhere in this file.
logs = logging.getLogger(__name__)


def pull_cache_from_hub(cache_path, dataset_cache_dir):
    """
    Tries to pull a dataset cache from the Hugging Face Hub if a cache for
    the dataset does not already exist locally. The function expects you to have
    HUB_CACHE_ORGANIZATION=<the organization you've set up on the hub to store your cache>
    and HF_TOKEN=<your hf token> on separate lines in a file named .env at
    the root of this repo.

    Args:
        cache_path (string):
            The path to the local dataset cache that you want.
        dataset_cache_dir (string):
            The name of the dataset repo on the Hugging Face Hub.
    """
    hub_cache_organization, hf_token = _load_dotenv_for_cache_on_hub()
    clone_source = pjoin(hub_cache_organization, dataset_cache_dir)
    if isdir(cache_path):
        logs.warning(
            "Already a local cache for the dataset, so not pulling from the hub.")
    else:
        # Here, dataset_info.id is of the form:
        # <hub cache organization>/<dataset cache dir>
        if dataset_cache_dir in [
                dataset_info.id.split("/")[-1]
                for dataset_info in list_datasets(author=hub_cache_organization,
                                                  use_auth_token=hf_token)]:
            # Cloning the dataset repo into `cache_path` pulls the cached files.
            Repository(local_dir=cache_path,
                       clone_from=clone_source,
                       repo_type="dataset",
                       use_auth_token=hf_token)
            logs.info("Pulled cache from hub!")
        else:
            logs.warning(
                "Asking to pull cache from hub but cannot find cached repo on the hub.")