def pull_cache_from_hub()

in utils/dataset_utils.py [0:0]


def pull_cache_from_hub(cache_path, dataset_cache_dir):
    """
    Try to pull a datasets cache from the huggingface hub if a cache for the
    dataset does not already exist locally.

    The function expects you to have
    HUB_CACHE_ORGANIZATION=<the organization you've set up on the hub to store your cache>
    and HF_TOKEN=<your hf token> on separate lines in a file named .env at the
    root of this repo.

    Args:
        cache_path (string):
            The path to the local dataset cache that you want.
        dataset_cache_dir (string):
            The name of the dataset repo on the huggingface hub.

    Returns:
        None. The outcome (already cached locally, pulled, or not found on the
        hub) is only reported through ``logs``.
    """

    hub_cache_organization, hf_token = _load_dotenv_for_cache_on_hub()

    if isdir(cache_path):
        logs.warning("Already a local cache for the dataset, so not pulling from the hub.")
        return

    # Here, dataset_info.id is of the form: <hub cache organization>/<dataset cache dir>
    # so only the last path component is compared against the requested name.
    cache_is_on_hub = any(
        dataset_info.id.split("/")[-1] == dataset_cache_dir
        for dataset_info in list_datasets(author=hub_cache_organization,
                                          use_auth_token=hf_token)
    )
    if not cache_is_on_hub:
        logs.warning("Asking to pull cache from hub but cannot find cached repo on the hub.")
        return

    # Hub repo ids are always "<organization>/<repo>" with a forward slash,
    # regardless of the local OS path separator, so the id is built with "/"
    # rather than with a filesystem path join.
    clone_source = "/".join((hub_cache_organization, dataset_cache_dir))
    Repository(local_dir=cache_path,
               clone_from=clone_source,
               repo_type="dataset", use_auth_token=hf_token)
    logs.info("Pulled cache from hub!")