in tensorflow_datasets/core/dataset_builder.py
def _should_cache_ds(self, split, shuffle_files, read_config):
"""Returns True if TFDS should auto-cache the dataset."""
# The user can explicitly opt-out from auto-caching
if not read_config.try_autocache:
return False
# Skip datasets with unknown size.
# Even by using heuristic with `download_size` and
# `MANUAL_DOWNLOAD_INSTRUCTIONS`, it wouldn't catch datasets which hardcode
# the non-processed data-dir, nor DatasetBuilder not based on tf-record.
if not self.info.dataset_size:
return False
  # Do not cache big datasets.
  # Instead of using the global size, we could infer the requested bytes from
  # `self.info.splits[split].num_bytes`.
  # The info is available for full splits, and could be approximated
  # for subsplits such as `train[:50%]`.
  # However, if the user creates multiple small splits from a big
  # dataset, those could add up and fill up the entire RAM.
  # 250 MiB is arbitrarily picked. For comparison, CIFAR-10 is about 150 MiB.
  if self.info.dataset_size > 250 * units.MiB:
    return False
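  # (`units.MiB` is 2**20 bytes, so the threshold above works out to
  # 250 * 2**20 = 262,144,000 bytes, roughly 262 MB.)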
  # We do not want to cache data which has more than one shard when
  # shuffling is enabled, as this would effectively disable shuffling
  # (the cached order would be replayed identically on every epoch).
  # An exception is a single shard (shuffling is then a no-op).
  # Another exception is when reshuffling is disabled (the same shuffled
  # order is reused each epoch anyway, so caching it changes nothing).
  num_shards = len(self.info.splits[split].file_instructions)
  if (shuffle_files and
      # Shuffling only matters when reshuffle is True or None (the default).
      read_config.shuffle_reshuffle_each_iteration is not False and  # pylint: disable=g-bool-id-comparison
      num_shards > 1):
    return False
  # If the dataset satisfies all of the above conditions, activate auto-caching.
  return True
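
For reference, the inputs to this check come from the caller through `tfds.ReadConfig`. A minimal usage sketch, assuming the public `tfds.load` entry point; the dataset name 'mnist' is only a placeholder:

import tensorflow_datasets as tfds

# Explicitly opt out of auto-caching (first check above), e.g. when the
# pipeline already caches after its own preprocessing step.
ds = tfds.load(
    'mnist',  # placeholder dataset name
    split='train',
    shuffle_files=True,
    read_config=tfds.ReadConfig(try_autocache=False),
)

# Disable per-epoch reshuffling: with shuffle_reshuffle_each_iteration=False,
# the multi-shard check above no longer blocks auto-caching.
ds = tfds.load(
    'mnist',
    split='train',
    shuffle_files=True,
    read_config=tfds.ReadConfig(shuffle_reshuffle_each_iteration=False),
)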