# Excerpt from tensorflow_datasets/core/dataset_builder.py
def download_and_prepare(self, *, download_dir=None, download_config=None):
  """Downloads and prepares dataset for reading.

  Idempotent for already-generated data: with the default
  `REUSE_DATASET_IF_EXISTS` mode, an existing dataset directory is reused
  as-is and nothing is downloaded.

  Args:
    download_dir: `str`, directory where downloaded files are stored. Defaults
      to "~/tensorflow-datasets/downloads".
    download_config: `tfds.download.DownloadConfig`, further configuration for
      downloading and preparing dataset.

  Raises:
    IOError: if there is not enough disk space available.
    AssertionError: if the requested version can only be generated with an
      older version of the TFDS code (`tfds_version_to_prepare` is set).
    ValueError: if the requested version is not installable with the current
      code, or if generation would overwrite an existing dataset directory.
  """
  download_config = download_config or download.DownloadConfig()
  data_exists = tf.io.gfile.exists(self._data_dir)
  if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
    logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
    return
  elif data_exists and download_config.download_mode == REUSE_CACHE_IF_EXISTS:
    logging.info("Deleting pre-existing dataset %s (%s)", self.name,
                 self._data_dir)
    utils.as_path(self._data_dir).rmtree()  # Delete pre-existing data.
    # Re-probe so the overwrite check below sees the directory is gone.
    data_exists = tf.io.gfile.exists(self._data_dir)

  # Versions marked with `tfds_version_to_prepare` can only be generated by
  # an older checkout of the TFDS codebase; refuse and point the user at the
  # versions this checkout *can* prepare.
  if self.version.tfds_version_to_prepare:
    available_to_prepare = ", ".join(
        str(v) for v in self.versions if not v.tfds_version_to_prepare)
    raise AssertionError(
        "The version of the dataset you are trying to use ({}:{}) can only "
        "be generated using TFDS code synced @ {} or earlier. Either sync to "
        "that version of TFDS to first prepare the data or use another "
        "version of the dataset (available for `download_and_prepare`: "
        "{}).".format(self.name, self.version,
                      self.version.tfds_version_to_prepare,
                      available_to_prepare))

  # Only `cls.VERSION` or `experimental_latest` versions can be generated.
  # Otherwise, users may accidentally generate an old version using the
  # code from newer versions.
  installable_versions = {
      str(v) for v in (self.canonical_version, max(self.versions))
  }
  if str(self.version) not in installable_versions:
    msg = ("The version of the dataset you are trying to use ({}) is too "
           "old for this version of TFDS so cannot be generated.").format(
               self.info.full_name)
    # NOTE: `self.version.tfds_version_to_prepare` is necessarily falsy at
    # this point (the AssertionError above would have fired otherwise), so
    # the former "synced @ {version}" hint branch was unreachable dead code
    # and has been removed.
    msg += (
        "Either sync to a previous version of TFDS to first prepare the "
        "data or use another version of the dataset. ")
    msg += "Available for `download_and_prepare`: {}".format(
        sorted(installable_versions))
    raise ValueError(msg)

  # Currently it's not possible to overwrite the data because it would
  # conflict with versioning: If the last version has already been generated,
  # it will always be reloaded and data_dir will be set at construction.
  if data_exists:
    raise ValueError(
        "Trying to overwrite an existing dataset {} at {}. A dataset with "
        "the same version {} already exists. If the dataset has changed, "
        "please update the version number.".format(self.name, self._data_dir,
                                                   self.version))

  logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
  # Fail fast, before downloading anything, if the disk cannot hold both the
  # downloaded archives and the generated dataset.
  if not utils.has_sufficient_disk_space(
      self.info.dataset_size + self.info.download_size,
      directory=self._data_dir_root):
    raise IOError(
        "Not enough disk space. Needed: {} (download: {}, generated: {})"
        .format(
            self.info.dataset_size + self.info.download_size,
            self.info.download_size,
            self.info.dataset_size,
        ))
  self._log_download_bytes()

  dl_manager = self._make_download_manager(
      download_dir=download_dir,
      download_config=download_config,
  )

  # Maybe save the `builder_cls` metadata common to all builder configs.
  if self.BUILDER_CONFIGS:
    _save_default_config_name(
        # `data_dir/ds_name/config/version/` -> `data_dir/ds_name/`
        common_dir=self.data_path.parent.parent,
        default_config_name=self.BUILDER_CONFIGS[0].name,
    )

  # Create a tmp dir and rename to self._data_dir on successful exit.
  with utils.incomplete_dir(self._data_dir) as tmp_data_dir:
    # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
    # it to every sub function.
    with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
      if (download_config.try_download_gcs and
          gcs_utils.is_dataset_on_gcs(self.info.full_name)):
        # A pre-generated copy is hosted on GCS: fetch it instead of
        # generating locally, and load its DatasetInfo from disk.
        logging.info(GCS_HOSTED_MSG, self.name)
        gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
        self.info.read_from_directory(self._data_dir)
      else:
        # Old version of TF are not os.PathLike compatible
        with tf_compat.mock_gfile_pathlike():
          self._download_and_prepare(
              dl_manager=dl_manager,
              download_config=download_config,
          )

        # NOTE: If modifying the lines below to put additional information in
        # DatasetInfo, you'll likely also want to update
        # DatasetInfo.read_from_directory to possibly restore these attributes
        # when reading from package data.
        self.info.download_size = dl_manager.downloaded_size
        # Write DatasetInfo to disk, even if we haven't computed statistics.
        self.info.write_to_directory(self._data_dir)
  # The generated DatasetInfo contains references to `tmp_data_dir`
  self.info.update_data_dir(self._data_dir)
  self._log_download_done()