# Excerpt: method `DatasetBuilder.download_and_prepare`
# from tensorflow_datasets/core/dataset_builder.py


  def download_and_prepare(self, *, download_dir=None, download_config=None):
    """Downloads and prepares dataset for reading.

    Args:
      download_dir: `str`, directory where downloaded files are stored. Defaults
        to "~/tensorflow-datasets/downloads".
      download_config: `tfds.download.DownloadConfig`, further configuration for
        downloading and preparing dataset.

    Raises:
      IOError: if there is not enough disk space available.
      AssertionError: if this dataset version can only be generated by an
        older version of the TFDS code.
      ValueError: if the requested version cannot be generated by this TFDS
        version, or if the data already exists and would be overwritten.
    """

    download_config = download_config or download.DownloadConfig()
    data_exists = tf.io.gfile.exists(self._data_dir)
    if data_exists and download_config.download_mode == REUSE_DATASET_IF_EXISTS:
      logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
      return
    elif data_exists and download_config.download_mode == REUSE_CACHE_IF_EXISTS:
      logging.info("Deleting pre-existing dataset %s (%s)", self.name,
                   self._data_dir)
      utils.as_path(self._data_dir).rmtree()  # Delete pre-existing data.
      # Re-check so the overwrite guard below sees the post-deletion state.
      data_exists = tf.io.gfile.exists(self._data_dir)

    if self.version.tfds_version_to_prepare:
      available_to_prepare = ", ".join(
          str(v) for v in self.versions if not v.tfds_version_to_prepare)
      raise AssertionError(
          "The version of the dataset you are trying to use ({}:{}) can only "
          "be generated using TFDS code synced @ {} or earlier. Either sync to "
          "that version of TFDS to first prepare the data or use another "
          "version of the dataset (available for `download_and_prepare`: "
          "{}).".format(self.name, self.version,
                        self.version.tfds_version_to_prepare,
                        available_to_prepare))

    # Only `cls.VERSION` or `experimental_latest` versions can be generated.
    # Otherwise, users may accidentally generate an old version using the
    # code from newer versions.
    installable_versions = {
        str(v) for v in (self.canonical_version, max(self.versions))
    }
    if str(self.version) not in installable_versions:
      # NOTE: `self.version.tfds_version_to_prepare` is necessarily falsy here
      # (the AssertionError above would otherwise have been raised), so the
      # former branch re-testing it was unreachable and has been removed.
      msg = ("The version of the dataset you are trying to use ({}) is too "
             "old for this version of TFDS so cannot be generated.").format(
                 self.info.full_name)
      msg += (
          "Either sync to a previous version of TFDS to first prepare the "
          "data or use another version of the dataset. ")
      msg += "Available for `download_and_prepare`: {}".format(
          sorted(installable_versions))
      raise ValueError(msg)

    # Currently it's not possible to overwrite the data because it would
    # conflict with versioning: If the last version has already been generated,
    # it will always be reloaded and data_dir will be set at construction.
    if data_exists:
      raise ValueError(
          "Trying to overwrite an existing dataset {} at {}. A dataset with "
          "the same version {} already exists. If the dataset has changed, "
          "please update the version number.".format(self.name, self._data_dir,
                                                     self.version))

    logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
    # Fail early if the disk cannot hold both the raw downloads and the
    # generated dataset.
    if not utils.has_sufficient_disk_space(
        self.info.dataset_size + self.info.download_size,
        directory=self._data_dir_root):
      raise IOError(
          "Not enough disk space. Needed: {} (download: {}, generated: {})"
          .format(
              self.info.dataset_size + self.info.download_size,
              self.info.download_size,
              self.info.dataset_size,
          ))
    self._log_download_bytes()

    dl_manager = self._make_download_manager(
        download_dir=download_dir,
        download_config=download_config,
    )

    # Maybe save the `builder_cls` metadata common to all builder configs.
    if self.BUILDER_CONFIGS:
      _save_default_config_name(
          # `data_dir/ds_name/config/version/` -> `data_dir/ds_name/`
          common_dir=self.data_path.parent.parent,
          default_config_name=self.BUILDER_CONFIGS[0].name,
      )

    # Create a tmp dir and rename to self._data_dir on successful exit.
    with utils.incomplete_dir(self._data_dir) as tmp_data_dir:
      # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
      # it to every sub function.
      with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
        if (download_config.try_download_gcs and
            gcs_utils.is_dataset_on_gcs(self.info.full_name)):
          # Prefer fetching the pre-generated dataset from GCS over
          # regenerating it locally.
          logging.info(GCS_HOSTED_MSG, self.name)
          gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
          self.info.read_from_directory(self._data_dir)
        else:
          # Old version of TF are not os.PathLike compatible
          with tf_compat.mock_gfile_pathlike():
            self._download_and_prepare(
                dl_manager=dl_manager,
                download_config=download_config,
            )

          # NOTE: If modifying the lines below to put additional information in
          # DatasetInfo, you'll likely also want to update
          # DatasetInfo.read_from_directory to possibly restore these attributes
          # when reading from package data.
          self.info.download_size = dl_manager.downloaded_size
          # Write DatasetInfo to disk, even if we haven't computed statistics.
          self.info.write_to_directory(self._data_dir)
      # The generated DatasetInfo contains references to `tmp_data_dir`
      self.info.update_data_dir(self._data_dir)
    self._log_download_done()