def read_from_directory()

in tensorflow_datasets/core/dataset_info.py


  def read_from_directory(self, dataset_info_dir: str) -> None:
    """Update DatasetInfo from the JSON files in `dataset_info_dir`.

    This function updates all the dynamically generated fields (num_examples,
    hash, time of creation, ...) of the DatasetInfo.

    This will overwrite all previous metadata.

    Args:
      dataset_info_dir: `str` The directory containing the metadata file. This
        should be the root directory of a specific dataset version.

    Raises:
      FileNotFoundError: If the dataset_info.json can't be found.
    """
    logging.info("Load dataset info from %s", dataset_info_dir)

    json_filename = self._dataset_info_path(dataset_info_dir)
    if not tf.io.gfile.exists(json_filename):
      raise FileNotFoundError(
          "Tried to load `DatasetInfo` from a directory which does not exist or"
          " does not contain `dataset_info.json`. Please delete the directory "
          f"`{dataset_info_dir}`  if you are trying to re-generate the "
          "dataset.")

    # Load the metadata from disk
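    # (`read_from_json` parses dataset_info.json into a `DatasetInfo` proto
    # message, assigned to `parsed_proto` below.)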
    parsed_proto = read_from_json(json_filename)

    # Update splits
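    # The filename template tells the split readers where the shards live;
    # `file_format` defaults to the empty string when unset (proto3), hence
    # the "tfrecord" fallback.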
    filename_template = naming.ShardedFileTemplate(
        dataset_name=self._builder.name,
        data_dir=self.data_dir,
        filetype_suffix=parsed_proto.file_format or "tfrecord")
    split_dict = splits_lib.SplitDict.from_proto(
        repeated_split_infos=parsed_proto.splits,
        filename_template=filename_template)
    self.set_splits(split_dict)

    # Restore the feature metadata (vocabulary, label names, ...)
    if self.features:
      self.features.load_metadata(dataset_info_dir)
    # For `ReadOnlyBuilder`, reconstruct the features from the config.
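    # (`make_config_path` points at the serialized feature config saved in
    # the dataset directory, typically `features.json`.)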
    elif tf.io.gfile.exists(feature_lib.make_config_path(dataset_info_dir)):
      self._features = feature_lib.FeatureConnector.from_config(
          dataset_info_dir)
    # Restore the MetadataDict from metadata.json, if present.
    if (self.metadata is not None or
        tf.io.gfile.exists(_metadata_filepath(dataset_info_dir))):
      # If the dataset was loaded from file, self.metadata will be `None`, so
      # we create a MetadataDict first.
      if self.metadata is None:
        self._metadata = MetadataDict()
      self.metadata.load_metadata(dataset_info_dir)

    # Update fields which are not defined in the code. Fields already set in
    # the code take precedence over the values restored from
    # dataset_info.json.
    for field_name, field in self.as_proto.DESCRIPTOR.fields_by_name.items():
      field_value = getattr(self._info_proto, field_name)
      field_value_restored = getattr(parsed_proto, field_name)

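      # `HasField` only works for fields that track presence (e.g.
      # submessages); for repeated fields and plain proto3 scalars it raises
      # `ValueError`, so truthiness is used as a presence proxy below.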
      try:
        is_defined = self._info_proto.HasField(field_name)
      except ValueError:
        is_defined = bool(field_value)

      try:
        is_defined_in_restored = parsed_proto.HasField(field_name)
      except ValueError:
        is_defined_in_restored = bool(field_value_restored)

      # If the field is defined in code, we ignore the restored value
      if is_defined:
        if field_value != field_value_restored:
          logging.info(
              "Field info.%s from disk and from code do not match. Keeping "
              "the one from code.", field_name)
        continue
      # If the field is not defined in the JSON file either, we do nothing
      if not is_defined_in_restored:
        continue
      # Otherwise, we restore the dataset_info.json value
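      # Submessage fields cannot be assigned directly on Python protos, so
      # the restored value is merged instead.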
      if field.type == field.TYPE_MESSAGE:
        field_value.MergeFrom(field_value_restored)
      else:
        setattr(self._info_proto, field_name, field_value_restored)

    if self._builder._version != self.version:  # pylint: disable=protected-access
      raise AssertionError(
          "The constructed DatasetInfo instance and the restored proto version "
          "do not match. Builder version: {}. Proto version: {}".format(
              self._builder._version, self.version))  # pylint: disable=protected-access

    # Mark as fully initialized.
    self._fully_initialized = True
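
For context, a minimal usage sketch: in normal use `read_from_directory` is
called internally when a builder restores existing metadata, e.g. through
`tfds.builder_from_directory` (the `ReadOnlyBuilder` path mentioned above).
The dataset name and directory below are hypothetical:

import tensorflow_datasets as tfds

# Point `builder_from_directory` at the versioned dataset directory that
# contains dataset_info.json (hypothetical path).
builder = tfds.builder_from_directory("/data/tfds/mnist/3.0.1")
info = builder.info  # Metadata restored from disk.
print(info.version, info.splits["train"].num_examples)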