in tensorflow_datasets/core/dataset_info.py [0:0]
def read_from_directory(self, dataset_info_dir: str) -> None:
  """Update DatasetInfo from the JSON files in `dataset_info_dir`.

  This function updates all the dynamically generated fields (num_examples,
  hash, time of creation,...) of the DatasetInfo.

  This will overwrite all previous metadata.

  Args:
    dataset_info_dir: `str` The directory containing the metadata file. This
      should be the root directory of a specific dataset version.

  Raises:
    FileNotFoundError: If the dataset_info.json can't be found.
  """
  logging.info("Load dataset info from %s", dataset_info_dir)

  json_filename = self._dataset_info_path(dataset_info_dir)
  if not tf.io.gfile.exists(json_filename):
    raise FileNotFoundError(
        "Tried to load `DatasetInfo` from a directory which does not exist or"
        " does not contain `dataset_info.json`. Please delete the directory "
        f"`{dataset_info_dir}` if you are trying to re-generate the "
        "dataset.")

  # Load the metadata from disk.
  parsed_proto = read_from_json(json_filename)

  # Update splits.
  filename_template = naming.ShardedFileTemplate(
      dataset_name=self._builder.name,
      data_dir=self.data_dir,
      filetype_suffix=parsed_proto.file_format or "tfrecord")
  split_dict = splits_lib.SplitDict.from_proto(
      repeated_split_infos=parsed_proto.splits,
      filename_template=filename_template)
  self.set_splits(split_dict)

  # Restore the feature metadata (vocabulary, label names,...).
  if self.features:
    self.features.load_metadata(dataset_info_dir)
  # For `ReadOnlyBuilder`, reconstruct the features from the config.
  elif tf.io.gfile.exists(feature_lib.make_config_path(dataset_info_dir)):
    self._features = feature_lib.FeatureConnector.from_config(
        dataset_info_dir)

  # Restore the MetadataDict from metadata.json if there is any.
  if (self.metadata is not None or
      tf.io.gfile.exists(_metadata_filepath(dataset_info_dir))):
    # If the dataset was loaded from file, self.metadata will be `None`, so
    # we create a MetadataDict first.
    if self.metadata is None:
      self._metadata = MetadataDict()
    self.metadata.load_metadata(dataset_info_dir)

  # Update fields which are not defined in the code. This means that values
  # defined in the code take precedence over the ones present in
  # dataset_info.json.
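  # Illustrative example of this precedence (the values below are
  # hypothetical, not taken from any real dataset_info.json):
  #   * `description` set in the builder code and also present on disk with a
  #     different value -> the code value is kept and a mismatch is logged.
  #   * `citation` left empty in the code but present on disk -> the value
  #     from dataset_info.json is restored.
  #   * a message-typed field present only on disk -> it is merged into the
  #     in-memory proto via `MergeFrom`.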
  for field_name, field in self.as_proto.DESCRIPTOR.fields_by_name.items():
    field_value = getattr(self._info_proto, field_name)
    field_value_restored = getattr(parsed_proto, field_name)

    try:
      is_defined = self._info_proto.HasField(field_name)
    except ValueError:
      is_defined = bool(field_value)

    try:
      is_defined_in_restored = parsed_proto.HasField(field_name)
    except ValueError:
      is_defined_in_restored = bool(field_value_restored)

    # If field is defined in code, we ignore the value
    if is_defined:
      if field_value != field_value_restored:
        logging.info(
            "Field info.%s from disk and from code do not match. Keeping "
            "the one from code.", field_name)
      continue

    # If the field is also not defined in JSON file, we do nothing
    if not is_defined_in_restored:
      continue

    # Otherwise, we restore the dataset_info.json value
    if field.type == field.TYPE_MESSAGE:
      field_value.MergeFrom(field_value_restored)
    else:
      setattr(self._info_proto, field_name, field_value_restored)

  if self._builder._version != self.version:  # pylint: disable=protected-access
    raise AssertionError(
        "The constructed DatasetInfo instance and the restored proto version "
        "do not match. Builder version: {}. Proto version: {}".format(
            self._builder._version, self.version))  # pylint: disable=protected-access

  # Mark as fully initialized.
  self._fully_initialized = True
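
# Usage sketch (illustrative, not part of the original file). It assumes a
# dataset that was already generated under a version directory such as
# "/data/mnist/3.0.1"; the path and dataset name are hypothetical, while
# `tfds.builder_from_directory`, `builder.info` and `read_from_directory`
# are existing TFDS APIs. Whether a given entry point reaches this method
# depends on the builder type, so treat this as a sketch of typical usage:
#
#   import tensorflow_datasets as tfds
#
#   # Restore a builder (and its DatasetInfo) purely from files on disk.
#   builder = tfds.builder_from_directory("/data/mnist/3.0.1")
#   print(builder.info.splits["train"].num_examples)
#
#   # Re-read the metadata explicitly from the version directory.
#   builder.info.read_from_directory("/data/mnist/3.0.1")
#
# Pointing `read_from_directory` at a directory that has no
# `dataset_info.json` raises FileNotFoundError, matching the check at the
# top of the method.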