in src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py [0:0]
def _split_generators(self, dl_manager):
    if not self.config.data_files:
        raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
    dl_manager.download_config.extract_on_the_fly = True
    # Do an early pass if:
    # * `drop_labels` is None (default) or False, to infer the class labels
    # * `drop_metadata` is None (default) or False, to find the metadata files
    do_analyze = not self.config.drop_labels or not self.config.drop_metadata
    labels, path_depths = set(), set()
    metadata_files = collections.defaultdict(set)
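
    # Local helper: scan the downloaded files (or extracted archives) of one split to collect
    # candidate class labels (parent directory names), their path depths, and any metadata files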
    def analyze(files_or_archives, downloaded_files_or_dirs, split):
        if len(downloaded_files_or_dirs) == 0:
            return
        # The files are separated from the archives at this point, so check the first sample
        # to see if it's a file or a directory and iterate accordingly
        if os.path.isfile(downloaded_files_or_dirs[0]):
            original_files, downloaded_files = files_or_archives, downloaded_files_or_dirs
            for original_file, downloaded_file in zip(original_files, downloaded_files):
                original_file, downloaded_file = str(original_file), str(downloaded_file)
                _, original_file_ext = os.path.splitext(original_file)
                if original_file_ext.lower() in self.EXTENSIONS:
                    if not self.config.drop_labels:
                        labels.add(os.path.basename(os.path.dirname(original_file)))
                        path_depths.add(count_path_segments(original_file))
                elif os.path.basename(original_file) in self.METADATA_FILENAMES:
                    metadata_files[split].add((original_file, downloaded_file))
                else:
                    original_file_name = os.path.basename(original_file)
                    logger.debug(
                        f"The file '{original_file_name}' was ignored: it is not a {self.BASE_COLUMN_NAME}, and is not {self.METADATA_FILENAMES} either."
                    )
        else:
            archives, downloaded_dirs = files_or_archives, downloaded_files_or_dirs
            for archive, downloaded_dir in zip(archives, downloaded_dirs):
                archive, downloaded_dir = str(archive), str(downloaded_dir)
                for downloaded_dir_file in dl_manager.iter_files(downloaded_dir):
                    _, downloaded_dir_file_ext = os.path.splitext(downloaded_dir_file)
                    if downloaded_dir_file_ext.lower() in self.EXTENSIONS:
                        if not self.config.drop_labels:
                            labels.add(os.path.basename(os.path.dirname(downloaded_dir_file)))
                            path_depths.add(count_path_segments(downloaded_dir_file))
                    elif os.path.basename(downloaded_dir_file) in self.METADATA_FILENAMES:
                        metadata_files[split].add((None, downloaded_dir_file))
                    else:
                        archive_file_name = os.path.basename(archive)
                        original_file_name = os.path.basename(downloaded_dir_file)
                        logger.debug(
                            f"The file '{original_file_name}' from the archive '{archive_file_name}' was ignored: it is not a {self.BASE_COLUMN_NAME}, and is not {self.METADATA_FILENAMES} either."
                        )
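
    # Download the data files and archives of each split, optionally scan them for labels and
    # metadata files, and build the corresponding SplitGenerator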
    data_files = self.config.data_files
    splits = []
    for split_name, files in data_files.items():
        if isinstance(files, str):
            files = [files]
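        # Separate loose data files from archives; archives are downloaded and extracted separately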
        files, archives = self._split_files_and_archives(files)
        downloaded_files = dl_manager.download(files)
        downloaded_dirs = dl_manager.download_and_extract(archives)
        if do_analyze:  # drop_metadata is None or False, drop_labels is None or False
            logger.info(f"Searching for labels and/or metadata files in {split_name} data files...")
            analyze(files, downloaded_files, split_name)
            analyze(archives, downloaded_dirs, split_name)
            if metadata_files:
                # add metadata if `metadata_files` are found and `drop_metadata` is None (default) or False
                add_metadata = not self.config.drop_metadata
                # if `metadata_files` are found, don't add labels
                add_labels = False
            else:
                # if `metadata_files` are not found, don't add metadata
                add_metadata = False
                # if `metadata_files` are not found and `drop_labels` is None (default) -
                # add labels if files are on the same level in directory hierarchy and there is more than one label
                add_labels = (
                    (len(labels) > 1 and len(path_depths) == 1)
                    if self.config.drop_labels is None
                    else not self.config.drop_labels
                )
            if add_labels:
                logger.info("Adding the labels inferred from data directories to the dataset's features...")
            if add_metadata:
                logger.info("Adding metadata to the dataset...")
        else:
            add_labels, add_metadata, metadata_files = False, False, {}
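        # "files" pairs each original file with its downloaded copy; every extracted archive
        # contributes a (None, file iterator) pair instead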
        splits.append(
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    "files": tuple(zip(files, downloaded_files))
                    + tuple((None, dl_manager.iter_files(downloaded_dir)) for downloaded_dir in downloaded_dirs),
                    "metadata_files": metadata_files.get(split_name, []),
                    "add_labels": add_labels,
                    "add_metadata": add_metadata,
                },
            )
        )

    if add_metadata:
        # Verify that:
        # * all metadata files have the same set of features in each split
        # * the `file_name` key is one of the metadata keys and is of type string
        features_per_metadata_file: list[tuple[str, datasets.Features]] = []

        # Check that all metadata files share the same format
        metadata_ext = {
            os.path.splitext(original_metadata_file or downloaded_metadata_file)[-1]
            for original_metadata_file, downloaded_metadata_file in itertools.chain.from_iterable(
                metadata_files.values()
            )
        }
        if len(metadata_ext) > 1:
            raise ValueError(f"Found metadata files with different extensions: {list(metadata_ext)}")
        metadata_ext = metadata_ext.pop()
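
        # Probe one metadata file per split: read only its first rows to infer the features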
        for split_metadata_files in metadata_files.values():
            pa_metadata_table = None
            for _, downloaded_metadata_file in split_metadata_files:
                for pa_metadata_table in self._read_metadata(downloaded_metadata_file, metadata_ext=metadata_ext):
                    break  # just fetch the first rows
                if pa_metadata_table is not None:
                    features_per_metadata_file.append(
                        (downloaded_metadata_file, datasets.Features.from_arrow_schema(pa_metadata_table.schema))
                    )
                    break  # no need to fetch all the files
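        # Every probed metadata file must expose the same features as the first one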
        for downloaded_metadata_file, metadata_features in features_per_metadata_file:
            if metadata_features != features_per_metadata_file[0][1]:
                raise ValueError(
                    f"Metadata files {downloaded_metadata_file} and {features_per_metadata_file[0][0]} have different features: {features_per_metadata_file[0][1]} != {metadata_features}"
                )
        metadata_features = features_per_metadata_file[0][1]

        feature_not_found = True
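
        # Rewrite `file_name`/`*_file_name` string columns (and their `*_file_names` list
        # counterparts) into columns of the builder's base feature, renaming them accordingly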
        def _set_feature(feature):
            nonlocal feature_not_found
            if isinstance(feature, dict):
                out = type(feature)()
                for key in feature:
                    if (key == "file_name" or key.endswith("_file_name")) and feature[key] == datasets.Value(
                        "string"
                    ):
                        key = key[: -len("_file_name")] or self.BASE_COLUMN_NAME
                        out[key] = self.BASE_FEATURE()
                        feature_not_found = False
                    elif (key == "file_names" or key.endswith("_file_names")) and feature[key] == datasets.List(
                        datasets.Value("string")
                    ):
                        key = key[: -len("_file_names")] or (self.BASE_COLUMN_NAME + "s")
                        out[key] = datasets.List(self.BASE_FEATURE())
                        feature_not_found = False
                    elif (key == "file_names" or key.endswith("_file_names")) and feature[key] == [
                        datasets.Value("string")
                    ]:
                        key = key[: -len("_file_names")] or (self.BASE_COLUMN_NAME + "s")
                        out[key] = [self.BASE_FEATURE()]
                        feature_not_found = False
                    else:
                        out[key] = feature[key]
                return out
            return feature
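
        # Apply the rewrite recursively over the nested feature structure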
        metadata_features = _visit(metadata_features, _set_feature)

        if feature_not_found:
            raise ValueError(
                "`file_name` or `*_file_name` must be present as dictionary key (with type string) in metadata files"
            )
    else:
        metadata_features = None

    # Normally, we would do this in _info, but we need to know the labels and/or metadata
    # before building the features
    if self.config.features is None:
        if add_metadata:
            self.info.features = metadata_features
        elif add_labels:
            self.info.features = datasets.Features(
                {
                    self.BASE_COLUMN_NAME: self.BASE_FEATURE(),
                    "label": datasets.ClassLabel(names=sorted(labels)),
                }
            )
        else:
            self.info.features = datasets.Features({self.BASE_COLUMN_NAME: self.BASE_FEATURE()})

    return splits
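
# Illustrative sketch (an assumption, not part of this module): a minimal subclass showing how the
# class attributes consumed by `_split_generators` are expected to be wired up. The names and
# values below are hypothetical and only document the contract; they do not reproduce the shipped
# ImageFolder/AudioFolder builders.
#
#     class ExampleImageFolder(FolderBasedBuilder):
#         BASE_FEATURE = datasets.Image              # feature type assigned to each data file
#         BASE_COLUMN_NAME = "image"                 # name of the data column in the features
#         EXTENSIONS = [".jpg", ".jpeg", ".png"]     # lowercase extensions kept by analyze()
#         METADATA_FILENAMES = ["metadata.csv", "metadata.jsonl"]  # files treated as metadata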