in src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py [0:0]
def _generate_examples(self, files, metadata_files, add_metadata, add_labels):
    """Yield ``(sample_idx, sample)`` pairs, numbering samples from 0.

    Two mutually exclusive modes are selected by ``add_metadata``:

    * Metadata mode: every row of every metadata table returned by
      ``self._read_metadata`` becomes one sample. Columns that correspond
      to a ``self.BASE_FEATURE`` in ``self.info.features`` are filled from
      the ``file_name`` / ``file_names`` (or ``<column>_file_name(s)``)
      columns, and the relative paths are resolved against the directory
      of the downloaded metadata file.
    * Plain mode: every file whose extension is in ``self.EXTENSIONS``
      becomes one sample ``{self.BASE_COLUMN_NAME: path}``, optionally with
      a ``"label"`` taken from the name of the file's parent directory, and
      optionally filtered with ``self.config.filters``.

    Args:
        files: iterable of ``(original_file, downloaded_file_or_dir)``.
            When ``original_file`` is falsy the second element is iterated
            as a collection of downloaded files. Only read in plain mode.
        metadata_files: iterable of
            ``(original_metadata_file, downloaded_metadata_file)``.
            Only read in metadata mode.
        add_metadata: selects metadata mode when truthy.
        add_labels: in plain mode, adds the ``"label"`` key when truthy.

    Yields:
        ``(sample_idx, sample)`` where ``sample`` is a dict.
    """
    sample_idx = 0
    if add_metadata:
        # Paths (inside the features structure) of every BASE_FEATURE; the
        # empty/root path is skipped by the truthiness check below.
        feature_paths = []

        def find_feature_path(feature, feature_path):
            # The list is mutated in place, so no `nonlocal` is required.
            if feature_path and isinstance(feature, self.BASE_FEATURE):
                feature_paths.append(feature_path)

        _visit_with_path(self.info.features, find_feature_path)

        for original_metadata_file, downloaded_metadata_file in metadata_files:
            metadata_ext = os.path.splitext(original_metadata_file or downloaded_metadata_file)[-1]
            downloaded_metadata_dir = os.path.dirname(downloaded_metadata_file)

            # Defined per metadata file because it closes over
            # `downloaded_metadata_dir`; it is only used within this iteration.
            def set_feature(item, feature_path: _VisitPath):
                if len(feature_path) == 2 and isinstance(feature_path[0], str) and feature_path[1] == 0:
                    # Path of the form (column, 0): take the plural
                    # "file_names" column, else "<column>_file_names".
                    item[feature_path[0]] = item.pop("file_names", None) or item.pop(
                        feature_path[0] + "_file_names", None
                    )
                elif len(feature_path) == 1 and isinstance(feature_path[0], str):
                    # Path of the form (column,): take "file_name",
                    # else "<column>_file_name".
                    item[feature_path[0]] = item.pop("file_name", None) or item.pop(
                        feature_path[0] + "_file_name", None
                    )
                elif len(feature_path) == 0:
                    # Empty path: `item` is the relative file path itself.
                    # NOTE(review): presumably `_nested_apply` reaches this case
                    # after consuming the path - confirm against its definition.
                    file_relpath = os.path.normpath(item).replace("\\", "/")
                    item = os.path.join(downloaded_metadata_dir, file_relpath)
                return item

            for pa_metadata_table in self._read_metadata(downloaded_metadata_file, metadata_ext=metadata_ext):
                for sample in pa_metadata_table.to_pylist():
                    for path in feature_paths:
                        _nested_apply(sample, path, set_feature)
                    yield sample_idx, sample
                    sample_idx += 1
    else:
        # Resolve the filter expression once; None means "no filtering".
        filters = self.config.filters
        filter_expr = None
        if filters is not None:
            filter_expr = pq.filters_to_expression(filters) if isinstance(filters, list) else filters

        for original_file, downloaded_file_or_dir in files:
            # With an original file name there is a single downloaded file;
            # otherwise the second element is already a collection of files.
            downloaded_files = [downloaded_file_or_dir] if original_file else downloaded_file_or_dir
            for downloaded_file in downloaded_files:
                original_file_ext = os.path.splitext(original_file or downloaded_file)[-1]
                if original_file_ext.lower() not in self.EXTENSIONS:
                    continue
                sample = {self.BASE_COLUMN_NAME: downloaded_file}
                if add_labels:
                    # The label is the name of the directory containing the file.
                    sample["label"] = os.path.basename(os.path.dirname(original_file or downloaded_file))
                if filter_expr is not None:
                    # Evaluate the filter on a one-row table; drop the sample
                    # when the row does not survive.
                    pa_table = pa.Table.from_pylist([sample]).filter(filter_expr)
                    if len(pa_table) == 0:
                        continue
                yield sample_idx, sample
                sample_idx += 1