in src/datasets/iterable_dataset.py [0:0]
def __iter__(self):
    """Yield ``(key, example)`` pairs from the wrapped ``ex_iterable``.

    A formatter is chosen first: ``PythonFormatter`` when no formatting is
    set or the formatting is a table format (``is_table``), otherwise the
    formatter returned by ``get_formatter`` for ``formatting.format_type``.
    The formatter only receives ``self._features`` when ``ex_iterable`` is
    not already typed; otherwise it receives ``None``.

    Two iteration paths follow:

    * ``ex_iterable.iter_arrow`` is truthy: each table produced by
      ``self._iter_arrow()`` is formatted as a batch and split into
      examples; every example is yielded with the key of its table.
    * otherwise: examples are read one by one from ``ex_iterable``, have
      feature types applied when ``self.features`` is set and the source
      is not already typed, and are passed through the formatter's
      ``recursive_tensorize`` only if the formatter is a ``TensorFormatter``.
    """
    formatting = self.formatting
    # Pick the formatter factory and its leading positional args; the
    # keyword arguments are the same for both factories.
    if formatting and not formatting.is_table:
        build_formatter, leading_args = get_formatter, (formatting.format_type,)
    else:
        build_formatter, leading_args = PythonFormatter, ()
    formatter = build_formatter(
        *leading_args,
        features=None if self.ex_iterable.is_typed else self._features,
        token_per_repo_id=self.token_per_repo_id,
    )

    if self.ex_iterable.iter_arrow:
        # feature casting (inc column addition) handled within self._iter_arrow()
        for key, pa_table in self._iter_arrow():
            formatted_batch = formatter.format_batch(pa_table)
            for example in _batch_to_examples(formatted_batch):
                yield key, example
        return

    # Only tensor formatters post-process individual examples on this path;
    # with any other formatter the example is yielded without a format step.
    tensorize = None
    if isinstance(formatter, TensorFormatter):
        tensorize = formatter.recursive_tensorize

    for key, example in self.ex_iterable:
        # don't apply feature types if already applied by ex_iterable
        # (e.g. in case of chained with_format)
        needs_feature_types = self.features and not self.ex_iterable.is_typed
        if needs_feature_types:
            example = _apply_feature_types_on_example(
                example, self.features, token_per_repo_id=self.token_per_repo_id
            )
        if tensorize:
            example = tensorize(example)
        yield key, example