in ultravox/data/datasets.py [0:0]
def __iter__(self):
    """Yield usable samples from the underlying dataset, dropping unusable rows.

    Each row of ``self._dataset`` is converted with ``self._get_sample``.
    A row is dropped (and counted) instead of being yielded when:

    * ``_get_sample`` returns ``None`` (counted as a bad sample);
    * ``self._args.include_audio`` is set and the sample's audio is ``None``
      or its last dimension has length 0 (counted as a bad sample);
    * ``self._args.include_audio`` is set, ``max_audio_duration_secs`` is
      positive, and the audio is longer than that limit (counted as a
      skipped sample).

    Dropped bad samples are reported through ``logging.warning``. After the
    dataset is exhausted a summary of the counters is logged at INFO level;
    because this is a generator, that summary is not emitted if the consumer
    stops iterating early.

    Yields:
        Every sample that passes the checks above, in dataset order.
    """
    actual_length = 0  # rows read from the dataset, including dropped ones
    skipped_samples = 0  # dropped for exceeding max_audio_duration_secs
    bad_samples = 0  # dropped because the sample or its audio was unusable

    for row in self._dataset:
        actual_length += 1
        sample = self._get_sample(row)
        if sample is None:
            logging.warning(
                f"Sample is None in dataset {self._config.alias} for row {row}"
            )
            bad_samples += 1
            continue

        # Audio validation only applies when audio is requested.
        if self._args.include_audio:
            audio = sample.audio
            if audio is None:
                logging.warning(f"Audio is None for sample {sample}")
                bad_samples += 1
                continue
            if audio.shape[-1] == 0:
                logging.warning(f"Audio length is 0 for sample {sample}")
                bad_samples += 1
                continue

            # A non-positive limit disables the duration filter.
            # NOTE(review): assumes the last audio dimension is the number of
            # samples at data_sample.SAMPLE_RATE -- confirm.
            max_secs = self._args.max_audio_duration_secs
            if max_secs > 0 and audio.shape[-1] / data_sample.SAMPLE_RATE > max_secs:
                skipped_samples += 1
                continue

        yield sample

    logging.info(
        f"Extracted {actual_length} samples from {self.name} (total: {len(self)}), removed {bad_samples} bad samples, and skipped {skipped_samples} samples for exceeding max audio duration ({self._args.max_audio_duration_secs}s)."
    )