in ultravox/tools/ds_tool/ds_tool.py [0:0]
def _process(self, ds_chunk: datasets.Dataset) -> datasets.Dataset:
ds_mapped = self.args.task.map_split(
ds_chunk,
self.args.num_workers,
self.args.writer_batch_size,
self.args.exclude_fields,
)
check_empty_columns = self.args.check_empty_columns
if len(check_empty_columns) > 0:
return ds_mapped.filter(
lambda sample: all(
sample[column] is not None for column in check_empty_columns
),
num_proc=self.args.num_workers,
writer_batch_size=self.args.writer_batch_size,
)
else:
return ds_mapped