in services/worker/src/worker/job_runners/config/parquet_and_info.py
def get_writer_batch_size_from_info(ds_config_info: datasets.info.DatasetInfo) -> Optional[int]:
"""
Get the writer_batch_size that defines the maximum row group size in the parquet files.
The default in `datasets` is 1,000, but we lower it for video, audio, image, PDF and binary datasets.
This optimizes random access to the parquet files, since accessing one row requires reading its
entire row group.
Args:
ds_config_info (`datasets.info.DatasetInfo`):
Dataset info from `datasets`.
Returns:
`Optional[int]`:
Writer batch size to pass to a dataset builder.
If `None`, the builder will use the `datasets` default.
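Example:
    An illustrative call (the `DatasetInfo` below is built ad hoc; in the worker it
    comes from the dataset builder):

    >>> info = datasets.DatasetInfo(features=datasets.Features({"audio": datasets.Audio()}))
    >>> get_writer_batch_size_from_info(info) == PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS
    True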
"""
if ds_config_info.builder_name == "videofolder" or "Video(" in str(ds_config_info.features):
return PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS
elif ds_config_info.builder_name == "audiofolder" or "Audio(" in str(ds_config_info.features):
return PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS
elif ds_config_info.builder_name == "imagefolder" or "Image(" in str(ds_config_info.features):
return PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS
elif ds_config_info.builder_name == "pdffolder" or "Pdf(" in str(ds_config_info.features):
return PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_PDF_DATASETS
elif "'binary'" in str(ds_config_info.features):
return PROCESSING_STEP_CONFIG_PARQUET_AND_INFO_ROW_GROUP_SIZE_FOR_BINARY_DATASETS
else:
return None
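
# Illustrative caller sketch (not part of this module; the dataset name and the use of the
# internal `_writer_batch_size` attribute of `datasets.GeneratorBasedBuilder` are assumptions
# made for the example): apply the returned value before writing parquet shards, and fall back
# to the `datasets` default when the function returns None.
#
#     builder = datasets.load_dataset_builder("user/some_image_dataset")  # hypothetical dataset
#     writer_batch_size = get_writer_batch_size_from_info(builder.info)
#     if writer_batch_size is not None and isinstance(builder, datasets.GeneratorBasedBuilder):
#         builder._writer_batch_size = writer_batch_size  # lower the parquet row group size
#     builder.download_and_prepare(file_format="parquet")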