in training/data.py [0:0]
def __init__(
self,
train_shards_path_or_url: Union[str, List[str]],
eval_shards_path_or_url: Union[str, List[str]],
tokenizer: PreTrainedTokenizer,
max_seq_length: int,
num_train_examples: int,
per_gpu_batch_size: int,
global_batch_size: int,
num_workers: int,
resolution: int = 256,
center_crop: bool = True,
random_flip: bool = False,
shuffle_buffer_size: int = 1000,
pin_memory: bool = False,
persistent_workers: bool = False,
is_pre_encoded: bool = False,
vae_checkpoint: Optional[str] = None,
text_encoder_checkpoint: Optional[str] = None,
use_filtered_dataset: bool = False,
require_marked_as_ok_by_spawning: bool = False,
require_marked_as_not_getty: bool = False,
max_pnsfw: Optional[float] = None,
max_pwatermark: Optional[float] = 0.5,
min_aesthetic_score: Optional[float] = 4.75,
min_size: Optional[int] = 256,
is_sdxl_synthetic_dataset: bool = False,
is_ds_clean_upscaled: bool = False,
is_ds_clean: bool = False,