configs/template_config.yaml

wandb:
  entity: ???

experiment:
  name: ???
  project: ???
  output_dir: ???
  max_train_examples: ???
  save_every: 1000
  eval_every: 500
  generate_every: 1000
  log_every: 50
  log_grad_norm_every: 100
  resume_from_checkpoint: latest

model:
  vq_model:
    pretrained: "openMUSE/maskgit-vqgan-imagenet-f16-256"

  transformer:
    vocab_size: 2048 # (1024 + 1000 + 1 = 2025 -> Vq + Imagenet + <mask>, use 2048 for even division by 8)
    max_position_embeddings: 264 # (256 + 1 for class id, use 264 for even division by 8)
    hidden_size: 768
    num_hidden_layers: 12
    num_attention_heads: 12
    intermediate_size: 3072
    codebook_size: 1024
    num_vq_tokens: 256
    num_classes: 1000
    initializer_range: 0.02
    norm_type: "layernorm"
    layer_norm_eps: 1e-6
    use_bias: False
    use_normformer: True
    use_mlm_layer: True
    use_encoder_layernorm: True
    use_mlm_layernorm: True
    hidden_dropout: 0.0
    attention_dropout: 0.0

  gradient_checkpointing: True
  enable_xformers_memory_efficient_attention: False

dataset:
  params:
    train_shards_path_or_url: ???
    eval_shards_path_or_url: ???
    batch_size: ${training.batch_size}
    shuffle_buffer_size: ???
    num_workers: ???
    resolution: 256
    pin_memory: True
    persistent_workers: True
  preprocessing:
    resolution: 256
    center_crop: True
    random_flip: False

optimizer:
  name: adamw # Can be adamw or lion or fused_adamw. Install apex for fused_adamw
  params: # default adamw params
    learning_rate: ???
    scale_lr: False # scale learning rate by total batch size
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.01
    epsilon: 1e-8

lr_scheduler:
  scheduler: "constant_with_warmup"
  params:
    learning_rate: ${optimizer.params.learning_rate}
    warmup_steps: 500

training:
  gradient_accumulation_steps: 1
  batch_size: 128
  mixed_precision: "no"
  enable_tf32: True
  use_ema: False
  seed: 42
  max_train_steps: ???
  overfit_one_batch: False
  min_masking_rate: 0.0
  label_smoothing: 0.0
  max_grad_norm: null
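The `???` placeholders and `${...}` references follow OmegaConf's missing-value and interpolation syntax, so the training script presumably fills the required fields at launch time. Below is a minimal sketch of loading this template and supplying those fields from command-line overrides; the script name and the override values shown (team name, run name, learning rate) are hypothetical examples, not values taken from this repository.

```python
# Minimal sketch, assuming the training script consumes this file via OmegaConf
# (the ??? placeholders and ${...} references are OmegaConf conventions).
from omegaconf import OmegaConf

# Load the template; every ??? entry is a mandatory value that must be supplied
# before it is read, otherwise OmegaConf raises MissingMandatoryValue.
config = OmegaConf.load("configs/template_config.yaml")

# Fill the mandatory fields from dotlist-style CLI overrides, e.g. (hypothetical):
#   python train.py wandb.entity=my-team experiment.name=my-run \
#       experiment.project=my-project optimizer.params.learning_rate=1e-4 ...
cli_overrides = OmegaConf.from_cli()
config = OmegaConf.merge(config, cli_overrides)

# Interpolations such as ${training.batch_size} resolve when accessed:
print(config.dataset.params.batch_size)  # 128 unless training.batch_size was overridden
```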