configs/template_config.yaml (78 lines of code) (raw):
# Weights & Biases logging settings.
# `???` is the OmegaConf marker for a mandatory value that has to be supplied
# before the config is used (assumes this file is loaded through OmegaConf,
# as the `${...}` interpolations elsewhere in the file suggest).
wandb:
  entity: ???
# Experiment bookkeeping: run identity, output location and how often the
# periodic actions (save / eval / generate / log) fire.
# Every `???` must be filled in by the user (OmegaConf mandatory-value marker).
experiment:
  name: ???
  project: ???
  output_dir: ???
  max_train_examples: ???
  # NOTE(review): the *_every intervals are presumably counted in training
  # steps -- confirm against the training script.
  save_every: 1000
  eval_every: 500
  generate_every: 1000
  log_every: 50
  log_grad_norm_every: 100
  # NOTE(review): "latest" presumably selects the most recent checkpoint found
  # in output_dir -- verify how the trainer interprets this value.
  resume_from_checkpoint: latest
# Model definition: the pretrained VQ tokenizer plus the transformer that is
# trained on its tokens.
model:
  vq_model:
    pretrained: "openMUSE/maskgit-vqgan-imagenet-f16-256"

  transformer:
    # 1024 (VQ codebook) + 1000 (ImageNet classes) + 1 (<mask>) = 2025;
    # use 2048 for even division by 8.
    vocab_size: 2048
    # 256 (VQ tokens) + 1 (class id) = 257; use 264 for even division by 8.
    max_position_embeddings: 264
    hidden_size: 768
    num_hidden_layers: 12
    num_attention_heads: 12
    intermediate_size: 3072
    # The three values below are the terms used in the vocab_size /
    # max_position_embeddings arithmetic above; keep them in sync.
    codebook_size: 1024
    num_vq_tokens: 256
    num_classes: 1000
    initializer_range: 0.02
    norm_type: "layernorm"
    # Written with an explicit mantissa dot so that YAML 1.1 loaders resolve it
    # as a float instead of the string "1e-6".
    layer_norm_eps: 1.0e-6
    use_bias: false
    use_normformer: true
    use_mlm_layer: true
    use_encoder_layernorm: true
    use_mlm_layernorm: true
    hidden_dropout: 0.0
    attention_dropout: 0.0

  # NOTE(review): these two switches are placed at `model` level (siblings of
  # `transformer`), i.e. read as `model.gradient_checkpointing` and
  # `model.enable_xformers_memory_efficient_attention`. The original nesting
  # was lost -- confirm against the code that reads them.
  gradient_checkpointing: true
  enable_xformers_memory_efficient_attention: false
# Dataset configuration: where the shards live and how samples are loaded
# (`params`), and how images are transformed (`preprocessing`).
dataset:
  params:
    train_shards_path_or_url: ???
    eval_shards_path_or_url: ???
    # Interpolation: always mirrors training.batch_size, so the batch size is
    # only ever set in one place.
    batch_size: ${training.batch_size}
    shuffle_buffer_size: ???
    num_workers: ???
    resolution: 256
    # NOTE(review): pin_memory / persistent_workers / num_workers look like
    # data-loader options -- confirm where they are forwarded.
    pin_memory: true
    persistent_workers: true
  preprocessing:
    # Same value as dataset.params.resolution above; change both together.
    resolution: 256
    center_crop: true
    random_flip: false
# Optimizer selection and hyper-parameters.
optimizer:
  # Can be adamw or lion or fused_adamw. Install apex for fused_adamw.
  name: adamw
  # Default adamw params.
  params:
    # Mandatory; also referenced by lr_scheduler.params.learning_rate.
    learning_rate: ???
    # Scale learning rate by total batch size.
    scale_lr: false
    beta1: 0.9
    beta2: 0.999
    weight_decay: 0.01
    # Written with an explicit mantissa dot so that YAML 1.1 loaders resolve it
    # as a float instead of the string "1e-8".
    epsilon: 1.0e-8
# Learning-rate schedule.
lr_scheduler:
  scheduler: "constant_with_warmup"
  params:
    # Interpolation: reuses the optimizer's learning rate rather than
    # duplicating the number here.
    learning_rate: ${optimizer.params.learning_rate}
    warmup_steps: 500
# Training-loop settings.
training:
  gradient_accumulation_steps: 1
  # Referenced by dataset.params.batch_size via interpolation.
  batch_size: 128
  # Kept quoted: an unquoted `no` is read as boolean false by YAML 1.1 loaders,
  # while the string "no" is what is intended here.
  mixed_precision: "no"
  enable_tf32: true
  use_ema: false
  seed: 42
  max_train_steps: ???
  overfit_one_batch: false
  min_masking_rate: 0.0
  label_smoothing: 0.0
  # NOTE(review): null presumably means "no gradient clipping" -- confirm
  # against the training script.
  max_grad_norm: null