in muse/modeling_transformer.py [0:0]
def __init__(
self,
vocab_size, # codebook_size + 1 (for the mask token); for class-conditioned generation it'll be codebook_size + num_classes + 1
hidden_size=768,
embedding_size=None,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_dropout=0.1,
attention_dropout=0.1,
max_position_embeddings=256, # for class-conditioned generation it'll be 256 + 1 (for the class token)
add_cross_attention=False,
encoder_hidden_size=1024, # T5-large
project_encoder_hidden_states=False,
initializer_range=0.02,
norm_type="layernorm", # or rmsnorm
layer_norm_eps=1e-5,
use_normformer=True,
use_encoder_layernorm=True,
use_mlm_layer=True,
use_mlm_layernorm=True,
use_bias=False,
codebook_size=1024,
num_vq_tokens=256,
num_classes=None, # set for class-conditioned generation
use_codebook_size_for_output=False,
use_conv_in_out=False,
patch_size=1,
**kwargs,
):
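
For reference, a minimal instantiation sketch for the class-conditioned case is shown below. The class name `MaskGitTransformer` and the import path are assumptions inferred from the file location, not confirmed by this excerpt; the keyword arguments simply mirror the defaults and comments in the signature above.

# Minimal sketch (assumed class name and import path; adjust to whatever
# class wraps this __init__ in muse/modeling_transformer.py).
from muse.modeling_transformer import MaskGitTransformer  # hypothetical import

codebook_size = 1024
num_classes = 1000  # e.g. ImageNet-style class conditioning

model = MaskGitTransformer(
    # codebook tokens + class tokens + 1 mask token, per the vocab_size comment
    vocab_size=codebook_size + num_classes + 1,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    # 256 image-token positions + 1 position for the class token
    max_position_embeddings=256 + 1,
    codebook_size=codebook_size,
    num_vq_tokens=256,
    num_classes=num_classes,
)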