in janus/janusflow/models/modeling_vlm.py [0:0]
def __init__(self, config: MultiModalityConfig):
    """Build every sub-module of the multi-modality model from ``config``.

    Sub-modules are created in a fixed order: vision understanding encoder
    and its aligner, the begin-of-understanding embedding, vision generation
    encoder and its aligner, vision generation decoder, the language model,
    and finally the norm + linear pair feeding the generation decoder.

    Args:
        config: Must expose ``vision_und_enc_config``,
            ``vision_gen_enc_config`` and ``vision_gen_dec_config`` (each
            with a ``cls`` name and a ``params`` mapping), plus
            ``language_config`` (which supplies ``rms_norm_eps``).
    """
    super().__init__(config)

    # Feature widths used by the aligners below. They are literals in this
    # constructor rather than values read from ``config``.
    # NOTE(review): 2048 presumably matches the language model's hidden
    # size, and 1024 / 768 the two vision towers' widths -- confirm.
    und_width = 1024
    gen_width = 768
    embed_width = 2048

    # NOTE: creation order is kept deliberately; it fixes both the order in
    # which sub-modules are registered and the order in which random
    # initialisation draws from the RNG.

    # --- understanding branch: encoder, then projection to embed_width ---
    und_enc_cfg = config.vision_und_enc_config
    self.vision_und_enc_model = model_name_to_cls(und_enc_cfg.cls)(
        **und_enc_cfg.params
    )
    self.vision_und_enc_aligner = nn.Linear(
        in_features=und_width, out_features=embed_width, bias=True
    )

    # Zero-initialised parameter of shape (1, embed_width) marking the
    # beginning of the understanding input.
    self.beg_of_und_embed = nn.Parameter(torch.zeros(1, embed_width))

    # --- generation branch: encoder, then projection to embed_width ---
    gen_enc_cfg = config.vision_gen_enc_config
    self.vision_gen_enc_model = model_name_to_cls(gen_enc_cfg.cls)(
        **gen_enc_cfg.params
    )
    self.vision_gen_enc_aligner = nn.Linear(
        in_features=gen_width, out_features=embed_width, bias=True
    )

    # --- generation branch: decoder ---
    gen_dec_cfg = config.vision_gen_dec_config
    self.vision_gen_dec_model = model_name_to_cls(gen_dec_cfg.cls)(
        **gen_dec_cfg.params
    )

    # --- language model ---
    llm_cfg = config.language_config
    self.language_model = LlamaForCausalLM(llm_cfg)

    # --- generation decoder aligner: RMS norm, then projection back down
    # from embed_width to gen_width ---
    self.vision_gen_dec_aligner_norm = LlamaRMSNorm(
        embed_width, eps=llm_cfg.rms_norm_eps
    )
    self.vision_gen_dec_aligner = nn.Linear(
        in_features=embed_width, out_features=gen_width, bias=True
    )