def __init__()

in janus/janusflow/models/modeling_vlm.py [0:0]


    def __init__(self, config: MultiModalityConfig):
        """Assemble the sub-modules of the multi-modality model from ``config``.

        Three vision components (understanding encoder, generation encoder,
        generation decoder) are each instantiated from their own sub-config:
        the ``cls`` field is resolved to a class through ``model_name_to_cls``
        and ``params`` is expanded as keyword arguments. The language model is
        a ``LlamaForCausalLM`` built from ``config.language_config``. Linear
        "aligner" layers and one RMS norm are created around them.

        Args:
            config: Must expose ``vision_und_enc_config``,
                ``vision_gen_enc_config``, ``vision_gen_dec_config`` (each
                with ``cls`` and ``params``) and ``language_config`` (with
                ``rms_norm_eps``).
        """
        super().__init__(config)

        # Layer widths are fixed literals here rather than read from config.
        # NOTE(review): 2048 is presumably the language model hidden size and
        # 1024 / 768 the vision feature widths -- confirm against the configs.
        joint_dim = 2048
        und_feat_dim = 1024
        gen_feat_dim = 768

        # NOTE(review): sub-modules are created in a fixed sequence; this
        # presumably fixes parameter registration order and init-time RNG
        # draws, so the sequence below should not be rearranged.

        # --- understanding branch: encoder, then projection to joint_dim ---
        und_cfg = config.vision_und_enc_config
        self.vision_und_enc_model = model_name_to_cls(und_cfg.cls)(**und_cfg.params)
        self.vision_und_enc_aligner = nn.Linear(
            in_features=und_feat_dim, out_features=joint_dim, bias=True
        )

        # Learnable "begin of understanding" embedding, zero-initialised.
        self.beg_of_und_embed = nn.Parameter(torch.zeros(1, joint_dim))

        # --- generation branch: encoder, then projection to joint_dim ---
        gen_enc_cfg = config.vision_gen_enc_config
        self.vision_gen_enc_model = model_name_to_cls(gen_enc_cfg.cls)(
            **gen_enc_cfg.params
        )
        self.vision_gen_enc_aligner = nn.Linear(
            in_features=gen_feat_dim, out_features=joint_dim, bias=True
        )

        # --- generation branch: decoder ---
        gen_dec_cfg = config.vision_gen_dec_config
        self.vision_gen_dec_model = model_name_to_cls(gen_dec_cfg.cls)(
            **gen_dec_cfg.params
        )

        # --- language model ---
        llm_cfg = config.language_config
        self.language_model = LlamaForCausalLM(llm_cfg)

        # --- decoder aligner: RMS norm, then projection down to gen_feat_dim ---
        self.vision_gen_dec_aligner_norm = LlamaRMSNorm(
            joint_dim, eps=llm_cfg.rms_norm_eps
        )
        self.vision_gen_dec_aligner = nn.Linear(
            in_features=joint_dim, out_features=gen_feat_dim, bias=True
        )