in models/vision_language_model.py [0:0]
def __init__(self, cfg: VLMConfig, load_backbone=True):
    """Assemble the model's sub-components from ``cfg``.

    Args:
        cfg: Configuration object handed to every sub-component
            constructor; its ``lm_tokenizer``, ``vlm_extra_tokens`` and
            ``lm_chat_template`` fields are forwarded to ``get_tokenizer``.
        load_backbone: When truthy, the vision encoder and the decoder are
            created through their ``from_pretrained`` constructors;
            otherwise both are constructed directly from ``cfg``.
    """
    super().__init__()
    self.cfg = cfg

    # Choose the factories first; the objects are built right after so the
    # attribute assignment order stays vision_encoder -> decoder -> MP.
    if load_backbone:
        print("Loading from backbone weights")
        make_vision_encoder = ViT.from_pretrained
        make_decoder = LanguageModel.from_pretrained
    else:
        make_vision_encoder = ViT
        make_decoder = LanguageModel

    self.vision_encoder = make_vision_encoder(cfg)
    self.decoder = make_decoder(cfg)

    # The modality projector is always constructed directly from cfg,
    # regardless of load_backbone.
    self.MP = ModalityProjector(cfg)
    self.load_backbone = load_backbone
    self.tokenizer = get_tokenizer(
        cfg.lm_tokenizer,
        cfg.vlm_extra_tokens,
        cfg.lm_chat_template,
    )