in optimum/habana/transformers/models/glm4v/modeling_chatglm.py [0:0]
def __init__(self, config: GLM4VConfig, device=None, empty_init=True):
super().__init__(config)
if empty_init:
init_method = skip_init
else:
init_method = default_init
init_kwargs = {}
if device is not None:
init_kwargs["device"] = device
self.embedding = init_method(Embedding, config, **init_kwargs)
self.num_layers = config.num_layers
self.multi_query_group_num = config.multi_query_group_num
self.kv_channels = config.kv_channels
# Rotary positional embeddings
self.seq_length = config.seq_length
rotary_dim = (
config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
)
self.rotary_pos_emb = RotaryEmbedding(
rotary_dim // 2,
rope_ratio=config.rope_ratio,
original_impl=config.original_rope,
device=device,
dtype=config.torch_dtype,
)
self.encoder = init_method(GLMTransformer, config, **init_kwargs)
self.output_layer = init_method(
torch.nn.Linear,
config.hidden_size,
config.padded_vocab_size,
bias=False,
dtype=config.torch_dtype,
**init_kwargs,
)
self.pre_seq_len = config.pre_seq_len if config.pre_seq_len is not None else 0
self.prefix_projection = config.prefix_projection
if self.pre_seq_len > 0:
for param in self.parameters():
param.requires_grad = False
self.prefix_tokens = torch.arange(self.pre_seq_len).long()
self.prefix_encoder = PrefixEncoder(config)
self.dropout = torch.nn.Dropout(0.1)
self.vision = EVA2CLIPModel(config)
if hasattr(config, "vision_config"):
self.image_size: int = self.config.vision_config["image_size"]
self.patch_size: int = self.config.vision_config["patch_size"]
self.num_patches = (self.image_size // self.patch_size // 2) ** 2