modules/SwissArmyTransformer/sat/model/official/chatglm3_model.py [17:79]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        self.max_seq_len = max_seq_len
        
    def attention_forward(self, hidden_states, mask, **kw_args):
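        # origin keeps a handle on this mixin (which owns rotary_pos_emb), while
        # self is rebound to the current layer's attention module so the rest of
        # the body reads like the stock attention implementation.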
        origin = self
        self = self.transformer.layers[kw_args['layer_id']].attention
        attention_fn = attention_fn_default
        if 'attention_fn' in self.hooks:
            attention_fn = self.hooks['attention_fn']

        mixed_raw_layer = self.query_key_value(hidden_states)
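        # Split the fused QKV projection into query / key / value along the last dimension.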
        (mixed_query_layer,
            mixed_key_layer,
            mixed_value_layer) = split_tensor_along_last_dim(mixed_raw_layer, self.stride)

        dropout_fn = self.attention_dropout if self.training else None

        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)

        max_seq_len = kw_args['position_ids'].max() + 1
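        # The rotary embedding lives on the mixin (origin), not on the layer's attention module.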
        query_layer, key_layer = origin.rotary_pos_emb(query_layer, key_layer, kw_args['position_ids'], max_seqlen=max_seq_len)

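        # Incremental decoding: append the new key/value states to this layer's
        # cache along the sequence axis (dim=2) and expose the grown cache for
        # the next step.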
        if kw_args.get('past_key_values', None) is not None:
            pack = kw_args['past_key_values'][kw_args['layer_id']]
            if pack is not None:
                past_key, past_value = pack
                key_layer = torch.cat((past_key, key_layer), dim=2)
                value_layer = torch.cat((past_value, value_layer), dim=2)
        kw_args['output_this_layer']['past_key_values'] = (key_layer, value_layer)

        context_layer = attention_fn(query_layer, key_layer, value_layer, mask, dropout_fn, **kw_args)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
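        # The attention output [b, num_heads, seq, head_dim] has been permuted to
        # [b, seq, num_heads, head_dim]; now merge the heads back into the
        # (per-partition) hidden size.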
        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
        context_layer = context_layer.view(*new_context_layer_shape)
        output = self.dense(context_layer)

        if self.training:
            output = self.output_dropout(output)
        return output

class SwiGLUMixin(BaseMixin):
    def __init__(self, num_layers, in_features, hidden_features, bias=False):
        super().__init__()
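        # One extra gate projection (dense_h_to_4h_gate) per transformer layer,
        # column-parallel with gather_output=False so its output stays sharded
        # like the layer's own dense_h_to_4h.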
        self.w2 = nn.ModuleList([ColumnParallelLinear(
            in_features,
            hidden_features,
            gather_output=False,
            # init_method=init_method,
            bias=bias,
            # params_dtype=params_dtype,
            module=self,
            name="dense_h_to_4h_gate",
            # skip_init=skip_init,
            # device=device
        ) for i in range(num_layers)])

    def mlp_forward(self, hidden_states, **kw_args):
        x = hidden_states
        origin = self.transformer.layers[kw_args['layer_id']].mlp
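        # x1: the layer's own up-projection; x2: the extra gate projection added by this mixin.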
        x1 = origin.dense_h_to_4h(x)
        x2 = self.w2[kw_args['layer_id']](x)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
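
The past_key_values handling in attention_forward above uses the common cache layout
[batch, num_heads, seq_len, head_dim] and grows the cache along dim=2 (the sequence axis)
at each decoding step. A minimal standalone sketch of that cache update in plain PyTorch;
append_kv_cache is a hypothetical helper named here for illustration, not a SAT API:

import torch

def append_kv_cache(past, key_layer, value_layer):
    # past is None on the first step, otherwise a (past_key, past_value) pair
    # shaped [batch, num_heads, cached_len, head_dim]; new states are appended
    # along the sequence axis (dim=2), mirroring the torch.cat calls above.
    if past is not None:
        past_key, past_value = past
        key_layer = torch.cat((past_key, key_layer), dim=2)
        value_layer = torch.cat((past_value, value_layer), dim=2)
    # The returned pair is what gets stored as this layer's past_key_values
    # entry for the next decoding step.
    return key_layer, value_layer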



modules/SwissArmyTransformer/sat/model/official/chatglm4_model.py [17:79]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        self.max_seq_len = max_seq_len
        
    def attention_forward(self, hidden_states, mask, **kw_args):
        origin = self
        self = self.transformer.layers[kw_args['layer_id']].attention
        attention_fn = attention_fn_default
        if 'attention_fn' in self.hooks:
            attention_fn = self.hooks['attention_fn']

        mixed_raw_layer = self.query_key_value(hidden_states)
        (mixed_query_layer,
            mixed_key_layer,
            mixed_value_layer) = split_tensor_along_last_dim(mixed_raw_layer, self.stride)

        dropout_fn = self.attention_dropout if self.training else None

        query_layer = self._transpose_for_scores(mixed_query_layer)
        key_layer = self._transpose_for_scores(mixed_key_layer)
        value_layer = self._transpose_for_scores(mixed_value_layer)

        max_seq_len = kw_args['position_ids'].max() + 1
        query_layer, key_layer = origin.rotary_pos_emb(query_layer, key_layer, kw_args['position_ids'], max_seqlen=max_seq_len)

        if kw_args.get('past_key_values', None) is not None:
            pack = kw_args['past_key_values'][kw_args['layer_id']]
            if pack is not None:
                past_key, past_value = pack
                key_layer = torch.cat((past_key, key_layer), dim=2)
                value_layer = torch.cat((past_value, value_layer), dim=2)
        kw_args['output_this_layer']['past_key_values'] = (key_layer, value_layer)

        context_layer = attention_fn(query_layer, key_layer, value_layer, mask, dropout_fn, **kw_args)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
        context_layer = context_layer.view(*new_context_layer_shape)
        output = self.dense(context_layer)

        if self.training:
            output = self.output_dropout(output)
        return output

class SwiGLUMixin(BaseMixin):
    def __init__(self, num_layers, in_features, hidden_features, bias=False):
        super().__init__()
        self.w2 = nn.ModuleList([ColumnParallelLinear(
            in_features,
            hidden_features,
            gather_output=False,
            # init_method=init_method,
            bias=bias,
            # params_dtype=params_dtype,
            module=self,
            name="dense_h_to_4h_gate",
            # skip_init=skip_init,
            # device=device
        ) for i in range(num_layers)])

    def mlp_forward(self, hidden_states, **kw_args):
        x = hidden_states
        origin = self.transformer.layers[kw_args['layer_id']].mlp
        x1 = origin.dense_h_to_4h(x)
        x2 = self.w2[kw_args['layer_id']](x)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
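
SwiGLUMixin above adds one extra column-parallel projection per layer (dense_h_to_4h_gate)
and pairs it with the layer's existing dense_h_to_4h; the gating and down-projection fall
just outside the excerpted range. For reference, a self-contained single-GPU sketch of the
standard SwiGLU feed-forward this corresponds to, using plain nn.Linear in place of
ColumnParallelLinear; the class and attribute names are illustrative, and which branch
receives the SiLU in the SAT code is not shown in the excerpt:

import torch.nn as nn
import torch.nn.functional as F

class SwiGLUFeedForward(nn.Module):
    """Reference SwiGLU MLP: down(silu(gate(x)) * up(x))."""

    def __init__(self, hidden_size, ffn_hidden_size, bias=False):
        super().__init__()
        self.up = nn.Linear(hidden_size, ffn_hidden_size, bias=bias)    # role of dense_h_to_4h
        self.gate = nn.Linear(hidden_size, ffn_hidden_size, bias=bias)  # role of dense_h_to_4h_gate
        self.down = nn.Linear(ffn_hidden_size, hidden_size, bias=bias)  # role of dense_4h_to_h

    def forward(self, x):
        # Element-wise product of the SiLU-gated branch and the linear branch,
        # then project back down to the model's hidden size.
        return self.down(F.silu(self.gate(x)) * self.up(x))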



