maga_transformer/models/chat_glm_v2.py:

from typing import List, Any, Dict

import torch

from maga_transformer.config.gpt_init_model_parameters import GptInitModelParameters
from maga_transformer.utils.util import get_config_from_path
from maga_transformer.tokenizer.tokenization_chatglm2 import ChatGLMTokenizer
from maga_transformer.models.glm_v2_weight import GlmV2WeightInfo
from maga_transformer.models.base_model import BaseModel
from maga_transformer.model_factory_register import register_model


class ChatGlmV2(BaseModel):
    """ChatGLM2 model registration: maps a HuggingFace config.json onto GptInitModelParameters."""

    @classmethod
    def get_tokenizer(cls, config: GptInitModelParameters):
        return ChatGLMTokenizer.from_pretrained(config.tokenizer_path)

    @staticmethod
    def get_weight_cls():
        return GlmV2WeightInfo

    @classmethod
    def from_huggingface(cls, config_json: Dict[str, Any]):
        '''
        "apply_query_key_layer_scaling": true,
        "apply_residual_connection_post_layernorm": false,
        "attention_softmax_in_fp32": true,
        "fp32_residual_connection": false,
        "original_rope": true,
        '''
        config = GptInitModelParameters(head_num=32,
                                        size_per_head=128,
                                        layer_num=32,
                                        max_seq_len=8192,
                                        vocab_size=65024)
        config.head_num = config_json['num_attention_heads']
        # ChatGLM2 uses multi-query attention, i.e. a smaller number of KV heads.
        if config_json.get('multi_query_attention', False):
            config.head_num_kv = config_json['multi_query_group_num']
        else:
            config.head_num_kv = config.head_num
        config.size_per_head = config_json['hidden_size'] // config_json['num_attention_heads']
        config.layer_num = config_json['num_layers']
        config.max_seq_len = config_json.get('seq_length', 8192)
        config.vocab_size = config_json['padded_vocab_size']
        config.layernorm_eps = config_json['layernorm_epsilon']
        config.inter_size = config_json['ffn_hidden_size']
        config.add_bias_linear = config_json['add_bias_linear']
        config.has_post_decoder_layernorm = config_json['post_layer_norm']
        if 'pre_seq_len' in config_json:
            config.pre_seq_len = config_json['pre_seq_len']
        if 'prefix_projection' in config_json:
            config.prefix_projection = config_json['prefix_projection']
        config.src_quantization_bit = config_json.get('quantization_bit', 0)
        config.rotary_embedding_dim = config.size_per_head
        config.tie_word_embeddings = config_json.get('tie_word_embeddings', False)
        config.special_tokens.pad_token_id = config_json.get('pad_token_id', 0)
        config = cls.get_rotary_embedding_scale(config, config_json)
        cls.update_stop_words(config, config_json)
        return config

    @classmethod
    def update_stop_words(cls, config: GptInitModelParameters, config_json: Dict[str, Any]):
        config.special_tokens.eos_token_id = config_json.get('eos_token_id', 2)

    @staticmethod
    def get_rotary_embedding_scale(config, config_json):
        config.rotary_embedding_scale = config_json.get("rope_ratio", 1)
        return config

    @staticmethod
    def default_config():
        # Fallback parameters used when no config.json is found in the checkpoint path.
        config = GptInitModelParameters(head_num=32,
                                        head_num_kv=2,
                                        size_per_head=128,
                                        layer_num=32,
                                        max_seq_len=8192,
                                        vocab_size=65024,
                                        layernorm_eps=1e-5,
                                        inter_size=13696,
                                        add_bias_linear=False,
                                        has_post_decoder_layernorm=False)
        return config

    @staticmethod
    def modify_config(config):
        # Architecture settings applied regardless of which config path was taken.
        config.use_attention_linear_bias = False
        config.activation_type = "SiGLU"
        config.norm_type = "rmsnorm"
        config.rotary_embedding_dim = 128
        config.rotary_embedding_style = 2
        return config

    @classmethod
    def _create_config(cls, ckpt_path: str):
        config_dict = get_config_from_path(ckpt_path)
        if config_dict is not None:
            config = ChatGlmV2.from_huggingface(config_dict)
        else:
            config = ChatGlmV2.default_config()
        config = ChatGlmV2.modify_config(config)
        return config


register_model('chatglm2', ChatGlmV2, ["ChatGLMModel"],
               ["THUDM/chatglm2-6b", "THUDM/chatglm2-6b-int4", "THUDM/chatglm2-6b-32k"])
register_model('chat_glm_2', ChatGlmV2)
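

# Usage sketch (illustrative only, not part of the upstream file): exercises
# from_huggingface() with a hand-written dict that mirrors the fields read
# above. The concrete values below are assumptions chosen to resemble a
# chatglm2-6b config.json, not values taken from this repository.
if __name__ == "__main__":
    example_config_json = {
        "num_attention_heads": 32,
        "multi_query_attention": True,
        "multi_query_group_num": 2,
        "hidden_size": 4096,
        "num_layers": 28,
        "seq_length": 32768,
        "padded_vocab_size": 65024,
        "layernorm_epsilon": 1e-5,
        "ffn_hidden_size": 13696,
        "add_bias_linear": False,
        "post_layer_norm": True,
        "eos_token_id": 2,
    }
    config = ChatGlmV2.from_huggingface(example_config_json)
    config = ChatGlmV2.modify_config(config)
    # For the values above: head_num=32, head_num_kv=2, size_per_head=4096 // 32 = 128.
    print(config.head_num, config.head_num_kv, config.size_per_head)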