# maga_transformer/models/qwen_v2.py

import logging
import os
import functools
import json
from typing import List, Any, Dict

from maga_transformer.utils.model_weight import (W, WeightStyle, CkptWeightInfo, identity, sp_0, sp_head_lora,
                                                 sp_id, sp_neg1, zeros, transpose, transpose_pad, merge_qkv_b,
                                                 merge_qkv_hf, merge_qkv_lora_A, merge_qkv_lora_B)
from maga_transformer.config.gpt_init_model_parameters import GptInitModelParameters
from maga_transformer.models.qwen import QWen
from transformers import AutoTokenizer
from maga_transformer.model_factory_register import register_model
from maga_transformer.model_loader.weight_module import WeightModule, AtomicWeight
from maga_transformer.model_loader.ffn_weight import FfnAtomicWeight, FfnWeight, FfnConfig
from maga_transformer.model_loader.attn_weight import AttnAtomicWeight, AttnConfig
from maga_transformer.model_loader.model_weight_info import ModelWeightInfo, ModelDeployWeightInfo


def scale_reshape(ts):
    return ts[0].reshape(-1)


class QWenV2Weight(ModelDeployWeightInfo):
    def __init__(self, *args: Any, **kwargs: Any):
        self.prefix: str = kwargs.pop('prefix', "")
        super().__init__(*args, **kwargs)

    @property
    def support_lora(self):
        return True

    def _process_meta(self, meta_dicts: Any, weight_keys: List[str]):
        # compat for qwen_v2_video
        if self._contains(weight_keys, 'language_model.'):
            self.prefix = 'language_model.'
        if self.prefix + 'transformer.layers.0.attention.dense.weight' in meta_dicts[0]:
            self.weight_style = WeightStyle.TRT_ENGINE
        logging.info(f"weight_style: {self.weight_style}")

    def _get_weight_info(self):
        return self._get_hf_weight_info()

    def _get_hf_ffn_layer_weight_info(self, layer_id: int) -> List[WeightModule]:
        ffn_config = FfnConfig(
            is_gated_activation=self._is_gated_activation,
            inter_padding_size=self._inter_padding_size,
            is_moe=False
        )
        inter_padding_size = self._layer_inter_padding_size[layer_id] if self._layer_inter_padding_size else self._inter_padding_size
        return [FfnWeight(sub_weights=[
            FfnAtomicWeight(
                W.ffn_w1,
                [CkptWeightInfo(self.prefix + 'model.layers.{i}.mlp.gate_proj.weight', identity)],
                functools.partial(transpose_pad, inter_padding_size=inter_padding_size, dim=0),
                config=ffn_config,
                lora_a_process_func=transpose,
                lora_b_process_func=functools.partial(transpose_pad, inter_padding_size=inter_padding_size, dim=0),
                lora_a_split_func=sp_id,
                lora_b_split_func=sp_neg1),
            FfnAtomicWeight(
                W.ffn_w3,
                [CkptWeightInfo(self.prefix + 'model.layers.{i}.mlp.up_proj.weight', identity)],
                functools.partial(transpose_pad, inter_padding_size=inter_padding_size, dim=0),
                config=ffn_config,
                lora_a_process_func=transpose,
                lora_b_process_func=functools.partial(transpose_pad, inter_padding_size=inter_padding_size, dim=0),
                lora_a_split_func=sp_id,
                lora_b_split_func=sp_neg1),
            FfnAtomicWeight(
                W.ffn_w2,
                [CkptWeightInfo(self.prefix + 'model.layers.{i}.mlp.down_proj.weight', identity)],
                functools.partial(transpose_pad, inter_padding_size=inter_padding_size, dim=1),
                config=ffn_config,
                lora_a_process_func=functools.partial(transpose_pad, inter_padding_size=inter_padding_size, dim=1),
                lora_b_process_func=transpose,
                lora_a_split_func=sp_0,
                lora_b_split_func=sp_id)
        ], config=ffn_config)]

    def _get_hf_layer_weight_info(self, layer_id: int):
        attn_config = AttnConfig(
            hidden_size=self._hidden_size,
            size_per_head=self._size_per_head,
            head_num=self._head_num,
            head_num_kv=self._head_num_kv)
        layer_weights = [
            AtomicWeight(
                W.pre_ln_gamma,
                [CkptWeightInfo(self.prefix + 'model.layers.{i}.input_layernorm.weight', identity)],
                identity),
            AttnAtomicWeight(
                W.attn_qkv_b,
                [
                    CkptWeightInfo(self.prefix + 'model.layers.{i}.self_attn.q_proj.bias', identity),
                    CkptWeightInfo(self.prefix + 'model.layers.{i}.self_attn.k_proj.bias', identity),
                    CkptWeightInfo(self.prefix + 'model.layers.{i}.self_attn.v_proj.bias', identity)
                ],
                functools.partial(merge_qkv_b),
                config=attn_config),
            AttnAtomicWeight(
                W.attn_qkv_w,
                [
                    CkptWeightInfo(self.prefix + 'model.layers.{i}.self_attn.q_proj.weight', identity),
                    CkptWeightInfo(self.prefix + 'model.layers.{i}.self_attn.k_proj.weight', identity),
                    CkptWeightInfo(self.prefix + 'model.layers.{i}.self_attn.v_proj.weight', identity)
                ],
                functools.partial(merge_qkv_hf),
                config=attn_config,
                lora_a_process_func=functools.partial(
                    merge_qkv_lora_A, allow_empty=False, hidden_size=self._hidden_size,
                    head_num=self._head_num, head_num_kv=self._head_num_kv, size_per_head=self._size_per_head),
                lora_b_process_func=functools.partial(
                    merge_qkv_lora_B, allow_empty=False, hidden_size=self._hidden_size,
                    head_num=self._head_num, head_num_kv=self._head_num_kv, size_per_head=self._size_per_head),
                lora_a_split_func=sp_id,
                lora_b_split_func=sp_head_lora),
            AttnAtomicWeight(
                W.attn_o_w,
                [CkptWeightInfo(self.prefix + 'model.layers.{i}.self_attn.o_proj.weight', identity)],
                transpose,
                config=attn_config,
                lora_a_process_func=transpose,
                lora_b_process_func=transpose,
                lora_a_split_func=sp_0,
                lora_b_split_func=sp_id),
            AtomicWeight(
                W.post_ln_gamma,
                [CkptWeightInfo(self.prefix + 'model.layers.{i}.post_attention_layernorm.weight', identity)],
                identity,
                config=attn_config),
        ]
        if self._use_qk_norm:
            layer_weights.extend([
                AttnAtomicWeight(
                    W.q_ln_gamma,
                    [CkptWeightInfo(self.prefix + 'model.layers.{i}.self_attn.q_norm.weight')],
                    config=attn_config),
                AttnAtomicWeight(
                    W.k_ln_gamma,
                    [CkptWeightInfo(self.prefix + 'model.layers.{i}.self_attn.k_norm.weight')],
                    config=attn_config)])
        layer_weights.extend(self._get_hf_ffn_layer_weight_info(layer_id))
        return layer_weights

    def _get_hf_weight_info(self):
        if self.weight_style == WeightStyle.TRT_ENGINE:
            weights = [
                AtomicWeight(W.embedding, [CkptWeightInfo('transformer.vocab_embedding.weight', identity)], identity),
                AtomicWeight(W.lm_head, [CkptWeightInfo('lm_head.weight', identity)], identity),
                AtomicWeight(W.final_ln_gamma, [CkptWeightInfo('transformer.ln_f.weight', identity)], identity),
                AtomicWeight(W.final_ln_beta, [], functools.partial(zeros, shape=[self._hidden_size])),
            ]
        else:
            weights = [
                AtomicWeight(W.embedding, [CkptWeightInfo(self.prefix + 'model.embed_tokens.weight', identity)], identity),
                AtomicWeight(W.lm_head, [CkptWeightInfo(self.prefix + 'lm_head.weight', identity)], identity),
                AtomicWeight(W.final_ln_gamma, [CkptWeightInfo(self.prefix + 'model.norm.weight', identity)], identity),
                AtomicWeight(W.final_ln_beta, [], functools.partial(zeros, shape=[self._hidden_size])),
            ]

        layer_weights: List[List[WeightModule]] = []
        for layer in range(self._num_layers):
            w = self._get_hf_layer_weight_info(layer)
            layer_weights.append(w)
        return ModelWeightInfo(layer_weights=layer_weights, weights=weights)


class QWenV2(QWen):
    @classmethod
    def _create_config(cls, ckpt_path: str):
        config = GptInitModelParameters(
            head_num=0,
            head_num_kv=0,
            size_per_head=0,
            layer_num=0,
            inter_size=0,  # 13696
            vocab_size=152064,
            max_seq_len=8192)
        config.rotary_embedding_dim = 128
        config.rotary_embedding_style = 1
        config.activation_type = 'SiGLU'
        config.has_pre_decoder_layernorm = False
        config.has_post_decoder_layernorm = True
        config.norm_type = 'rmsnorm'
        config.special_tokens.bos_token_id = -1
        config.special_tokens.eos_token_id = 151643
        # <|im_start|> and <|im_end|>
        config.special_tokens.stop_words_id_list = [[151645], [151644]]
        config.special_tokens.system.token_ids = [151644, 8948, 198]      # '<|im_start|>system\n'
        config.special_tokens.system.eos_token_ids = [151645, 198]        # '<|im_end|>\n'
        config.special_tokens.user.token_ids = [151644, 872, 198]         # '<|im_start|>user\n'
        config.special_tokens.user.eos_token_ids = [151645, 198]          # '<|im_end|>\n'
        config.special_tokens.assistant.token_ids = [151644, 77091, 198]  # '<|im_start|>assistant\n'
        config.special_tokens.assistant.eos_token_ids = [151645, 198]     # '<|im_end|>\n'
        cls._from_hf(config, ckpt_path)
        assert config.head_num > 0 and config.head_num_kv > 0 and config.size_per_head > 0 and \
            config.layer_num > 0 and config.inter_size > 0, "error config"
        return config

    @classmethod
    def _from_hf(cls, config: GptInitModelParameters, ckpt_path: str):
        config_path = os.path.join(ckpt_path, "config.json")
        if not os.path.exists(config_path):
            return
        with open(config_path) as reader:
            content = reader.read()
            config_json = json.loads(content)
            QWenV2._from_config_json(config, config_json)
        return config

    @staticmethod
    def _from_config_json(config: GptInitModelParameters, config_json: Dict[str, Any]):
        # config.activation_type = config_json["hidden_act"]
        config.inter_size = config_json["intermediate_size"]
        config.head_num = config_json["num_attention_heads"]
        config.head_num_kv = config_json.get("num_key_value_heads", config.head_num)
        config.size_per_head = int(config_json.get("head_dim")) if "head_dim" in config_json \
            else config_json["hidden_size"] // config.head_num
        if config_json.get("hidden_size") is not None:
            config.hidden_size = config_json["hidden_size"]
        config.layer_num = config_json["num_hidden_layers"]
        config.rotary_embedding_base = config_json.get("rope_theta", config.rotary_embedding_base)
        config.vocab_size = config_json["vocab_size"]
        config.rotary_embedding_dim = config.size_per_head
        config.layernorm_eps = config_json.get("rms_norm_eps", 1e-06)
        config.tie_word_embeddings = config_json.get('tie_word_embeddings', False)

    @staticmethod
    def get_weight_cls():
        return QWenV2Weight

    @classmethod
    def get_tokenizer(cls, config: GptInitModelParameters):
        tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_path, verbose=False, trust_remote_code=True)
        tokenizer.im_start_id = tokenizer.encode('<|im_start|>')[0]
        tokenizer.im_end_id = tokenizer.encode('<|im_end|>')[0]
        return tokenizer


class QWenV2Embedding(QWenV2):
    @classmethod
    def _create_config(cls, ckpt_path: str):
        config = QWenV2._create_config(ckpt_path)
        config.is_causal = False
        return config


register_model('qwen_2', QWenV2, ["Qwen2ForCausalLM"])
register_model('qwen_agent', QWenV2)
register_model('qwen_2_embedding', QWenV2Embedding)
register_model("qwen_tool", QWenV2)
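

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal sketch of how QWenV2._from_config_json maps Hugging Face
# config.json fields onto GptInitModelParameters. The dict values below
# are Qwen2-7B-style numbers used purely for illustration; only the
# constructor call and _from_config_json itself come from this file.
if __name__ == "__main__":
    _demo_config = GptInitModelParameters(
        head_num=0, head_num_kv=0, size_per_head=0, layer_num=0,
        inter_size=0, vocab_size=152064, max_seq_len=8192)
    QWenV2._from_config_json(_demo_config, {
        "intermediate_size": 18944,   # illustrative value
        "num_attention_heads": 28,
        "num_key_value_heads": 4,
        "hidden_size": 3584,
        "num_hidden_layers": 28,
        "rope_theta": 1000000.0,
        "vocab_size": 152064,
        "rms_norm_eps": 1e-06,
    })
    # "head_dim" is absent, so size_per_head falls back to hidden_size // head_num.
    assert _demo_config.size_per_head == 3584 // 28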