maga_transformer/models/phi.py (63 lines of code) (raw):
from maga_transformer.utils.util import get_config_from_path
from maga_transformer.config.gpt_init_model_parameters import GptInitModelParameters
from maga_transformer.utils.model_weight import W, CkptWeightInfo, identity, transpose
from maga_transformer.model_loader.model_weight_info import ModelWeightInfo, ModelDeployWeightInfo
from maga_transformer.model_loader.weight_module import AtomicWeight
from maga_transformer.model_loader.ffn_weight import FfnAtomicWeight, FfnWeight
from maga_transformer.model_loader.attn_weight import AttnAtomicWeight
from maga_transformer.models.base_model import BaseModel
from maga_transformer.model_factory_register import register_model


class PhiWeightInfo(ModelDeployWeightInfo):
    def _get_weight_info(self):
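        # The phi-1 style checkpoint stores everything in a flat 'layers.N'
        # namespace: layers.0 holds the token embedding, and the last entry
        # (layers.25 for the 24-layer model assumed here) holds the final
        # layernorm and the lm head.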
        weights = [
            AtomicWeight(W.embedding, [CkptWeightInfo('layers.0.wte.weight', identity)], identity),
            AtomicWeight(W.lm_head, [CkptWeightInfo('layers.25.linear.weight', identity)], identity),
            AtomicWeight(W.lm_head_b, [CkptWeightInfo('layers.25.linear.bias', identity)], identity),
            AtomicWeight(W.final_ln_gamma, [CkptWeightInfo('layers.25.ln.weight', identity)], identity),
            AtomicWeight(W.final_ln_beta, [CkptWeightInfo('layers.25.ln.bias', identity)], identity),
        ]
        attn_config = self.attn_config
        ffn_config = self.ffn_config
        layer_weights = []
        for _ in range(self._num_layers):
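            # '{i_1}' is a placeholder resolved by the weight loader to the
            # layer index plus one; decoder blocks start at layers.1 because
            # layers.0 is the embedding.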
            layer_weight = [
                AtomicWeight(W.pre_ln_beta, [CkptWeightInfo('layers.{i_1}.ln.bias', identity)], identity),
                AtomicWeight(W.pre_ln_gamma, [CkptWeightInfo('layers.{i_1}.ln.weight', identity)], identity),
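                # 'mixer.Wqkv' is the fused query/key/value projection; the
                # transpose converter moves the weights from the checkpoint's
                # torch Linear layout to the layout the runtime expects.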
                AttnAtomicWeight(W.attn_qkv_w, [CkptWeightInfo('layers.{i_1}.mixer.Wqkv.weight', identity)], transpose, config=attn_config),
                AttnAtomicWeight(W.attn_qkv_b, [CkptWeightInfo('layers.{i_1}.mixer.Wqkv.bias', identity)], identity, config=attn_config),
                AttnAtomicWeight(W.attn_o_w, [CkptWeightInfo('layers.{i_1}.mixer.out_proj.weight', identity)], transpose, config=attn_config),
                AttnAtomicWeight(W.attn_o_b, [CkptWeightInfo('layers.{i_1}.mixer.out_proj.bias', identity)], identity, config=attn_config),
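                # MLP: fc1 is the up projection (mapped to W.ffn_w3) and fc2
                # the down projection (W.ffn_w2).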
                FfnWeight(sub_weights=[
                    FfnAtomicWeight(W.ffn_w3, [CkptWeightInfo('layers.{i_1}.mlp.fc1.weight', identity)], transpose, config=ffn_config),
                    FfnAtomicWeight(W.ffn_b3, [CkptWeightInfo('layers.{i_1}.mlp.fc1.bias', identity)], identity, config=ffn_config),
                    FfnAtomicWeight(W.ffn_w2, [CkptWeightInfo('layers.{i_1}.mlp.fc2.weight', identity)], transpose, config=ffn_config),
                    FfnAtomicWeight(W.ffn_b2, [CkptWeightInfo('layers.{i_1}.mlp.fc2.bias', identity)], identity, config=ffn_config)],
                    config=ffn_config)
            ]
            layer_weights.append(layer_weight)
        # The per-layer weight layout is close to falcon's: a single
        # pre-attention layernorm feeds parallel attention and MLP branches.
        return ModelWeightInfo(layer_weights=layer_weights, weights=weights)


class Phi(BaseModel):
    @staticmethod
    def get_weight_cls():
        return PhiWeightInfo

    @classmethod
    def _create_config(cls, ckpt_path: str):
        config_dict = get_config_from_path(ckpt_path)
        size_per_head = config_dict.get('n_embd', 2048) // config_dict.get('n_head', 32)
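        # Defaults match the phi-1 (1.3B) configuration: 24 layers, hidden
        # size 2048, 32 attention heads.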
        config = GptInitModelParameters(
            head_num=config_dict.get('n_head', 32),
            size_per_head=size_per_head,
            inter_size=4 * config_dict.get('n_embd', 2048),
            layer_num=config_dict.get('n_layer', 24),
            max_seq_len=config_dict.get('n_positions', 2048),
            vocab_size=config_dict.get('vocab_size', 32),
            rotary_embedding_dim=config_dict.get('rotary_dim', size_per_head),
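            # rotary_embedding_style=1 enables rotary position embedding; phi
            # applies it only to the first rotary_dim dimensions of each head
            # (partial rotary) and uses no learned absolute positions.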
            rotary_embedding_style=1,
            activation_type='gelu',
            has_positional_encoding=False,
            has_post_decoder_layernorm=True,
            has_lm_head_bias=True,
            tie_word_embeddings=config_dict.get('tie_word_embeddings', False))
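        # phi uses standard multi-head attention, so the kv head count equals
        # the query head count.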
        config.head_num_kv = config.head_num
        return config
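
# Register under the model type string 'phi' so the model factory can resolve
# checkpoints of this type to the Phi implementation above.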
register_model('phi', Phi)