maga_transformer/cpp/engine_base/Executor.h

#pragma once

#include "absl/status/statusor.h"
#include "maga_transformer/cpp/stream/GenerateStream.h"
#include "maga_transformer/cpp/models/GptModel.h"
#include "maga_transformer/cpp/devices/DeviceBase.h"

#include <list>
#include <memory>
#include <optional>
#include <utility>

namespace rtp_llm {

class Executor {
public:
    explicit Executor(rtp_llm::DeviceBase* device): device_(device) {}

    // Run one execution step over the given generation streams.
    virtual absl::Status process(const std::list<GenerateStreamPtr>& streams) = 0;

    // Build the model description (attention, FFN/MoE, and quantization configs)
    // from the engine init parameters.
    static GptModelDescription genModelDescription(const rtp_llm::GptInitParameter& params) {
        rtp_llm::RopeConfig rope_config    = params.getRopeConfig();
        int                 moe_tp_size    = params.tp_size_ * params.dp_size_ / params.ep_size_;
        KvCacheDataType     kv_cache_dtype = loadKvCacheDataTypeFromDataType(params.kv_cache_data_type_);

        rtp_llm::AttentionConfigs attention_config{
            // per-rank head counts after tensor-parallel sharding
            params.head_num_ > 1 ? (size_t)params.head_num_ / params.tp_size_ : 1,
            params.head_num_kv_ > 1 ? (size_t)params.head_num_kv_ / params.tp_size_ : 1,
            (size_t)params.size_per_head_,
            (size_t)params.hidden_size_,
            rope_config,
            (size_t)params.seq_size_per_block_,
            params.is_causal_ ? rtp_llm::AttentionMaskType::causalMask : rtp_llm::AttentionMaskType::noMask,
            1.0,
            // if qk_norm is on, or this is an embedding model (no rotary embedding,
            // no kv cache), the add-bias is fused into the gemm, so disable it here
            !(params.qk_norm_ || (params.rotary_embedding_style_ == 0 && !params.use_kvcache_)),
            false,
            params.use_mla_,
            (size_t)params.q_lora_rank_,
            (size_t)params.kv_lora_rank_,
            (size_t)params.nope_head_dim_,
            (size_t)params.rope_head_dim_,
            (size_t)params.v_head_dim_,
            params.softmax_extra_scale_,
            kv_cache_dtype};

        // TP is handled at init time, so each MoE plugin is assumed to see only a
        // single TP rank; EP is handled inside the MoE plugin.
        auto moe_configs = params.moe_style_
            ? std::optional<rtp_llm::MoeConfigs>(rtp_llm::MoeConfigs({
                  (size_t)params.expert_num_,
                  (size_t)(params.phy_exp_num_ - params.expert_num_),
                  (size_t)params.moe_k_,
                  params.moe_normalize_expert_scale_,
                  params.moe_inter_padding_size_ / moe_tp_size,
                  params.has_moe_norm_,
                  (size_t)params.ep_rank_,
                  (size_t)params.ep_size_,
                  (size_t)params.tp_rank_,
                  (size_t)params.tp_size_,
                  (size_t)params.dp_rank_,
                  (size_t)params.dp_size_,
                  (int)params.scoring_func_,
                  (int)params.moe_topk_group_,
                  (int)params.moe_n_group_,
                  params.enable_eplb_}))
            : std::nullopt;

        rtp_llm::FfnConfigs ffn_config{
            rtp_llm::getActivationType(params.activation_type_str_),
            std::move(moe_configs),
        };

        // Map the weight quantization algorithm onto an activation quantization scheme.
        rtp_llm::QScheme act_qscheme = rtp_llm::QScheme::NoQuantize;
        if (params.quant_algo_.isPerTensorQuant()) {
            act_qscheme = rtp_llm::QScheme::Qint8PerTensor;
        } else if (params.quant_algo_.isSmoothQuant() || params.quant_algo_.isOmniQuant()) {
            act_qscheme = rtp_llm::QScheme::Qint8PerToken;
        } else if (params.quant_algo_.isFp8() && !params.quant_algo_.isGroupwise()) {
            act_qscheme = rtp_llm::QScheme::Qfp8PerTensor;
        } else if (params.quant_algo_.isFp8() && params.quant_algo_.isGroupwise()) {
            act_qscheme = rtp_llm::QScheme::Qfp8PerTokenBlock;
        }

        return {attention_config,
                ffn_config,
                rtp_llm::getNormType(params.norm_type_str_),
                act_qscheme,
                params.layernorm_eps_,
                (size_t)params.vocab_size_,
                params.layernorm_type_ == rtp_llm::LayerNormType::post_layernorm,
                params.input_embedding_scalar_,
                params.residual_scalar_,
                params.reverse_e_h_norm_};
    }

    virtual ~Executor() = default;

public:
    rtp_llm::DeviceBase* device_;
};

}  // namespace rtp_llm
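Below is a minimal usage sketch, not part of Executor.h itself: it assumes a hypothetical DummyExecutor subclass to show how the pure-virtual process() contract might be satisfied and where genModelDescription() is expected to be called; only the Executor API above comes from the source, everything else is illustrative.

// Illustrative only: DummyExecutor is a hypothetical subclass, not part of the repo.
#include "maga_transformer/cpp/engine_base/Executor.h"

namespace rtp_llm {

class DummyExecutor: public Executor {
public:
    explicit DummyExecutor(rtp_llm::DeviceBase* device): Executor(device) {}

    // Trivially succeed without touching the streams.
    absl::Status process(const std::list<GenerateStreamPtr>& streams) override {
        (void)streams;
        return absl::OkStatus();
    }
};

// Assumed call site: derive the model description from init parameters
// before constructing the model, e.g.
//   GptModelDescription desc = Executor::genModelDescription(params);

}  // namespace rtp_llm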