// maga_transformer/cpp/th_op/GptInitParameter.h
#pragma once
#include "maga_transformer/cpp/utils/layernorm_types.h"
#include "maga_transformer/cpp/utils/activation_types.h"
#include "maga_transformer/cpp/utils/quantization.h"
#include "maga_transformer/cpp/utils/RopeConfig.h"
#include "maga_transformer/cpp/utils/MlaConfig.h"
#include "maga_transformer/cpp/utils/EplbConfig.h"
#include "maga_transformer/cpp/utils/QuantInfo.h"
#include "maga_transformer/cpp/core/Types.h"
#include "maga_transformer/cpp/th_op/GptInitParameterRegister.h"
#include <vector>
#include <map>
#include <string>
#include <cstdint>
#include <pybind11/pybind11.h>  // for py::object below; may also be pulled in via GptInitParameterRegister.h

namespace py = pybind11;  // assumed alias; redeclaring an identical namespace alias is harmless if defined upstream

namespace rtp_llm {
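
// Task served by the model; parsed from a string via GptInitParameter::setTaskType().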
enum TaskType {
    DENSE_EMBEDDING    = 0,
    ALL_EMBEDDING      = 1,
    SPARSE_EMBEDDING   = 2,
    COLBERT_EMBEDDING  = 3,
    LANGUAGE_MODEL     = 4,
    SEQ_CLASSIFICATION = 5,
    RERANKER           = 6,
    LINEAR_SOFTMAX     = 7,
    BGE_M3             = 8
};
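
// Special tokens attached to a single chat role (user / assistant / system).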
struct RoleSpecialTokens {
public:
    std::vector<int64_t> token_ids_;
    std::vector<int64_t> eos_token_ids_;
};
struct SpecialTokens {
public:
    SpecialTokens();

    int64_t bos_token_id_           = -1;
    int64_t eos_token_id_           = 0;
    int64_t pad_token_id_           = 0;
    int64_t decoder_start_token_id_ = -1;

    RoleSpecialTokens user_;
    RoleSpecialTokens assistant_;
    RoleSpecialTokens system_;

    std::vector<std::vector<int64_t>> stop_words_id_list_;
    std::vector<std::string>          stop_words_str_list_;
};
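
// Aggregates model-architecture, runtime, and distributed-serving parameters for
// engine initialization; exposed to Python through the th_op layer (see
// GptInitParameterRegister.h).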
class GptInitParameter {
public:
    // model variant params used in ft
    int64_t head_num_               = 0;
    int64_t head_num_kv_            = -1;
    int64_t size_per_head_          = 0;
    int64_t inter_size_             = 0;
    int64_t inter_padding_size_     = -1;
    int64_t moe_inter_padding_size_ = -1;
    int64_t num_layers_             = 0;
    int64_t num_valid_layer_        = 0;
    int64_t hidden_size_            = 0;

    // MLA (multi-head latent attention) extra params
    bool       use_mla_       = false;
    int64_t    q_lora_rank_   = 0;
    int64_t    kv_lora_rank_  = 0;
    int64_t    nope_head_dim_ = 0;
    int64_t    rope_head_dim_ = 0;
    int64_t    v_head_dim_    = 0;
    MlaOpsType mla_ops_type_  = MlaOpsType::AUTO;

    // rope config for deepseek
    double deepseek_rope_mscale_    = 1.0;
    double deepseek_mscale_all_dim_ = 1.0;

    // deepseek moe extra params
    int64_t moe_n_group_    = 1;
    int64_t moe_topk_group_ = 1;

    // in sparse models these params may vary across layers
    bool                 is_sparse_head_           = false;
    std::vector<int64_t> layer_head_num_           = {};
    std::vector<int64_t> layer_head_num_kv_        = {};
    std::vector<int64_t> layer_inter_size_         = {};
    std::vector<int64_t> layer_inter_padding_size_ = {};
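
    // normalization / activation config; each *_str_ field below is parsed into
    // its enum counterpart by the matching set*() helper declared at the bottom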
    double      layernorm_eps_          = 1e-5;
    std::string layernorm_type_str_     = "pre_layernorm";
    std::string norm_type_str_          = "layernorm";
    std::string activation_type_str_    = "Gelu";
    std::string kv_cache_data_type_str_ = "fp16";

    LayerNormType  layernorm_type_    = LayerNormType::pre_layernorm;
    NormType       norm_type_         = NormType::layernorm;
    TaskType       task_type_         = TaskType::LANGUAGE_MODEL;
    ActivationType activation_type_   = ActivationType::Gelu;
    DataType       kv_cache_data_type_ = DataType::TYPE_FP16;
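
    // rotary position embedding (RoPE) and position-id settings; see getRopeConfig()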
    int64_t rotary_embedding_dim_    = 0;
    int64_t rotary_embedding_style_  = 0;
    int64_t position_ids_style_      = 0;
    float   rotary_embedding_base_   = 10000.f;
    double  rotary_embedding_scale_  = 1.0;
    double  rotary_factor1_          = 0;
    double  rotary_factor2_          = 0;
    int64_t org_embedding_max_pos_   = 0;
    double  rotary_embedding_mscale_ = 1.0;
    int64_t rotary_embedding_offset_ = 0;

    // for Gemma: hidden_states = hidden_states * (hidden_size ** 0.5)
    double input_embedding_scalar_ = 1;
    double residual_scalar_        = 1;
    float  softmax_extra_scale_    = 1.0f;

    std::vector<int64_t> mrope_section_ = {};

    bool    use_logn_attn_              = false;
    double  q_scaling_                  = 1;
    bool    qk_norm_                    = false;
    bool    use_cross_attn_             = false;
    int64_t cross_attn_input_len_       = 0;
    bool    use_norm_input_residual_    = false;
    bool    use_norm_attn_out_residual_ = false;

    std::string data_type_        = "fp16";
    int64_t     local_rank_       = 0;
    int64_t     max_seq_len_      = 0;
    int64_t     vocab_size_       = 0;
    int64_t     input_vocab_size_ = 0; // 0 if not set
    int64_t     type_vocab_size_  = 0;
    int64_t     embedding_size_   = 0;
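
    // mixture-of-experts (MoE) config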
    int64_t expert_num_                 = 0;
    int64_t moe_k_                      = 0;
    bool    moe_normalize_expert_scale_ = false;
    // 0 for no moe; 1 for all layer moe; 2 for partial layer moe
    int64_t moe_style_ = 0;
    // 0 for softmax; 1 for sigmoid
    int64_t scoring_func_ = 0;
    std::vector<int64_t> moe_layer_index_ = {};

    // EPLB (expert-parallel load balancing)
    bool       enable_eplb_      = false;
    int64_t    phy_exp_num_      = 0; // number of physical experts
    int64_t    eplb_update_time_ = 5000;
    EplbMode   eplb_mode_        = EplbMode::NONE;
    py::object py_eplb_;

    bool   has_positional_encoding_    = false;
    bool   has_pre_decoder_layernorm_  = false;
    bool   has_post_decoder_layernorm_ = false;
    bool   has_lm_head_                = true;
    bool   use_attention_linear_bias_  = false;
    bool   use_fp32_to_compute_logit_  = false;
    bool   add_bias_linear_            = false;
    bool   has_moe_norm_               = false;
    double logit_scale_                = 1.0;
    bool   is_causal_                  = true;
    bool   use_kvcache_                = true;

    std::string tokenizer_path_    = "";
    std::string ckpt_path_         = "";
    int64_t     pre_seq_len_       = 0;
    bool        prefix_projection_ = false;
    bool        using_hf_sampling_ = false;

    SpecialTokens special_tokens_;
    QuantAlgo     quant_algo_;

    // async mode config
    int64_t max_generate_batch_size_ = 1;
    int64_t max_context_batch_size_  = 1;
    int64_t gen_num_per_circle_      = 1;

    bool                              is_multimodal_         = false;
    std::vector<std::vector<int64_t>> mm_sep_tokens_         = {};
    bool                              include_sep_tokens_    = false;
    int64_t mm_position_ids_style_  = 0; // 0 for default; 1 for chatglm4v; 2 for qwen2 vl
    int64_t position_id_len_factor_ = 1;
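
    // memory planning, KV-cache block layout, and runtime scheduling knobs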
    bool    pre_allocate_op_mem_              = true;
    int64_t seq_size_per_block_               = 8;
    int64_t block_nums_                       = 0;
    int64_t scheduler_reserve_resource_ratio_ = 5;
    int64_t reserve_runtime_mem_mb_           = 0;
    int64_t kv_cache_mem_mb_                  = 0;
    bool    reuse_cache_                      = false;
    bool    enable_partial_fallback_          = false;
    bool    enable_fast_gen_                  = false;
    bool    warm_up_                          = false;
    bool    warm_up_with_loss_                = false;
    int64_t fast_gen_max_context_len_         = 0;
    bool    reverse_e_h_norm_                 = false;
    bool    use_expert_attention_             = false; // true for CogVLM2, false for other models

    std::string nccl_ip_          = "";
    int64_t     tp_nccl_port_     = 0;
    int64_t     dp_tp_nccl_port_  = 0;
    int64_t     ffn_tp_nccl_port_ = 0;
    int64_t     http_port_        = 0;
    int64_t     model_rpc_port_   = 0;
    int64_t     tp_size_          = 1;
    int64_t     tp_rank_          = 0;
    int64_t     ep_size_          = 1;
    int64_t     ep_rank_          = 0;
    int64_t     dp_size_          = 1;
    int64_t     dp_rank_          = 0;
    int64_t     ffn_tp_size_      = 1;
    int64_t     ffn_tp_rank_      = 0;
    bool        enable_sp_        = false;
    int64_t     world_size_       = 1;

    // p-d (prefill-decode) separation
    bool    pd_separation_                  = false;
    bool    use_cache_store_                = false;
    bool    cache_store_rdma_mode_          = true;
    int64_t cache_store_listen_port_        = 0;
    int64_t cache_store_connect_port_       = 0;
    int64_t cache_store_rdma_listen_port_   = 0;
    int64_t cache_store_rdma_connect_port_  = 0;
    int64_t remote_rpc_server_port_         = 0;
    int64_t prefill_retry_times_            = 0;
    int64_t prefill_retry_timeout_ms_       = 0;
    int64_t prefill_max_wait_timeout_ms_    = 0;
    int64_t decode_retry_times_             = 0;
    int64_t decode_retry_timeout_ms_        = 0;
    int64_t decode_polling_kv_cache_step_ms_ = 0;
    bool    decode_use_async_load_cache_    = true;
    int64_t rdma_connect_retry_times_       = 0;
    bool    pd_sep_enable_fallback_         = false;

    std::string load_balance_policy_name_ = "";
    int64_t     sync_status_interval_ms_  = 0;
    int64_t     load_cache_timeout_ms_    = 0;
    int64_t     max_rpc_timeout_ms_       = 0;
    int64_t     worker_port_offset_       = 0;

    std::map<std::string, std::vector<int>> multi_task_prompt_tokens_;

    // 0 for no sep, 1 for server, 2 for client
    int64_t vit_separation_ = 0;

    bool        enable_speculative_decoding_ = false;
    std::string model_name_                  = "";

    // multi-machine deployment
    std::vector<std::string> worker_addrs_;
    std::vector<std::string> worker_grpc_addrs_;
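
    // constructors and helpers; the set*() methods parse the *_str_ fields above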
    GptInitParameter();
    GptInitParameter(int64_t head_num, int64_t size_per_head, int64_t num_layers,
                     int64_t max_seq_len, int64_t vocab_size, int64_t hidden_size);

    void insertMultiTaskPromptTokens(std::string task_id, std::vector<int64_t> tokens_id);
    void setLayerNormType();
    void setNormType();
    void setTaskType(std::string task);
    void setActivationType();
    void setKvCacheDataType();

    bool       isGatedActivation() const;
    RopeConfig getRopeConfig() const;
    bool       isKvCacheQuant() const;

    // true when prefill and decode run fused in one role (no p-d separation)
    bool isPDFusion() const;
    // true for the prefill role under p-d separation
    bool isPrefillRole() const;
    // true for the decode role under p-d separation
    bool isDecodeRole() const;
};
}  // namespace rtp_llm
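
// A minimal usage sketch (illustrative only; the numeric values are hypothetical,
// and the string values reuse the defaults declared above):
//
//   rtp_llm::GptInitParameter params(/*head_num=*/32, /*size_per_head=*/128,
//                                    /*num_layers=*/32, /*max_seq_len=*/4096,
//                                    /*vocab_size=*/32000, /*hidden_size=*/4096);
//   params.layernorm_type_str_ = "pre_layernorm";
//   params.setLayerNormType();               // parse string -> layernorm_type_
//   params.activation_type_str_ = "Gelu";
//   params.setActivationType();              // parse string -> activation_type_
//   params.kv_cache_data_type_str_ = "fp16";
//   params.setKvCacheDataType();             // parse string -> kv_cache_data_type_
//   auto rope = params.getRopeConfig();      // RoPE fields collected into a RopeConfig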