# maga_transformer/ops/libth_transformer.pyi

from __future__ import annotations
import torch
import typing
__all__ = ['DeviceExporter', 'DeviceType', 'EmbeddingHandlerOp', 'EngineScheduleInfo', 'EngineTaskInfo', 'EplbMode', 'GptInitParameter', 'LoadBalanceInfo', 'MlaOpsType', 'MultimodalInput', 'QuantAlgo', 'RoleSpecialTokens', 'RtpEmbeddingOp', 'RtpLLMOp', 'SpecialTokens', 'create_linear_softmax_handler', 'get_device']
class DeviceExporter:
    def get_device_id(self) -> int: ...
    def get_device_type(self) -> DeviceType: ...
    def pack_int8_tensor_to_packed_int4(self, weight: torch.Tensor) -> torch.Tensor: ...
    def preprocess_gemm_weight_by_key(self, key: str, weight: torch.Tensor) -> torch.Tensor: ...
    def preprocess_weight_scale(self, weight: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: ...
    def preprocess_weights_for_mixed_gemm(self, weight: torch.Tensor, quant_type: typing.Any, arch: str) -> torch.Tensor: ...
    def symmetric_quantize_last_axis_of_batched_matrix(self, weight: torch.Tensor, quant_type: typing.Any, arch: str) -> list[torch.Tensor]: ...
class DeviceType:
    """
    Members:

      Cpu

      Cuda

      Yitian

      ArmCpu

      ROCm

      Ppu
    """
    ArmCpu: typing.ClassVar[DeviceType]  # value = <DeviceType.ArmCpu: 3>
    Cpu: typing.ClassVar[DeviceType]  # value = <DeviceType.Cpu: 0>
    Cuda: typing.ClassVar[DeviceType]  # value = <DeviceType.Cuda: 1>
    Ppu: typing.ClassVar[DeviceType]  # value = <DeviceType.Ppu: 5>
    ROCm: typing.ClassVar[DeviceType]  # value = <DeviceType.ROCm: 4>
    Yitian: typing.ClassVar[DeviceType]  # value = <DeviceType.Yitian: 2>
    __members__: typing.ClassVar[dict[str, DeviceType]]  # value = {'Cpu': <DeviceType.Cpu: 0>, 'Cuda': <DeviceType.Cuda: 1>, 'Yitian': <DeviceType.Yitian: 2>, 'ArmCpu': <DeviceType.ArmCpu: 3>, 'ROCm': <DeviceType.ROCm: 4>, 'Ppu': <DeviceType.Ppu: 5>}
    def __eq__(self, other: typing.Any) -> bool: ...
    def __getstate__(self) -> int: ...
    def __hash__(self) -> int: ...
    def __index__(self) -> int: ...
    def __init__(self, value: int) -> None: ...
    def __int__(self) -> int: ...
    def __ne__(self, other: typing.Any) -> bool: ...
    def __repr__(self) -> str: ...
    def __setstate__(self, state: int) -> None: ...
    def __str__(self) -> str: ...
    @property
    def name(self) -> str: ...
    @property
    def value(self) -> int: ...
class EmbeddingHandlerOp:
    def __init__(self) -> None: ...
    def forward(self, hidden_states: torch.Tensor, input_lengths: torch.Tensor) -> torch.Tensor: ...
    def load_tensor(self, weights: dict[str, torch.Tensor]) -> None: ...
class EngineScheduleInfo:
    finished_task_info_list: list[EngineTaskInfo]
    last_schedule_delta: int
    running_task_info_list: list[EngineTaskInfo]
    def __init__(self) -> None: ...
class EngineTaskInfo:
    input_length: int
    prefix_length: int
    request_id: int
    def __init__(self) -> None: ...
class EplbMode:
    """
    Members:

      NONE

      STATS

      EPLB

      ALL
    """
    ALL: typing.ClassVar[EplbMode]  # value = <EplbMode.ALL: 3>
    EPLB: typing.ClassVar[EplbMode]  # value = <EplbMode.EPLB: 2>
    NONE: typing.ClassVar[EplbMode]  # value = <EplbMode.NONE: 0>
    STATS: typing.ClassVar[EplbMode]  # value = <EplbMode.STATS: 1>
    __members__: typing.ClassVar[dict[str, EplbMode]]  # value = {'NONE': <EplbMode.NONE: 0>, 'STATS': <EplbMode.STATS: 1>, 'EPLB': <EplbMode.EPLB: 2>, 'ALL': <EplbMode.ALL: 3>}
    def __eq__(self, other: typing.Any) -> bool: ...
    def __getstate__(self) -> int: ...
    def __hash__(self) -> int: ...
    def __index__(self) -> int: ...
    def __init__(self, value: int) -> None: ...
    def __int__(self) -> int: ...
    def __ne__(self, other: typing.Any) -> bool: ...
    def __repr__(self) -> str: ...
    def __setstate__(self, state: int) -> None: ...
    def __str__(self) -> str: ...
    @property
    def name(self) -> str: ...
    @property
    def value(self) -> int: ...
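# Usage sketch (not part of the generated stub; kept in comments so the stub
# stays valid): the DeviceExporter returned by get_device() is typically used
# to pre-process checkpoint weights before loading. The import path below is
# assumed from this file's location; `int8_weight` is a placeholder tensor.
#
#     import torch
#     from maga_transformer.ops import libth_transformer as lt
#
#     device = lt.get_device()
#     if device.get_device_type() == lt.DeviceType.Cuda:
#         int8_weight = torch.randint(-128, 127, (1024, 1024), dtype=torch.int8)
#         packed = device.pack_int8_tensor_to_packed_int4(int8_weight)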
class GptInitParameter:
    activation_type: str
    add_bias_linear: bool
    block_nums: int
    cache_store_connect_port: int
    cache_store_listen_port: int
    cache_store_rdma_connect_port: int
    cache_store_rdma_listen_port: int
    cache_store_rdma_mode: bool
    ckpt_path: str
    cross_attn_input_len: int
    data_type: str
    decode_polling_kv_cache_step_ms: int
    decode_retry_timeout_ms: int
    decode_retry_times: int
    decode_use_async_load_cache: bool
    deepseek_mscale_all_dim: float
    deepseek_rope_mscale: float
    dp_rank: int
    dp_size: int
    dp_tp_nccl_port: int
    embedding_size: int
    enable_eplb: bool
    enable_fast_gen: bool
    enable_partial_fallback: bool
    enable_sp: bool
    enable_speculative_decoding: bool
    ep_rank: int
    ep_size: int
    eplb_mode: EplbMode
    eplb_update_time: int
    expert_num: int
    fast_gen_max_context_len: int
    ffn_tp_nccl_port: int
    ffn_tp_rank: int
    ffn_tp_size: int
    gen_num_per_circle: int
    has_lm_head: bool
    has_moe_norm: bool
    has_positional_encoding: bool
    has_post_decoder_layernorm: bool
    has_pre_decoder_layernorm: bool
    head_num: int
    head_num_kv: int
    hidden_size: int
    http_port: int
    include_sep_tokens: bool
    input_embedding_scalar: float
    input_vocab_size: int
    inter_padding_size: int
    inter_size: int
    is_causal: bool
    is_multimodal: bool
    is_sparse_head: bool
    kv_cache_data_type: str
    kv_cache_mem_mb: int
    kv_lora_rank: int
    layer_head_num: list[int]
    layer_head_num_kv: list[int]
    layer_inter_padding_size: list[int]
    layer_inter_size: list[int]
    layer_num: int
    layernorm_eps: float
    layernorm_type: str
    load_balance_policy_name: str
    load_cache_timeout_ms: int
    local_rank: int
    logit_scale: float
    max_context_batch_size: int
    max_generate_batch_size: int
    max_rpc_timeout_ms: int
    max_seq_len: int
    mla_ops_type: MlaOpsType
    mm_position_ids_style: int
    mm_sep_tokens: list[list[int]]
    model_name: str
    model_rpc_port: int
    moe_inter_padding_size: int
    moe_k: int
    moe_layer_index: list[int]
    moe_n_group: int
    moe_normalize_expert_scale: bool
    moe_style: int
    moe_topk_group: int
    mrope_section: list[int]
    nccl_ip: str
    nope_head_dim: int
    norm_type: str
    num_layers: int
    num_valid_layer: int
    org_embedding_max_pos: int
    pd_sep_enable_fallback: bool
    pd_separation: bool
    phy_exp_num: int
    position_id_len_factor: int
    position_ids_style: int
    pre_allocate_op_mem: bool
    pre_seq_len: int
    prefill_max_wait_timeout_ms: int
    prefill_retry_timeout_ms: int
    prefill_retry_times: int
    prefix_projection: bool
    py_eplb: typing.Any
    q_lora_rank: int
    q_scaling: float
    qk_norm: bool
    quant_algo: QuantAlgo
    rdma_connect_retry_times: int
    remote_rpc_server_port: int
    reserve_runtime_mem_mb: int
    residual_scalar: float
    reuse_cache: bool
    reverse_e_h_norm: bool
    rope_head_dim: int
    rotary_embedding_base: float
    rotary_embedding_dim: int
    rotary_embedding_mscale: float
    rotary_embedding_offset: int
    rotary_embedding_scale: float
    rotary_embedding_style: int
    rotary_factor1: float
    rotary_factor2: float
    scheduler_reserve_resource_ratio: int
    scoring_func: int
    seq_size_per_block: int
    size_per_head: int
    softmax_extra_scale: float
    special_tokens: SpecialTokens
    tokenizer_path: str
    tp_nccl_port: int
    tp_rank: int
    tp_size: int
    type_vocab_size: int
    use_attention_linear_bias: bool
    use_cache_store: bool
    use_cross_attn: bool
    use_expert_attention: bool
    use_fp32_to_compute_logit: bool
    use_kvcache: bool
    use_logn_attn: bool
    use_mla: bool
    use_norm_attn_out_residual: bool
    use_norm_input_residual: bool
    using_hf_sampling: bool
    v_head_dim: int
    vit_separation: int
    vocab_size: int
    warm_up: bool
    warm_up_with_loss: bool
    worker_addrs: list[str]
    worker_grpc_addrs: list[str]
    worker_port_offset: int
    world_size: int
    def __init__(self, head_num: int, size_per_head: int, num_layers: int, max_seq_len: int, vocab_size: int, hidden_size: int) -> None: ...
    def insertMultiTaskPromptTokens(self, task_id: str, tokens_id: list[int]) -> None: ...
    def isGatedActivation(self) -> bool: ...
    def isKvCacheQuant(self) -> bool: ...
    def setActivationType(self) -> None: ...
    def setKvCacheDataType(self) -> None: ...
    def setLayerNormType(self) -> None: ...
    def setNormType(self) -> None: ...
    def setTaskType(self, task: str) -> None: ...
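# Usage sketch (assumed from the signatures above, not authoritative): the
# constructor takes only the core model shape; every other field is set as a
# plain attribute afterwards. All values below are illustrative placeholders.
#
#     params = GptInitParameter(
#         32,     # head_num
#         128,    # size_per_head
#         32,     # num_layers
#         4096,   # max_seq_len
#         32000,  # vocab_size
#         4096,   # hidden_size
#     )
#     params.tp_size = 1
#     params.use_mla = False
#     # quant_method/bits/group_size are illustrative, not a documented preset:
#     params.quant_algo.setQuantAlgo("gptq", 4, 128)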
class LoadBalanceInfo:
    available_kv_cache: int
    iterate_count: int
    onflight_requests: int
    step_latency_us: int
    step_per_minute: int
    total_kv_cache: int
    def __init__(self) -> None: ...
class MlaOpsType:
    """
    Members:

      AUTO

      MHA

      FLASH_INFER

      FLASH_MLA
    """
    AUTO: typing.ClassVar[MlaOpsType]  # value = <MlaOpsType.AUTO: 0>
    FLASH_INFER: typing.ClassVar[MlaOpsType]  # value = <MlaOpsType.FLASH_INFER: 2>
    FLASH_MLA: typing.ClassVar[MlaOpsType]  # value = <MlaOpsType.FLASH_MLA: 3>
    MHA: typing.ClassVar[MlaOpsType]  # value = <MlaOpsType.MHA: 1>
    __members__: typing.ClassVar[dict[str, MlaOpsType]]  # value = {'AUTO': <MlaOpsType.AUTO: 0>, 'MHA': <MlaOpsType.MHA: 1>, 'FLASH_INFER': <MlaOpsType.FLASH_INFER: 2>, 'FLASH_MLA': <MlaOpsType.FLASH_MLA: 3>}
    def __eq__(self, other: typing.Any) -> bool: ...
    def __getstate__(self) -> int: ...
    def __hash__(self) -> int: ...
    def __index__(self) -> int: ...
    def __init__(self, value: int) -> None: ...
    def __int__(self) -> int: ...
    def __ne__(self, other: typing.Any) -> bool: ...
    def __repr__(self) -> str: ...
    def __setstate__(self, state: int) -> None: ...
    def __str__(self) -> str: ...
    @property
    def name(self) -> str: ...
    @property
    def value(self) -> int: ...
class MultimodalInput:
    mm_type: int
    tensor: torch.Tensor
    url: str
    def __init__(self, url: str, tensor: torch.Tensor, mm_type: int) -> None: ...
class QuantAlgo:
    def __getstate__(self) -> tuple: ...
    def __init__(self) -> None: ...
    def __setstate__(self, arg0: tuple) -> None: ...
    def getActivationBits(self) -> int: ...
    def getGroupSize(self) -> int: ...
    def getWeightBits(self) -> int: ...
    def isAwq(self) -> bool: ...
    def isFp8(self) -> bool: ...
    def isGptq(self) -> bool: ...
    def isGroupwise(self) -> bool: ...
    def isOmniQuant(self) -> bool: ...
    def isPerTensorQuant(self) -> bool: ...
    def isQuant(self) -> bool: ...
    def isSmoothQuant(self) -> bool: ...
    def isWeightOnlyPerCol(self) -> bool: ...
    def setQuantAlgo(self, quant_method: str, bits: int, group_size: int) -> None: ...
class RoleSpecialTokens:
    eos_token_ids: list[int]
    token_ids: list[int]
    def __init__(self) -> None: ...
class RtpEmbeddingOp:
    def __init__(self) -> None: ...
    def decode(self, token_ids: torch.Tensor, token_type_ids: torch.Tensor, input_lengths: torch.Tensor, request_id: int, multimodal_inputs: list[MultimodalInput]) -> typing.Any: ...
    def init(self, model: typing.Any, mm_process_engine: typing.Any) -> None: ...
    def stop(self) -> None: ...
class RtpLLMOp:
    def __init__(self) -> None: ...
    def add_lora(self, adapter_name: str, lora_a_weights: typing.Any, lora_b_weights: typing.Any) -> None: ...
    def get_engine_schedule_info(self) -> EngineScheduleInfo: ...
    def get_load_balance_info(self) -> LoadBalanceInfo: ...
    def init(self, model: typing.Any, mm_process_engine: typing.Any, propose_model: typing.Any, token_processor: typing.Any) -> None: ...
    def ready(self) -> bool: ...
    def remove_lora(self, adapter_name: str) -> None: ...
    def start_http_server(self, model_weights_loader: typing.Any, lora_infos: typing.Any, gang_info: typing.Any, tokenizer: typing.Any, render: typing.Any) -> None: ...
    def stop(self) -> None: ...
    def update_scheduler_info(self, arg0: str) -> None: ...
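# Lifecycle sketch (assumed; `model` and `token_processor` are placeholders
# for objects constructed elsewhere in the repo): init the op, wait until it
# reports ready, poll load/schedule state, then stop on shutdown.
#
#     op = RtpLLMOp()
#     op.init(model, None, None, token_processor)
#     while not op.ready():
#         ...  # wait / poll
#     load = op.get_load_balance_info()      # e.g. load.onflight_requests
#     sched = op.get_engine_schedule_info()  # e.g. sched.running_task_info_list
#     op.stop()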
class SpecialTokens:
    assistant: RoleSpecialTokens
    bos_token_id: int
    decoder_start_token_id: int
    eos_token_id: int
    pad_token_id: int
    stop_words_id_list: list[list[int]]
    stop_words_str_list: list[str]
    system: RoleSpecialTokens
    user: RoleSpecialTokens
    def __init__(self) -> None: ...
def create_linear_softmax_handler(gpt_init_params: GptInitParameter) -> EmbeddingHandlerOp: ...
def get_device() -> DeviceExporter: ...
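# Usage sketch (assumed): the handler created from a GptInitParameter follows
# EmbeddingHandlerOp's interface above: load weights once, then run forward on
# hidden states with per-sequence lengths. `params`, `weights`, `hidden_states`
# and `input_lengths` are placeholders.
#
#     handler = create_linear_softmax_handler(params)
#     handler.load_tensor(weights)  # dict[str, torch.Tensor]
#     out = handler.forward(hidden_states, input_lengths)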