optimum/neuron/models/inference/backend/config.py
def __init__(
self,
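# Source checkpoint (Hugging Face Hub model id and revision)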
checkpoint_id: Optional[str] = None,
checkpoint_revision: Optional[str] = None,
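# Batching, speculation and sequence-length limits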
batch_size: Optional[int] = 1,
max_batch_size: Optional[int] = None,
continuous_batching: Optional[bool] = False,
speculation_length: Optional[int] = 0,
sequence_length: Optional[int] = 128,
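# Parallelism degrees: tensor, expert and pipeline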
tp_degree: Optional[int] = 1,
ep_degree: Optional[int] = 1,
pp_degree: Optional[int] = 1,
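# Compute and reduction dtypes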
torch_dtype: Optional[Union[str, torch.dtype]] = torch.bfloat16,
rpl_reduce_dtype: Optional[Union[str, torch.dtype]] = None,
n_active_tokens: Optional[int] = None,
max_context_length: Optional[int] = None,
output_logits: Optional[bool] = False,
padding_side: Optional[str] = "right",
fused_qkv: Optional[bool] = False,
vocab_parallel: Optional[bool] = False,
sequence_parallel_enabled: Optional[bool] = False,
is_chunked_prefill: Optional[bool] = False,
flash_decoding_enabled: Optional[bool] = False,
async_mode: Optional[bool] = False,
qk_layernorm: Optional[bool] = False,
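# Fused Neuron kernel toggles (attention, QKV, MLP)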
attn_kernel_enabled: Optional[bool] = False,
qkv_kernel_enabled: Optional[bool] = False,
mlp_kernel_enabled: Optional[bool] = False,
mlp_kernel_fuse_residual_add: Optional[bool] = False,
enable_bucketing: Optional[bool] = False,
target: Optional[str] = None,  # target platform, e.g. "trn2" for Trainium2
logical_nc_config: Optional[int] = 1,
cc_pipeline_tiling_factor: Optional[int] = 2,
num_cores_per_group: Optional[int] = 1,
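# On-device sampling (token sampling performed on the Neuron device)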
on_device_sampling: Optional[bool] = False,
max_topk: Optional[int] = 256,
start_rank_id: Optional[int] = 0,
local_ranks_size: Optional[int] = None,
capacity_factor: Optional[float] = None,
glu_mlp: bool = True,
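A call might look like the following (a minimal sketch: the enclosing class name `NxDNeuronConfig` and the example values are assumptions, not confirmed by this excerpt, and the parameter list above continues in the source file):

    import torch
    # Class name assumed for illustration; the module path matches the file above.
    from optimum.neuron.models.inference.backend.config import NxDNeuronConfig

    config = NxDNeuronConfig(
        checkpoint_id="meta-llama/Llama-3.1-8B",  # example Hub model id
        batch_size=4,
        sequence_length=4096,
        tp_degree=8,  # shard weights across 8 NeuronCores
        torch_dtype=torch.bfloat16,
    )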