in optimum_benchmark/backends/tensorrt_llm/backend.py
def trtllm_kwargs(self):
    # Build the keyword arguments forwarded to TensorRT-LLM, including only
    # the config values that were explicitly set (non-None), so that unset
    # options fall back to the library's own defaults.
    kwargs = {}

    if self.config.tp is not None:
        kwargs["tp"] = self.config.tp

    if self.config.pp is not None:
        kwargs["pp"] = self.config.pp

    if self.config.dtype is not None:
        kwargs["dtype"] = self.config.dtype

    if self.config.use_fp8 is not None:
        kwargs["use_fp8"] = self.config.use_fp8

    if self.config.world_size is not None:
        kwargs["world_size"] = self.config.world_size

    if self.config.gpus_per_node is not None:
        kwargs["gpus_per_node"] = self.config.gpus_per_node

    if self.config.max_input_len is not None:
        kwargs["max_input_len"] = self.config.max_input_len

    if self.config.max_output_len is not None:
        kwargs["max_output_len"] = self.config.max_output_len

    if self.config.max_batch_size is not None:
        kwargs["max_batch_size"] = self.config.max_batch_size

    if self.config.max_new_tokens is not None:
        kwargs["max_new_tokens"] = self.config.max_new_tokens

    if self.config.max_prompt_length is not None:
        kwargs["max_prompt_length"] = self.config.max_prompt_length

    if self.config.optimization_level is not None:
        kwargs["optimization_level"] = self.config.optimization_level

    if self.config.use_cuda_graph is not None:
        kwargs["use_cuda_graph"] = self.config.use_cuda_graph

    return kwargs
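
# A minimal sketch of the same None-filtering pattern, assuming a
# dataclass-style config. TRTLLMConfig and trtllm_kwargs_from below are
# hypothetical stand-ins for illustration, not the library's actual API;
# only a few representative fields are shown.
from dataclasses import dataclass, fields
from typing import Optional

@dataclass
class TRTLLMConfig:
    tp: Optional[int] = None
    pp: Optional[int] = None
    dtype: Optional[str] = None
    max_batch_size: Optional[int] = None

def trtllm_kwargs_from(config: TRTLLMConfig) -> dict:
    # Keep only the fields the user explicitly set; None means
    # "defer to TensorRT-LLM's default".
    return {
        f.name: getattr(config, f.name)
        for f in fields(config)
        if getattr(config, f.name) is not None
    }

# Usage: only explicitly-set values survive the filtering.
# trtllm_kwargs_from(TRTLLMConfig(tp=2, dtype="float16"))
# -> {'tp': 2, 'dtype': 'float16'}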