llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py
def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig:
    weights_config = kwargs["weights_config"]
    attn_implementation = kwargs["attn_implementation"]

    assert (
        weights_config in self.weights_configs
    ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue"

    torch_dtype = self.weights_configs[weights_config]["torch_dtype"]
    quant_scheme = self.weights_configs[weights_config]["quant_scheme"]
    quant_config = self.weights_configs[weights_config]["quant_config"]

    # Run the benchmark in a separate process with device isolation,
    # killing it if the isolation is violated.
    launcher_config = ProcessConfig(
        device_isolation=True, device_isolation_action="kill"
    )

    # Inference scenario: track memory, energy and latency,
    # with 10 warmup runs and 10 iterations over a 10-second window.
    scenario_config = InferenceConfig(
        memory=True,
        energy=True,
        latency=True,
        duration=10,
        iterations=10,
        warmup_runs=10,
        input_shapes=INPUT_SHAPES,
        generate_kwargs=GENERATE_KWARGS,
    )

    # PyTorch backend on CUDA device 0; no_weights=True benchmarks with
    # randomly initialized weights instead of downloading a checkpoint.
    backend_config = PyTorchConfig(
        model=model,
        device="cuda",
        device_ids="0",
        no_weights=True,
        library="transformers",
        task="text-generation",
        torch_dtype=torch_dtype,
        quantization_scheme=quant_scheme,
        quantization_config=quant_config,
        attn_implementation=attn_implementation,
        model_kwargs={"trust_remote_code": True},
    )

    return BenchmarkConfig(
        name=f"{weights_config}-{attn_implementation}",
        scenario=scenario_config,
        launcher=launcher_config,
        backend=backend_config,
    )
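
A minimal sketch of how the returned config could be consumed, assuming optimum-benchmark's Benchmark.launch API; the runner class name, model id, weights config key and attention implementation below are illustrative assumptions, not values taken from this file.

# Sketch only: CUDAPyTorchBenchmarkRunner is a hypothetical runner exposing
# get_benchmark_config; "float16" must be a key of its weights_configs.
from optimum_benchmark import Benchmark

runner = CUDAPyTorchBenchmarkRunner()
benchmark_config = runner.get_benchmark_config(
    model="meta-llama/Llama-2-7b-hf",         # any text-generation model id
    weights_config="float16",                 # assumed weights config key
    attn_implementation="flash_attention_2",  # e.g. "eager", "sdpa", "flash_attention_2"
)
benchmark_report = Benchmark.launch(benchmark_config)  # runs launcher + scenario + backend
benchmark_report.save_json("benchmark_report.json")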