in src/optimum/nvidia/compression/modelopt.py [0:0]
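# Note: a minimal sketch of the module-level imports this method relies on.
# They are not shown in this excerpt; the aliases below follow NVIDIA ModelOpt's
# documented entry points, so treat them as assumptions about the surrounding file.
import torch
import modelopt.torch.quantization as mtq
import modelopt.torch.sparsity as mts
from modelopt.torch.export import export_tensorrt_llm_checkpoint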
def _process_model_after_weight_loading(self, model, **kwargs):
    if "workspace" not in kwargs:
        raise KeyError(
            "workspace not provided but required to generate quantized model representation"
        )

    # Retrieve the workspace where artifacts are being stored
    workspace: "Workspace" = kwargs.pop("workspace")

    with torch.inference_mode():
        # Sparsify the model if requested
        if sconfig := self._recipe.config.sparsity:
            device = model.device
            model = mts.sparsify(
                model,
                sconfig,
                {"data_loader": self._recipe.dataset, "collect_func": lambda x: x},
            )
            model = mts.export(model)
            model.to(device)

        # Quantize the model
        qmodel = mtq.quantize(
            model, vars(self._recipe.config.qconfig), forward_loop=self._looper
        )

        # Export to a TensorRT-LLM checkpoint and return
        export_tensorrt_llm_checkpoint(
            qmodel,
            decoder_type=model.config.model_type,
            dtype=model.dtype,
            export_dir=workspace.checkpoints_path,
            inference_tensor_parallel=1,
            inference_pipeline_parallel=1,
            use_nfs_workspace=False,
            naive_fp8_quantization=False,
        )

        return qmodel
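# For illustration only: a minimal standalone sketch of the same quantize-then-export
# flow outside the quantizer class. The model choice ("gpt2"), the one-batch calibration
# loop, and the FP8 default config standing in for the recipe-provided qconfig are all
# assumptions, not part of this file.
import torch
import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_tensorrt_llm_checkpoint
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "gpt2"  # assumed to map to a decoder type supported by the TRT-LLM export
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).cuda()

def forward_loop(m):
    # Run calibration data through the model so ModelOpt can collect activation
    # statistics; a real recipe would iterate over a calibration dataset here.
    batch = tokenizer("calibration text for amax collection", return_tensors="pt").to("cuda")
    with torch.inference_mode():
        m(**batch)

# FP8 quantization with ModelOpt's default settings (requires a GPU with FP8 support)
qmodel = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop=forward_loop)

# Write a TensorRT-LLM checkpoint that can later be built into an engine
export_tensorrt_llm_checkpoint(
    qmodel,
    decoder_type=model.config.model_type,
    dtype=model.dtype,
    export_dir="./trtllm-checkpoint",
    inference_tensor_parallel=1,
    inference_pipeline_parallel=1,
)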