in optimum/gptq/quantizer.py [0:0]
def post_init_model(self, model):
    """
    Post-initialization that requires device information, for example buffer
    initialization on device.

    Disables the Exllama/Exllamav2 backend when the model (or any of its
    modules) is not on CUDA, converts a GPTQ v1 checkpoint to the v2 format
    when `gptqmodel` is available, runs the backend post-init, and optionally
    raises the Exllama v1 max input length.

    Args:
        model (`nn.Module`):
            The input model.

    Returns:
        `nn.Module`: The post-initialized model.
    """
    if self.bits == 4 and not self.disable_exllama:
        # Exllama/Exllamav2 kernels require every module on GPU; fall back when
        # the model is off-CUDA or any module is offloaded to cpu/disk/hpu.
        if get_device(model).type != "cuda" or (
            hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk", "hpu"])
        ):
            # The outer guard already ensures `not self.disable_exllama`, so no
            # further check is needed before warning and disabling.
            logger.warning(
                "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
            )
            self.disable_exllama = True

    # Minimal attribute holder used as a stand-in quantize_config object
    # (presumably consumed by `gptq_post_init` — verify against the backend).
    class StoreAttr(object):
        pass

    if is_gptqmodel_available():
        # gptqmodel operates on the v2 checkpoint format; convert in place.
        model, _ = hf_convert_gptq_v1_to_v2_format(
            model, self.bits, self.quant_linear, self.checkpoint_format, self.meta
        )

    model.quantize_config = StoreAttr()
    model.quantize_config.desc_act = self.desc_act
    model = gptq_post_init(model, use_act_order=self.desc_act)
    if (
        self.desc_act
        and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE)
        and self.max_input_length is not None
    ):
        # Exllama v1 buffers are sized at init; resize them for longer inputs.
        model = exllama_set_max_input_length(model, self.max_input_length)
    return model