# optimum/quanto/library/qbytes_mm.py
def qbytes_mm_impl_cpu(activations: torch.Tensor, weights: torch.Tensor, output_scales: torch.Tensor) -> torch.Tensor:
    """Dispatch a quantized-bytes matmul to the fastest CPU kernel available.

    Picks, in order of preference:
    1. the int8 x int8 kernel (torch >= 2.6 only — FIXME: accuracy issues with 2.4.x),
    2. the int8-pack kernel for bfloat16 activations whose last dim is a multiple of 4,
    3. the generic fallback kernel.
    """
    torch_release = version.parse(torch.__version__).release
    both_int8 = activations.dtype == torch.int8 and weights.dtype == torch.int8
    if both_int8 and torch_release >= version.parse("2.6.0").release:
        return qbytes_int_mm(activations, weights, output_scales)

    if (
        activations.dtype == torch.bfloat16
        and weights.dtype == torch.int8
        and activations.shape[-1] % 4 == 0
    ):
        # Quantized tensor subclasses must be materialized to a plain
        # torch.Tensor before the int8pack kernel can consume them.
        if type(activations) is not torch.Tensor:
            activations = activations.dequantize()
        return qbytes_int8pack_mm(activations, weights, output_scales)

    return qbytes_mm(activations, weights, output_scales)