# optimum/quanto/library/qbytes_mm.py
def qbytes_mm_impl_cpu(activations: torch.Tensor, weights: torch.Tensor, output_scales: torch.Tensor) -> torch.Tensor:
    """Dispatch a quantized-bytes matmul to the fastest CPU kernel available.

    Picks, in order of preference:
    1. the int8 x int8 kernel (torch >= 2.6 only — FIXME: accuracy issues with 2.4.x),
    2. the int8-pack kernel for bfloat16 activations whose last dim is a multiple of 4,
    3. the generic fallback kernel.
    """
    torch_release = version.parse(torch.__version__).release
    both_int8 = activations.dtype == torch.int8 and weights.dtype == torch.int8
    if both_int8 and torch_release >= version.parse("2.6.0").release:
        return qbytes_int_mm(activations, weights, output_scales)

    if (
        activations.dtype == torch.bfloat16
        and weights.dtype == torch.int8
        and activations.shape[-1] % 4 == 0
    ):
        # Quantized tensor subclasses must be materialized to a plain
        # torch.Tensor before the int8pack kernel can consume them.
        if type(activations) is not torch.Tensor:
            activations = activations.dequantize()
        return qbytes_int8pack_mm(activations, weights, output_scales)

    return qbytes_mm(activations, weights, output_scales)