in optimum/quanto/tensor/weights/tinygemm/packed.py [0:0]
def unpack(self):
"""Unpack the packed tensor to a torch.Tensor
Packing is device specific and implemented in undocumented dedicated kernels
that are synchronized with the corresponding matrix multiplication operation.
Instead of implementing a dedicated unpacking code, we pass an identity matrix
to the mm operation with identity scale and shifts to produce the unpacked uint8 weights.
Returns:
An unpacked uint8 `torch.Tensor` expanded along the second dimension.
"""
    out_features, in_features = self.size()
    # We need to pass a group_size to the mm and format the scale and shift accordingly,
    # although it does not modify the calculation since we use identity scales and shifts.
    # We arbitrarily choose the smallest group_size to be sure it divides in_features.
    group_size = 32
    scale_and_shift_shape = (in_features // group_size, out_features, 2)
    # Initialize identity scale
    id_scale_and_shift = torch.ones(scale_and_shift_shape, dtype=torch.bfloat16, device=self.device)
    # Set the shift to the mid-point, i.e. 2 ** (bits - 1)
    id_scale_and_shift[:, :, 1] = 8
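    # With unit scales and a mid-point shift, the kernel's dequantization of a
    # 4-bit code q, conventionally (q - 2 ** (bits - 1)) * scale + shift, reduces
    # to q itself, so the mm below reproduces the raw codes (the exact convention
    # is an assumption stated here for illustration, not taken from this file)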
    identity = torch.eye(in_features, dtype=torch.bfloat16, device=self.device)
    # The CPU and accelerator int4 mm kernels have distinct entry points
    if self._data.device.type == "cpu":
        unpacked_data = torch._weight_int4pack_mm_for_cpu(identity, self._data, group_size, id_scale_and_shift)
    else:
        unpacked_data = torch._weight_int4pack_mm(identity, self._data, group_size, id_scale_and_shift)
    # The mm output has shape (in_features, out_features): transpose it back and cast to uint8
    return unpacked_data.t().to(torch.uint8)
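
Why passing an identity matrix works: once the scales and shifts are neutralized, the mm reduces to multiplying the identity by the (transposed) dequantized weights, which returns them unchanged. A minimal plain-float analogue of that step (an illustration, not the kernel itself):

import torch

w = torch.randn(4, 8)  # stand-in for the dequantized weights
identity = torch.eye(8)
# Multiplying by the identity yields a transposed copy of the weights
assert torch.allclose(identity @ w.t(), w.t())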
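
A hedged round-trip sketch, assuming the surrounding class is TinyGemmPackedTensor with a matching pack classmethod taking uint8 codes (names inferred from the file path, not verified here):

import torch

from optimum.quanto.tensor.weights.tinygemm.packed import TinyGemmPackedTensor  # assumed import path

# Random 4-bit codes stored as uint8; in_features = 128 is divisible by group_size = 32
weights = torch.randint(0, 16, (128, 128), dtype=torch.uint8)
packed = TinyGemmPackedTensor.pack(weights)  # assumed classmethod
assert torch.equal(packed.unpack(), weights)  # unpack recovers the raw codes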