in optimum/quanto/tensor/weights/awq/packed.py [0:0]
def unpack_v2(packed):
    """Unpack a packed int16 tensor to a larger uint8 tensor

    Applies pack operations in reverse order (see pack_v2 method for details).
    Warning: very slow, to be used for debug only.

    Args:
        packed (`torch.Tensor`):
            The packed `torch.int16` tensor, of shape (N // 4, K) where K must
            be a multiple of 64.

    Returns:
        An unpacked uint8 `torch.Tensor` of shape (N, K) with values in [0, 15],
        expanded along the first dimension, on the same device as `packed`.
    """
    # The whole computation round-trips through CPU/numpy and is moved back to
    # packed.device at the end, so CPU inputs are supported as well.
    assert packed.device.type in ["cuda", "xpu", "cpu"]
    assert packed.ndim == 2
    I = 4  # number of 4-bit values packed into each int16 element
    S = 64  # interleaving stride used by pack_v2
    N_div_I, K = packed.shape
    N = N_div_I * I
    # Reshape (N // I, K) -> (N // I, K // S, S, 1)
    unpacked = packed.reshape(N // I, K // S, S, 1)
    # Convert to uint16 (through numpy because not supported by pytorch).
    # astype between same-width integers is a modular cast, i.e. a bitwise
    # reinterpretation of the int16 values.
    unpacked = unpacked.cpu().numpy().astype(np.uint16)
    # Split each uint16 into its four 4-bit nibbles, lowest first:
    # (N // I, K // S, S, 1) -> (N // I, K // S, S, I)
    unpacked = torch.cat(
        [
            torch.from_numpy(((unpacked >> (4 * i)) & 0xF).astype(np.uint8)).to(packed.device)
            for i in range(I)
        ],
        axis=-1,
    )
    # reshape (N // I, K // S, S, I) -> (N // I, K // S, I, S)
    unpacked = unpacked.reshape(N // I, K // S, I, S)
    # transpose (N // I, K // S, I, S) -> (N // I, I, K // S, S)
    unpacked = unpacked.permute(0, 2, 1, 3)
    # deinterleaving (N // I, I, K // S, S) -> (N, K)
    unpacked = unpacked.reshape(N, K)
    # Final steps to reorder (see packing code for explanation)
    unpacked = unpacked.reshape(N, K // 32, 4, 2, 4).permute(0, 1, 2, 4, 3)
    unpacked = unpacked.permute(0, 1, 3, 2, 4)
    unpacked = unpacked.reshape(N, K)
    return unpacked