# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast

import torch
from torch.utils import _pytree as pytree


__all__ = ["PackedTensor"]


def pack_weights(intweights: torch.Tensor, bits: int) -> torch.Tensor:
    """
    Pack int4 / int2 weights into a uint8 tensor.

    What does packing mean? Assume we have 4 values that fit in 2 bits but are each
    encoded in 8 bits (because torch has no native support for 2-bit datatypes):

    > 0000 0011 | 0000 0010 | 0000 0001 | 0000 0000

    We can pack them into a single 8-bit uint value:

    > 1110 0100

    Therefore, instead of storing 4 values in 8-bit precision, we store a single
    8-bit value, saving 24 bits in total.

    Args:
        intweights (`torch.Tensor`):
            The unpacked `torch.uint8` tensor
        bits (`int`):
            The number of bits per value; can be 2 or 4
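
    Example (a minimal sketch; packing the values 0, 1, 2 and 3 in 2-bit mode
    reproduces the `1110 0100` byte shown above, i.e. 228):

        >>> pack_weights(torch.tensor([0, 1, 2, 3], dtype=torch.uint8), bits=2)
        tensor([228], dtype=torch.uint8)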
    """
    original_shape = intweights.shape
    values_per_item = 8 // bits
    row_dim = (original_shape[0] + values_per_item - 1) // values_per_item

    if len(original_shape) == 1:
        packed_tensor_shape = (row_dim,)
    else:
        packed_tensor_shape = (row_dim, *original_shape[1:])

    packed = torch.zeros(packed_tensor_shape, device=intweights.device, dtype=torch.uint8)
    unpacked = intweights.to(torch.uint8)

    def lshift(t: torch.Tensor, bits: int):
        if t.device.type == "mps":
            # lshift is not supported on MPS device
            return t * (2**bits)
        return t << bits

    # Number of row groups to interleave; the last group may be shorter than row_dim
    it = min(values_per_item, (original_shape[0] // row_dim) + 1)
    for i in range(it):
        start = i * row_dim
        end = min(start + row_dim, original_shape[0])
        # OR group i into the bit-field [bits * i, bits * (i + 1)) of each packed byte
        packed[: (end - start)] |= lshift(unpacked[start:end], bits * i)

    return packed
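

# A pure-Python inverse of `pack_weights`, kept here as a readable reference
# sketch (a hypothetical helper, not part of the public API). The compiled
# `torch.ops.quanto.unpack` op used by `PackedTensor.unpack` is expected to be
# equivalent, modulo the truncation of the padded rows performed by the caller.
def _unpack_weights_reference(packed: torch.Tensor, bits: int) -> torch.Tensor:
    values_per_item = 8 // bits
    mask = (1 << bits) - 1
    # Extract each bit-field and stack the groups back along the first dimension
    return torch.cat([(packed >> (bits * i)) & mask for i in range(values_per_item)], dim=0)
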


class PackedTensor(torch.Tensor):
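    """
    A `torch.Tensor` subclass storing int2 / int4 values packed into a uint8 buffer.

    The public size and stride are those of the original unpacked tensor, while the
    packed storage lives in `self._data`.

    A minimal usage sketch (assuming the quanto extension registering
    `torch.ops.quanto.unpack` has been loaded):

        >>> t = torch.randint(0, 16, (16, 8), dtype=torch.uint8)
        >>> packed = PackedTensor.pack(t, bits=4)
        >>> torch.equal(packed.unpack(), t)
        True
    """
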
    @staticmethod
    def __new__(cls, data, bits, size, stride, requires_grad=False):
        # PackedTensor represents uint8 data and can therefore NEVER require gradient
        assert data.dtype == torch.uint8
        assert requires_grad is False
        return torch.Tensor._make_wrapper_subclass(
            cls, size, strides=stride, dtype=torch.uint8, device=data.device, requires_grad=requires_grad
        )

    def __init__(self, data, bits, size, stride, requires_grad=False):
        self._bits = bits
        self._data = data

    def __repr__(self):
        autograd_info = (
            f", grad_fn={self.grad_fn}" if self.grad_fn else ", requires_grad=True" if self.requires_grad else ""
        )
        return f"PackedTensor({self._data}, bits={self._bits}, public_dtype={self.dtype}{autograd_info})"

    @classmethod
    def pack(cls, t, bits=4):
        assert bits in (2, 4)
        assert t.dtype == torch.uint8
        data = pack_weights(t, bits)
        # We need to store size and stride to make sure the unpacked data has the correct shape
        return PackedTensor(data, bits, t.size(), t.stride())

    def unpack(self):
        unpacked_data = torch.ops.quanto.unpack(self._data, self._bits)
        # Adjust the first dimension, as unpacked data may have extra rows if the original shape is not a multiple of 8 // bits
        return unpacked_data[: self.shape[0]]

    @property
    def bits(self):
        return self._bits

    @property
    def dtype(self):
        return torch.uint8

    @staticmethod
    def load_from_state_dict(state_dict, prefix, bits, size, stride, missing_keys):
        if prefix + "_data" not in state_dict:
            missing_keys.append(prefix + "_data")
            return

        inner_tensors_dict = {"_data": state_dict.pop(prefix + "_data")}
        # Rebuild the AST-compatible metadata expected by __tensor_unflatten__
        meta = {"bits": str(bits), "size": str(list(size)), "stride": str(stride)}
        return PackedTensor.__tensor_unflatten__(inner_tensors_dict, meta, None, None)

    def __tensor_flatten__(self):
        inner_tensors = ["_data"]
        # Since meta can be used for serialization, use only AST compatible strings
        meta = {"bits": str(self._bits), "size": str(list(self.size())), "stride": str(self.stride())}
        return inner_tensors, meta

    @staticmethod
    def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
        assert len(inner_tensors) == 1
        assert len(meta) == 3
        data = inner_tensors["_data"]
        # Meta should contain only AST compatible strings
        bits = ast.literal_eval(meta["bits"])
        size = ast.literal_eval(meta["size"])
        stride = ast.literal_eval(meta["stride"])
        return PackedTensor(data, bits, size, stride)

    __torch_function__ = torch._C._disabled_torch_function_impl

    @classmethod
    def __torch_dispatch__(cls, op, types, args, kwargs=None):
        # Unpack back to a plain tensor before dispatching any op, except detach and
        # device moves, which can operate directly on the packed data
        if op.overloadpacket is torch.ops.aten.detach:
            t = args[0]
            data = op(t._data)
            return PackedTensor(data, t._bits, t.size(), t.stride())
        elif op.overloadpacket in (torch.ops.aten._to_copy, torch.ops.aten.to):
            t = args[0]
            dtype = kwargs.get("dtype", torch.uint8)
            if dtype != torch.uint8:
                raise ValueError(f"PackedTensor is torch.uint8 only and cannot be moved to {dtype}.")
            # Move data
            data = op(t._data, **kwargs)
            return PackedTensor(data, t._bits, t.size(), t.stride())
        args, kwargs = pytree.tree_map_only(PackedTensor, lambda x: x.unpack(), (args, kwargs or {}))
        return op(*args, **kwargs)

    def numpy(self):
        return self.unpack().cpu().numpy()
