# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ast
from typing import Optional

import torch
from packaging import version
from torch.autograd import Function

from ...library import is_extension_available
from ..function import QuantizedLinearFunction
from ..grouped import grouped_shape
from ..packed import PackedTensor
from ..qbits import QBitsTensor
from ..qtensor import qfallback
from ..qtype import qint2, qint4, qtype, qtypes


__all__ = ["WeightQBitsTensor"]


class WeightsQBitsQuantizer(Function):
    @staticmethod
    def forward(
        ctx,
        base: torch.Tensor,
        qtype: qtype,
        axis: int,
        group_size: int,
        scale: torch.Tensor,
        shift: torch.Tensor,
        optimized: bool,
    ):
        if qtype not in (qint2, qint4):
            raise ValueError("WeightQBitsTensor can only be of qint2 or qint4 qtype")
        if axis not in (0, -1):
            raise ValueError("WeightQBitsTensor axis parameter must be 0 (first axis) or -1 (last axis)")
        size = base.size()
        stride = base.stride()
        data = torch.ops.quanto.quantize_affine(
            base, bits=qtype.bits, axis=axis, group_size=group_size, scale=scale, shift=shift
        )
        if optimized:
            return WeightQBitsTensor.create(qtype, axis, group_size, size, stride, data, scale, shift)
        return WeightQBitsTensor(qtype, axis, group_size, size, stride, data, scale, shift)

    @staticmethod
    def backward(ctx, gO):
        # For autograd, quantization is a no-op
        return gO, None, None, None, None, None, None


class WeightQBitsTensor(QBitsTensor):
    @staticmethod
    def create(qtype, axis, group_size, size, stride, data, scale, shift, requires_grad=False):
        """Factory method to create a WeightQBitsTensor

        This selects the most appropriate WeightQBitsTensor based on the configuration.

        Args:
            qtype (`qtype`):
                The target quantization type (either `qint2` or `qint4`).
            axis (`int`):
                The axis that is preserved by quantization (usually zero for linear weights).
            group_size (`int`):
                The group size that further splits the data elements for each index along the quantization axis.
            size (`torch.Size`):
                The Tensor size.
            stride (`tuple`):
                The Tensor stride.
            data (`torch.Tensor`):
                The tensor data, either as a raw uint8 torch.Tensor or as a PackedTensor.
            scale (`torch.Tensor`):
                The floating point scale expressed as a torch.Tensor.
            shift (`torch.Tensor`):
                The shift expressed as a torch.Tensor. It can be either an integer zero-point or a float value.
            requires_grad (`bool`):
                Whether the Tensor must receive a gradient or not.

        Returns:
            a `WeightQBitsTensor` (can be a subclass).
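
        Example (a minimal sketch; `weight`, `data`, `scale` and `shift` are hypothetical tensors
        coming from a prior affine quantization with 128-element groups along the first axis):

            qweight = WeightQBitsTensor.create(
                qint4, axis=0, group_size=128, size=weight.size(), stride=weight.stride(),
                data=data, scale=scale, shift=shift,
            )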
        """
        from .awq import AWQWeightQBitsTensor
        from .tinygemm import TinyGemmWeightQBitsTensor

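        # Prefer the AWQ subclass when the configuration and the CUDA device support it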
        if (
            qtype == qint4
            and size[0] >= 128  # FIXME Workaround AWQ GEMM crash (GEMV might work for short inputs)
            and scale.dtype == torch.float16
            and axis == 0
            and group_size == 128
            and len(size) == 2
            and (data.device.type == "cuda" and torch.version.cuda)
            and torch.cuda.get_device_capability(data.device)[0] >= 8
            and is_extension_available("quanto_cuda")
        ):
            if type(data) is PackedTensor:
                data = data.unpack()
            return AWQWeightQBitsTensor(qtype, axis, group_size, size, stride, data, scale, shift, requires_grad)
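        # Otherwise prefer the TinyGemm subclass on CPU or on recent CUDA configurations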
        if qtype == qint4 and scale.dtype == torch.bfloat16 and axis == 0 and group_size == 128 and len(size) == 2:
            if data.device.type == "cpu" or (
                (data.device.type == "cuda" and torch.version.cuda)
                and version.parse(torch.version.cuda).release >= (12, 1)
                and torch.cuda.get_device_capability(data.device)[0] >= 8
            ):
                if type(data) is PackedTensor:
                    data = data.unpack()
                return TinyGemmWeightQBitsTensor(
                    qtype, axis, group_size, size, stride, data, (scale, shift), requires_grad
                )

        return WeightQBitsTensor(qtype, axis, group_size, size, stride, data, scale, shift, requires_grad)

    @staticmethod
    def __new__(cls, qtype, axis, group_size, size, stride, data, scale, shift, requires_grad=False):
        assert data.device == scale.device
        assert data.device == shift.device
        return torch.Tensor._make_wrapper_subclass(
            cls, size, strides=stride, dtype=scale.dtype, device=data.device, requires_grad=requires_grad
        )

    def __init__(self, qtype, axis, group_size, size, stride, data, scale, shift, requires_grad=False):
        if type(data) is torch.Tensor:
            data = PackedTensor.pack(data, qtype.bits)
        super().__init__(qtype, axis, group_size, size, stride, data, scale, shift)

    @classmethod
    def quantize(
        cls,
        base: torch.Tensor,
        qtype: qtype,
        axis: int,
        group_size: int,
        scale: torch.Tensor,
        shift: torch.Tensor,
        optimized: Optional[bool] = True,
    ):
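        """Quantize a float weight Tensor into a WeightQBitsTensor.

        Quantization is transparent to autograd: the backward pass returns the output gradient
        unchanged for the float weights. When `optimized` is True, the most appropriate
        WeightQBitsTensor subclass for the target device is selected.
        """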
        return WeightsQBitsQuantizer.apply(base, qtype, axis, group_size, scale, shift, optimized)

    @staticmethod
    def load_from_state_dict(state_dict, prefix, qtype, axis, group_size, size, stride, missing_keys):
        if group_size is None:
            data_size = size
            data_stride = stride
        else:
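            # With group-wise quantization, the serialized data is stored in its
            # two-dimensional grouped shape rather than the original weight shape.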
            data_size = grouped_shape(size, axis, group_size)
            assert len(data_size) == 2
            # In row major, inner dimension (stride 1) is the last one
            data_stride = (data_size[1], 1)
        inner_tensors_dict = {
            "_data": PackedTensor.load_from_state_dict(
                state_dict, prefix + "_data.", qtype.bits, data_size, data_stride, missing_keys=missing_keys
            )
        }
        missing = inner_tensors_dict["_data"] is None
        for name in ["_scale", "_shift"]:
            if prefix + name not in state_dict:
                missing_keys.append(prefix + name)
                missing = True
            else:
                inner_tensors_dict[name] = state_dict.pop(prefix + name)

        if missing:  # could not deserialize because of missing keys
            return None

        meta = {
            "qtype": qtype.name,
            "axis": str(axis),
            "group_size": str(group_size),
            "size": str(list(size)),
            "stride": str(list(stride)),
        }
        return WeightQBitsTensor.__tensor_unflatten__(inner_tensors_dict, meta, None, None)

    def optimize(self):
        """Allows to convert an existing WeightQBitsTensor to an optimized subclass

        This is used in particular after reloading a serialized WeightQBitsTensor (which is
        always saved using the kernel-agnostic packing).
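
        A minimal usage sketch (assuming `qweight` is a freshly deserialized WeightQBitsTensor):

            qweight = qweight.optimize()  # may return e.g. an AWQ or TinyGemm subclass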
        """
        if type(self) is not WeightQBitsTensor:
            return self
        data = self._data.unpack()
        # Call dedicated helper to select the best subclass for this device
        return WeightQBitsTensor.create(
            self.qtype,
            self.axis,
            self._group_size,
            self.size(),
            self.stride(),
            data,
            self._scale,
            self._shift,
            self.requires_grad,
        )

    def save_to_state_dict(self, destination, prefix, keep_vars):
        if type(self) is WeightQBitsTensor:
            super().save_to_state_dict(destination, prefix, keep_vars)
        else:
            # Convert the subclass back to a WeightQBitsTensor before serializing
            self.weight_qbits_tensor().save_to_state_dict(destination, prefix, keep_vars)

    def weight_qbits_tensor(self):
        """Convert back a subclass to a WeightQBitsTensor

        This is required to make sure only standard packing is used when serializing.
        """
        raise NotImplementedError

    def __tensor_flatten__(self):
        inner_tensors = ["_data", "_scale", "_shift"]
        # Since meta can be used for serialization, use only strings
        meta = {
            "qtype": self._qtype.name,
            "axis": str(self._axis),
            "group_size": str(self._group_size),
            "size": str(list(self.size())),
            "stride": str(list(self.stride())),
        }
        return inner_tensors, meta

    @staticmethod
    def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
        assert len(inner_tensors) == 3
        assert len(meta) == 5
        data, scale, shift = inner_tensors["_data"], inner_tensors["_scale"], inner_tensors["_shift"]
        # Meta should only contain strings, AST compatible except qtype
        qtype = qtypes[meta["qtype"]]
        axis = ast.literal_eval(meta["axis"])
        group_size = ast.literal_eval(meta["group_size"])
        size = ast.literal_eval(meta["size"])
        stride = ast.literal_eval(meta["stride"])
        return WeightQBitsTensor(qtype, axis, group_size, size, stride, data, scale, shift)

    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        """Dispatch torch functions applied on this subtensor

        This method is called whenever a torch function (such as `torch.nn.functional.linear`)
        is called with at least one parameter corresponding to this subtensor:

        - if a quantized implementation exists for the selected function, it is called,
        - otherwise, the original implementation is called, deactivating further functional dispatch.

        During the execution of the standard torch function, a second level of dispatch will
        happen, but this time directly on individual torch Tensor operations (mainly ATEN).
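
        For instance, calling `torch.nn.functional.linear(input, qweight, bias)` with a float
        `input` and a `WeightQBitsTensor` weight is routed to `QuantizedLinearFunction` instead
        of the native implementation.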
        """
        kwargs = kwargs or {}
        if func is torch.nn.functional.linear:

            def qlinear(input, other, bias=None):
                return QuantizedLinearFunction.apply(input, other, bias)

            return qlinear(*args, **kwargs)
        elif func is torch.equal:
            input, other = args
            return input.equal(other)
        # Defer to operations dispatcher
        with torch._C.DisableTorchFunctionSubclass():
            return func(*args, **kwargs)

    @classmethod
    def __torch_dispatch__(cls, op, types, args, kwargs=None):
        # Do not use the op directly, but rather its overload packet
        op = op.overloadpacket
        if op is torch.ops.aten.detach:
            t = args[0]
            # Detach is required when copying and deserializing
            inner_tensor_names, meta = t.__tensor_flatten__()
            # Detach inner tensors
            detached_tensors = {}
            for inner_name in inner_tensor_names:
                detached_tensors[inner_name] = op(getattr(t, inner_name))
            return cls.__tensor_unflatten__(detached_tensors, meta, t.size(), t.stride())
        elif op in [torch.ops.aten._to_copy, torch.ops.aten.to]:
            t = args[0]
            dtype = kwargs.pop("dtype", t.dtype)
            device = kwargs.pop("device", t.device)
            if dtype is not None and dtype != t.dtype:
                raise ValueError("The dtype of a WeightQBitsTensor cannot be changed")
            if type(t) is not WeightQBitsTensor and t.device.type != device.type:
                # Before moving to another device type, convert back to a WeightQBitsTensor
                t = t.weight_qbits_tensor()
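            # Only the scale is cast to the requested dtype: it defines the dtype of the resulting
            # Tensor (see __new__), while data and shift keep their own dtypes.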
            scale = op(t._scale, dtype=dtype, device=device, **kwargs)
            data = op(t._data, device=device, **kwargs)
            shift = op(t._shift, device=device, **kwargs)
            return WeightQBitsTensor.create(t._qtype, t._axis, t._group_size, t.size(), t.stride(), data, scale, shift)
        # No dispatch available: qfallback
        kwargs = kwargs or {}
        return qfallback(op, *args, **kwargs)
