# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/aws-neuron/neuronx-distributed-inference/blob/9993358ce052fd7a1bb4a7497a6318aac36ed95c/src/neuronx_distributed_inference/modules/generation/sampling.py
import logging
from typing import Optional, Union

import torch
from neuronx_distributed.operators.argmax import argmax as nxd_argmax
from neuronx_distributed.operators.topk import topk as nxd_topk
from neuronx_distributed.parallel_layers import parallel_state
from neuronx_distributed.utils.utils import hardware
from neuronxcc.nki._private_kernels.cumsum import cumsum as nki_cumsum
from torch_neuronx.utils import get_platform_target
from torch_neuronx.xla_impl.ops import nki_jit, xla_hlo_call

from ...config import NxDNeuronConfig


logger = logging.getLogger("Neuron")


def mask_padded_logits(logits, rank_id, world_size, pad_size=None):
    if pad_size is None or pad_size == 0:
        return logits

    # invalid if rank_id == world_size - 1
    last_rank_mask = torch.eq(
        torch.full(logits.shape, world_size - 1, device=logits.device, dtype=torch.int32),
        rank_id.broadcast_to(logits.shape),
    )
    #   and index >= logits.shape[-1] - pad_size
    on_pad_mask = torch.ge(
        torch.arange(logits.shape[-1], device=logits.device, dtype=torch.int32).broadcast_to(logits.shape),
        torch.full(logits.shape, logits.shape[-1] - pad_size, device=logits.device, dtype=torch.int32),
    )
    invalid_mask = last_rank_mask * on_pad_mask
    logits = torch.where(invalid_mask, torch.full_like(logits, torch.finfo(logits.dtype).min), logits)

    return logits


def cumsum(tensor_in, dim, on_cpu: bool = False):
    if on_cpu:
        logger.debug("On CPU, using torch cumsum")
        return torch.cumsum(tensor_in, dim=dim)
    init_shape_len = len(tensor_in.shape)
    cumsum_dim = dim % init_shape_len
    last_dim = init_shape_len - 1
    is_transposed = False
    if cumsum_dim != last_dim:
        tensor_in = torch.transpose(tensor_in, cumsum_dim, last_dim)
        is_transposed = True
    init_shape = tensor_in.shape
    cumsum_len = init_shape[last_dim]
    # Prefer the NKI kernel for floating-point inputs, otherwise fall back to a matmul-based cumsum
    if torch.is_floating_point(tensor_in):
        logger.debug("Using NKI cumsum")
        tensor_in = tensor_in.view(-1, cumsum_len)
        nki_cumsum_func = nki_jit()(nki_cumsum)
        output = torch.zeros_like(tensor_in, device=tensor_in.device, dtype=tensor_in.dtype)
        nki_cumsum_func(tensor_in, output, axis=1)
        output = output.view(init_shape)
        if is_transposed:
            output = torch.transpose(output, cumsum_dim, last_dim)
        return output
    else:
        logger.debug("Using matmul cumsum")
        triu = torch.triu(
            torch.ones(
                cumsum_len,
                cumsum_len,
                dtype=tensor_in.dtype,
                device=tensor_in.device,
            )
        )
        output = tensor_in @ triu
        if is_transposed:
            output = torch.transpose(output, cumsum_dim, last_dim)
        return output


@xla_hlo_call
def rand_like(tensor):
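    """Return a tensor with the same shape and dtype as `tensor`, filled with uniform random
    values between 0 and 1, expressed directly as an XLA HLO `Rng` op so it can be traced
    into the compiled graph.
    """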
    dtype = tensor.dtype
    shape = tensor.sizes
    minimum = dtype.Constant(constant_value=0)
    maximum = dtype.Constant(constant_value=1)
    return dtype[shape].Rng(minimum, maximum, distribution=1)  # Uniform distribution


def validate_sampling_params(params: torch.Tensor, max_topk: int) -> None:
    """
    Validates sampling parameters for language models.

    Args:
    params (torch.Tensor): Tensor of shape (batch_size, 3) containing sampling parameters
                           in the order: top-k, top-p, temperature.
    max_topk (int): The maximum number of top tokens to sample from.

    Raises:
    ValueError: If any of the parameters are invalid.
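
    Example (illustrative):
        >>> params = torch.tensor([[50, 0.9, 0.8], [1, 1.0, 1.0]])
        >>> validate_sampling_params(params, max_topk=256)  # raises no error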
    """
    if params.shape[1] != 3:
        raise ValueError(f"Expected tensor of shape (batch_size, 3), but got {params.shape}")

    # autocast params tensor to float32
    params = params.to(torch.float32)

    # Unpack parameters
    top_k, top_p, temperature = params[:, 0], params[:, 1], params[:, 2]

    # Validate top-k value range
    valid_top_k = (top_k == -1) | ((top_k > 0) & (top_k <= max_topk))
    if not torch.all(valid_top_k):
        raise ValueError(
            f"Invalid top-k values found. top-k must be -1 or greater than 0 but less than or equal to {max_topk}. Found {top_k}."
        )

    # checks if top-k values can be represented as integers
    if not torch.equal(top_k, top_k.floor()):
        raise ValueError(
            f"Invalid top-k values found. top-k values should be able to be represented as integer values, but found decimal parts. Found {top_k=}."
        )

    # Validate top-p
    valid_top_p = (top_p > 0.0) & (top_p <= 1.0)
    if not torch.all(valid_top_p):
        raise ValueError(f"Invalid top-p values found. top-p must be in the range (0.0, 1.0]. Found {top_p=}.")

    # Validate temperature
    valid_temp = temperature > 0.0
    if not torch.all(valid_temp):
        raise ValueError(
            f"Invalid temperature values found. Temperature must be strictly greater than 0.0. Found {temperature=}."
        )


def prepare_sampling_params(batch_size, top_k=[1], top_p=[1.0], temperature=[1.0]):
    top_k = prepare_tensor(top_k)
    top_p = prepare_tensor(top_p)
    temperature = prepare_tensor(temperature)

    assert top_k.shape[0] == top_p.shape[0] == temperature.shape[0], (
        "sampling params shapes don't match. "
        f"Got top_k shape: {top_k.shape}, top_p shape: {top_p.shape}, temperature shape: {temperature.shape}"
    )

    if top_k.shape[0] == 1:
        top_k = top_k.broadcast_to(batch_size)
        top_p = top_p.broadcast_to(batch_size)
        temperature = temperature.broadcast_to(batch_size)
    stacked = torch.stack([top_k, top_p, temperature], dim=1)
    return stacked


def prepare_tensor(val: Union[torch.Tensor, list, float]):
    if not torch.is_tensor(val):
        if not isinstance(val, list):
            val = [val]
        val = torch.tensor(val)
    return val


class Sampler(torch.nn.Module):
    """Add sampling code to the model graph.

    The sampling method is set when compiling the model, and cannot be changed at runtime.
    If the model was compiled for multinomial sampling, it is still possible
    to perform greedy sampling by passing top_k=1 and top_p=1.0.
    On the other hand, if the model was compiled for greedy sampling, it is not
    possible to perform multinomial sampling at runtime.
    For that reason, multinomial sampling is the default sampling method.

    Args:
        neuron_config(`NxDNeuronConfig`): the Neuron configuration. Its `max_topk` attribute is the maximum
        number of top tokens to sample from. It is used to optimize calculations by performing a single topk
        operation on all logits in a batch and then applying a mask per sequence, instead of applying top_k
        to each sequence in the batch individually. Defaults to 0, which means no optimization.
        do_sample(`Optional[bool]`): whether to use multinomial sampling. If False, argmax sampling is used,
        regardless of the sampling parameters passed at runtime.
        on_cpu(`Optional[bool]`): whether to run on CPU or not
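
    Illustrative usage (assumes `neuron_config` is an already built `NxDNeuronConfig` and that
    `token_logits` comes from the model's LM head):

        sampler = Sampler(neuron_config)
        sampling_params = prepare_sampling_params(batch_size=1, top_k=[50], top_p=[0.9], temperature=[0.8])
        tokens = sampler(token_logits, sampling_params)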
    """

    def __init__(
        self, neuron_config: NxDNeuronConfig, do_sample: Optional[bool] = True, on_cpu: Optional[bool] = False
    ):
        super().__init__()
        if not do_sample:
            logger.warning("Greedy sampling is used. Sampling parameters will be ignored at runtime.")
        self.neuron_config = neuron_config
        self.do_sample = do_sample
        if self.neuron_config.max_topk < 0:
            logger.warning("max_topk optimization is disabled: this can lead to extremely long compilation times.")
        self.IGNORED_LOGITS_VALUE = -3000  # large negative logits become ~0 after softmax; used to drop tokens outside the top-k/top-p range

        self.on_cpu = on_cpu
        if on_cpu:
            self.process_group = None
        else:
            self.process_group = parallel_state.get_tensor_model_parallel_group()

    def _soft_max(self, logits, dim):
        return torch.nn.functional.softmax(input=logits, dim=dim)

    def _get_top_k_num_stages(self):
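        # Number of stages for the distributed top-k collective. The stage counts below mirror
        # the upstream NxDI heuristics for large tensor-parallel configurations (trn2 with
        # tp=64 and LNC=2, trn1 with tp=32); other configurations use a single stage.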
        hardware_type = hardware(get_platform_target())
        if (
            hardware_type == hardware.TRN2
            and self.neuron_config.tp_degree == self.neuron_config.world_size == 64
            and self.neuron_config.logical_nc_config == 2
        ):
            return 3
        elif hardware_type == hardware.TRN1 and self.neuron_config.tp_degree == self.neuron_config.world_size == 32:
            return 2
        else:
            return 1

    def _top_k_masked(self, logits, top_k, dim, rank_id):
        if self.neuron_config.max_topk > 0:
            if self.on_cpu:
                sorted_logits, indices = torch.topk(input=logits, k=self.neuron_config.max_topk, dim=dim)
            else:
                sorted_logits, indices = nxd_topk(
                    tensor=logits,
                    k=self.neuron_config.max_topk,
                    dim=dim,
                    gather_dim=dim,
                    process_group=self.process_group,
                    stages=self._get_top_k_num_stages(),
                    rank_id=rank_id,
                )
        else:
            sorted_logits, indices = torch.sort(input=logits, dim=dim, descending=True)

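        # Positions at or beyond each sequence's top_k are filled with IGNORED_LOGITS_VALUE so
        # that they contribute ~0 probability after softmax.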
        vocab_size = sorted_logits.shape[-1]
        mask = torch.arange(vocab_size, device=logits.device)
        mask = mask.broadcast_to(*sorted_logits.shape)

        mask = torch.greater_equal(mask, top_k)
        sorted_logits = sorted_logits.masked_fill_(mask, self.IGNORED_LOGITS_VALUE)
        return sorted_logits, indices

    def _top_p(self, top_k_logits_values, probs_cumsum, top_p, dim):
        top_p_mask = torch.greater(probs_cumsum, top_p)
        top_k_logits_values = top_k_logits_values.masked_fill_(top_p_mask, self.IGNORED_LOGITS_VALUE)
        probs_soft_max = self._soft_max(top_k_logits_values, dim)  # custom call
        probs_cumsum = cumsum(tensor_in=probs_soft_max, dim=dim, on_cpu=self.on_cpu)
        return probs_cumsum

    def _rand_selector(self, probs_cumsum, num_samples=1):
        zeros = torch.zeros((probs_cumsum.shape[0], num_samples), device=probs_cumsum.device, dtype=probs_cumsum.dtype)
        return torch.rand_like(zeros) if self.on_cpu else rand_like(zeros)

    def _multinomial(self, probs, dim, num_samples=1):
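        # Inverse-CDF sampling: draw u ~ U(0, 1) per sequence and count how many cumulative
        # probabilities fall below u; that count is the index of the sampled token. This avoids
        # torch.multinomial, which does not trace well on device (see the note in forward()).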
        probs_cumsum = cumsum(tensor_in=probs, dim=dim, on_cpu=self.on_cpu)
        rand_selector = self._rand_selector(probs_cumsum, num_samples)
        greater_than_rand = torch.greater(rand_selector, probs_cumsum)
        counts = torch.sum(greater_than_rand, dim=dim).unsqueeze(dim)
        return counts

    def _argmax_sample(self, token_logits, return_values, dim):
        if self.on_cpu:
            return torch.argmax(token_logits, dim=dim)
        else:
            # distributed argmax
            tokens = nxd_argmax(
                tensor=token_logits,
                dim=dim,
                gather_dim=dim,
                keepdim=False,
                process_group=self.process_group,
            )
            values = torch.ones(tokens.shape, dtype=token_logits.dtype, device=tokens.device)
            if return_values:
                return tokens, values
            return tokens

    def _multinomial_sample(self, token_logits, sampling_params, return_values, dim, rank_id):
        batch_size = token_logits.shape[0]
        top_k = sampling_params[:, 0].reshape(batch_size, 1)
        top_p = sampling_params[:, 1].reshape(batch_size, 1)
        temperature = sampling_params[:, 2].reshape(batch_size, 1)

        # Apply top_k first
        top_k_logits_values, top_k_logits_indices = self._top_k_masked(token_logits, top_k, dim, rank_id)

        # Apply temperature
        top_k_logits_values = torch.divide(top_k_logits_values, temperature)

        # Apply top_p
        probs_soft_max = self._soft_max(top_k_logits_values, dim)
        probs_cumsum = cumsum(tensor_in=probs_soft_max, dim=dim, on_cpu=self.on_cpu)
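        # Clamp top_p from below by the smallest cumulative probability, which appears intended
        # to keep the top-p mask from discarding every candidate; the index_fill_ below then
        # guarantees that at least the highest-probability token is kept.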
        top_p = torch.max(torch.min(probs_cumsum), top_p)
        top_p_mask = torch.greater(probs_cumsum, top_p).index_fill_(
            dim, torch.tensor([0], device=top_p.device), False
        )  # need to keep at least one token
        top_k_logits_values = top_k_logits_values.masked_fill_(top_p_mask, self.IGNORED_LOGITS_VALUE)

        probs_soft_max = self._soft_max(top_k_logits_values, dim)  # custom call
        if return_values:
            return top_k_logits_indices, probs_soft_max

        counts = self._multinomial(probs_soft_max, dim)
        return torch.gather(input=top_k_logits_indices, dim=dim, index=counts).flatten()

    def forward(self, token_logits, sampling_params, return_values=False, rank_id=None):
        """
        Forward pass performing top-k, top-p, temperature scaling and multinomial sampling.

        This method is only used when compiling the model, which means that the
        decision to use multinomial sampling cannot be made at runtime.
        If the model was compiled for multinomial sampling, it is still possible
        to perform greedy sampling by passing top_k=1 and top_p=1.0.
        On the other hand, if the model was compiled for greedy sampling, it is not
        possible to perform multinomial sampling at runtime.
        For that reason, multinomial sampling is the default sampling method.

        Inputs:
            token_logits: tensor whose first dimension is Batch Size
                and whose final dimension is Vocabulary Size
            sampling_params: a 2D tensor of size (Batch Size, 3)
                containing the following sampling params:
                * top_k: value to use for top_k sampling
                * top_p: value to use for top_p sampling
                * temperature: value to use for temperature sampling
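            Such a tensor can be built with `prepare_sampling_params`, e.g. (illustrative)
            `prepare_sampling_params(2, top_k=[50, 1], top_p=[0.9, 1.0], temperature=[0.8, 1.0])`.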

        Output:
            Tensor containing one sampled token id per sequence in the batch.
            Output size is (Batch Size,).

        Note: Using torch.multinomial on device causes the trace to hang, because
        torch.multinomial performs a number of distribution validation steps that are
        content dependent. Hence we implement multinomial sampling here instead.
        """
        dim = len(token_logits.shape) - 1  # vocab_size dimension
        if self.do_sample:
            return self._multinomial_sample(token_logits, sampling_params, return_values, dim, rank_id)
        else:
            return self._argmax_sample(token_logits, return_values, dim)
