#!/usr/bin/env python
# encoding: utf-8
#
# Copyright 2022 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This module consists of PyTorch layers from nnAudio that have been ported to TensorFlow:
# https://github.com/KinWaiCheuk/nnAudio
# The above code is released under an MIT license.

import warnings
import tensorflow as tf
import numpy as np
from typing import Any, List, Optional, Tuple, Union

import scipy.signal


def create_lowpass_filter(
    band_center: float = 0.5,
    kernel_length: int = 256,
    transition_bandwidth: float = 0.03,
    dtype: tf.dtypes.DType = tf.float32,
) -> tf.Tensor:
    """
    Design a low-pass FIR filter with ``scipy.signal.firwin2`` and return its taps as a constant tensor.

    Frequencies are expressed on a scale from 0 to 1, where 0 is 0 Hz and 1 is the Nyquist
    frequency of the signal BEFORE downsampling.

    Parameters
    ----------
    band_center : float
        Centre of the transition band on the normalised 0..1 frequency scale.
    kernel_length : int
        Number of filter taps requested from ``firwin2``.
    transition_bandwidth : float
        Relative width of the transition band: the passband ends at
        ``band_center / (1 + transition_bandwidth)`` and the stopband starts at
        ``band_center * (1 + transition_bandwidth)``.
    dtype : tf.dtypes.DType
        dtype of the returned tensor.

    Returns
    -------
    tf.Tensor
        1-D constant tensor holding the ``kernel_length`` filter coefficients.
    """
    # Highest frequency we need to preserve, and lowest frequency we no longer allow through.
    passband_max = band_center / (1 + transition_bandwidth)
    stopband_min = band_center * (1 + transition_bandwidth)

    # Key frequencies at which the filter must hit a prescribed gain:
    # [0.0, passband_max] is kept untouched, [stopband_min, 1.0] is removed.
    key_frequencies = [0.0, passband_max, stopband_min, 1.0]

    # One gain per key frequency: unity across the passband, zero across the stopband.
    gain_at_key_frequencies = [1.0, 1.0, 0.0, 0.0]

    # firwin2 produces the filter kernel coefficients matching that gain profile.
    filter_kernel = scipy.signal.firwin2(kernel_length, key_frequencies, gain_at_key_frequencies)

    return tf.constant(filter_kernel, dtype=dtype)


def next_power_of_2(A: int) -> int:
    """Return the base-2 logarithm of ``A`` rounded up to an integer.

    Note that this is the *exponent*, not the power of two itself: for a positive ``A``
    the caller obtains the enclosing power of two with ``2 ** next_power_of_2(A)``.
    """
    log2_of_a = np.log2(A)
    exponent = np.ceil(log2_of_a)
    return int(exponent)


def early_downsample(
    sr: Union[float, int],
    hop_length: int,
    n_octaves: int,
    nyquist_hz: float,
    filter_cutoff_hz: float,
) -> Tuple[Union[float, int], int, int]:
    """Compute the sampling rate and hop length that result from early downsampling.

    The number of halvings comes from ``early_downsample_count``; the downsampling
    factor is two raised to that count.

    Returns
    -------
    tuple
        ``(new_sr, new_hop_length, downsample_factor)`` where ``new_sr`` is ``sr`` divided by the
        factor (always a float) and ``new_hop_length`` is ``hop_length`` floor-divided by it.
    """
    n_halvings = early_downsample_count(nyquist_hz, filter_cutoff_hz, hop_length, n_octaves)
    downsample_factor = 2 ** n_halvings

    reduced_sr = sr / float(downsample_factor)
    reduced_hop = hop_length // downsample_factor

    return reduced_sr, reduced_hop, downsample_factor


# The following downsampling-count function is adapted from librosa's CQT implementation.
# It determines how many early (pre-)resampling passes can be applied when the starting and
# ending frequencies are both in low-frequency regions.
def early_downsample_count(nyquist_hz: float, filter_cutoff_hz: float, hop_length: int, n_octaves: int) -> int:
    """Compute the number of early downsampling operations.

    Two independent, non-negative limits are computed and the smaller one is returned:
    one from the ratio of ``0.85 * nyquist_hz`` to ``filter_cutoff_hz``, and one from
    ``next_power_of_2(hop_length)`` relative to ``n_octaves``.
    """
    # Limit derived from how far the filter cutoff sits below the (scaled) Nyquist frequency.
    octaves_above_cutoff = np.ceil(np.log2(0.85 * nyquist_hz / filter_cutoff_hz))
    limit_from_cutoff = max(0, int(octaves_above_cutoff - 1) - 1)

    # Limit derived from the hop length and the number of octaves.
    hop_exponent = next_power_of_2(hop_length)
    limit_from_hop = max(0, hop_exponent - n_octaves + 1)

    return min(limit_from_cutoff, limit_from_hop)


def get_early_downsample_params(
    sr: Union[float, int],
    hop_length: int,
    fmax_t: float,
    Q: float,
    n_octaves: int,
    dtype: tf.dtypes.DType,
) -> Tuple[Union[float, int], int, float, np.array, bool]:
    """Compute the parameters used for early downsampling.

    Returns
    -------
    tuple
        ``(sr, hop_length, downsample_factor, early_downsample_filter, earlydownsample)``.
        When the downsampling factor is 1 no filter is built: the filter entry is ``None``
        and the flag is ``False``. Otherwise the filter comes from ``create_lowpass_filter``
        with its band centred at ``1 / downsample_factor`` and the flag is ``True``.
    """
    hann_bandwidth = 1.5  # window bandwidth for a hann window
    cutoff_hz = fmax_t * (1 + 0.5 * hann_bandwidth / Q)

    # ``sr // 2`` is passed as the Nyquist frequency of the incoming signal.
    new_sr, new_hop_length, factor = early_downsample(sr, hop_length, n_octaves, sr // 2, cutoff_hz)

    if factor == 1:
        # Nothing to downsample, so no anti-aliasing filter is needed.
        return new_sr, new_hop_length, factor, None, False

    antialias_filter = create_lowpass_filter(
        band_center=1 / factor,
        kernel_length=256,
        transition_bandwidth=0.03,
        dtype=dtype,
    )
    return new_sr, new_hop_length, factor, antialias_filter, True


def get_window_dispatch(
    window: Union[str, Tuple[str, float], float], N: int, fftbins: bool = True
) -> np.ndarray:
    """Build a window of length ``N`` from a scipy-style window specification.

    Parameters
    ----------
    window : str, tuple or float
        * ``str``: passed straight to ``scipy.signal.get_window``.
        * ``("gaussian", a)``: ``a`` (must be >= 0) is converted to a standard deviation
          through ``floor(-N / 2 / sqrt(-2 * ln(10 ** (-a / 20))))`` before being handed to scipy.
        * any other tuple: passed to scipy unchanged, after a warning.
        * ``float``: passed to scipy unchanged (scipy interprets it as a Kaiser beta), after a warning.
    N : int
        Number of samples in the window.
    fftbins : bool
        Forwarded to ``scipy.signal.get_window``.

    Returns
    -------
    np.ndarray
        The window samples.

    Raises
    ------
    AssertionError
        If a gaussian window is requested with a negative parameter.
    TypeError
        If ``window`` is not a string, tuple or float.
    """
    if isinstance(window, str):
        return scipy.signal.get_window(window, N, fftbins=fftbins)

    if isinstance(window, tuple):
        if window[0] == "gaussian":
            # A negative value would put a negative number under the square root below.
            assert window[1] >= 0, "The gaussian window parameter must be non-negative."
            sigma = np.floor(-N / 2 / np.sqrt(-2 * np.log(10 ** (-window[1] / 20))))
            return scipy.signal.get_window(("gaussian", sigma), N, fftbins=fftbins)

        # Previously a Warning object was created and discarded here and the function returned
        # None; now the warning is actually emitted and the requested window is returned.
        warnings.warn("Tuple windows may have undesired behaviour regarding Q factor")
        return scipy.signal.get_window(window, N, fftbins=fftbins)

    if isinstance(window, float):
        warnings.warn(
            "You are using Kaiser window with beta factor " + str(window) + ". Correct behaviour not checked."
        )
        return scipy.signal.get_window(window, N, fftbins=fftbins)

    # TypeError is a subclass of Exception, so callers catching Exception are unaffected.
    raise TypeError("The function get_window from scipy only supports strings, tuples and floats.")


def create_cqt_kernels(
    Q: float,
    fs: float,
    fmin: float,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    norm: int = 1,
    window: str = "hann",
    fmax: Optional[float] = None,
    topbin_check: bool = True,
) -> Tuple[np.array, int, np.array, np.array]:
    """
    Automatically create CQT kernels in the time domain.

    Each bin ``k`` gets a windowed complex exponential at frequency
    ``fmin * 2 ** (k / bins_per_octave)`` whose length is ``ceil(Q * fs / freq)``; the kernel is
    written, roughly centred, into a zero-initialised row of length ``fftLen``.

    Returns
    -------
    tuple
        ``(kernels, fftLen, lengths, freqs)``: the ``complex64`` kernel matrix of shape
        ``(n_bins, fftLen)``, the row length (a power of two), the per-bin kernel lengths and
        the per-bin centre frequencies.

    Raises
    ------
    ValueError
        If ``topbin_check`` is ``True`` and the highest bin frequency exceeds ``fs / 2``.
    """
    # Row length: the longest kernel (the one at fmin) rounded up to a power of two.
    fftLen = 2 ** next_power_of_2(np.ceil(Q * fs / fmin))

    fmax_only = (fmax is not None) and (n_bins is None)
    n_bins_only = (fmax is None) and (n_bins is not None)

    if not n_bins_only:
        if not fmax_only:
            warnings.warn("If fmax is given, n_bins will be ignored", SyntaxWarning)
        # Derive the number of bins from the requested frequency range.
        n_bins = np.ceil(bins_per_octave * np.log2(fmax / fmin))

    freqs = fmin * 2.0 ** (np.r_[0:n_bins] / float(bins_per_octave))

    if np.max(freqs) > fs / 2 and topbin_check is True:
        raise ValueError(
            "The top bin {}Hz has exceeded the Nyquist frequency, please reduce the n_bins".format(np.max(freqs))
        )

    num_kernels = int(n_bins)
    tempKernel = np.zeros((num_kernels, int(fftLen)), dtype=np.complex64)
    lengths = np.ceil(Q * fs / freqs)

    for row, freq in enumerate(freqs[:num_kernels]):
        kernel_len = np.ceil(Q * fs / freq)

        # Centre the kernel inside the row; any extra zero ends up on the right-hand side.
        offset = int(np.ceil(fftLen / 2.0 - kernel_len / 2.0)) - int(kernel_len % 2)

        taper = get_window_dispatch(window, int(kernel_len), fftbins=True)
        sample_index = np.r_[-kernel_len // 2 : kernel_len // 2]
        phase = sample_index * 1j * 2 * np.pi * freq / fs
        sig = taper * np.exp(phase) / kernel_len

        if norm:
            # Normalise the filter the way librosa does, using the requested vector norm order.
            sig = sig / np.linalg.norm(sig, norm)

        tempKernel[row, offset : offset + int(kernel_len)] = sig

    return tempKernel, fftLen, lengths, freqs


def get_cqt_complex(
    x: tf.Tensor,
    cqt_kernels_real: tf.Tensor,
    cqt_kernels_imag: tf.Tensor,
    hop_length: int,
    padding: tf.keras.layers.Layer,
) -> tf.Tensor:
    """Convolve the padded input with the real and imaginary CQT kernels.

    See the 1992 CQT paper [1] for how the kernels are applied.

    [1] Brown, Judith C.C. and Miller Puckette. “An efficient algorithm for the calculation of
    a constant Q transform.” (1992).

    Parameters
    ----------
    x : tf.Tensor
        Rank-3 input; the last axis is the one that gets padded and convolved over.
    cqt_kernels_real, cqt_kernels_imag : tf.Tensor
        Rank-3 kernels; they are transposed with ``[2, 1, 0]`` to form the ``conv1d`` filters,
        so the last axis is the kernel width.
    hop_length : int
        Stride of the convolution.
    padding : tf.keras.layers.Layer
        Layer applied to ``x`` before the convolution.

    Returns
    -------
    tf.Tensor
        Real and negated-imaginary convolution results stacked on a new last axis of size 2.
    """

    try:
        x = padding(x)  # When center is True, we need padding at the beginning and ending
    except Exception:
        warnings.warn(
            f"\ninput size = {x.shape}\tkernel size = {cqt_kernels_real.shape[-1]}\n"
            "padding with reflection mode might not be the best choice, try using constant padding",
            UserWarning,
        )
        # Fall back to zero padding on the last axis only. tf.pad needs one [before, after]
        # pair per dimension; the previous flat (before, after) pair was rejected for rank-3 input.
        half_kernel = cqt_kernels_real.shape[-1] // 2
        x = tf.pad(x, [[0, 0], [0, 0], [half_kernel, half_kernel]])

    # conv1d expects the convolved axis in the middle, so swap the last two axes once
    # and reuse the result for both convolutions.
    x_nwc = tf.transpose(x, [0, 2, 1])

    CQT_real = tf.transpose(
        tf.nn.conv1d(
            x_nwc,
            tf.transpose(cqt_kernels_real, [2, 1, 0]),
            padding="VALID",
            stride=hop_length,
        ),
        [0, 2, 1],
    )
    CQT_imag = -tf.transpose(
        tf.nn.conv1d(
            x_nwc,
            tf.transpose(cqt_kernels_imag, [2, 1, 0]),
            padding="VALID",
            stride=hop_length,
        ),
        [0, 2, 1],
    )

    return tf.stack((CQT_real, CQT_imag), axis=-1)


def downsampling_by_n(x: tf.Tensor, filter_kernel: tf.Tensor, n: float, match_torch_exactly: bool = True) -> tf.Tensor:
    """
    Downsample the given tensor by convolving it with ``filter_kernel`` at stride ``n``.

    The input tensor is expected to have shape `(n_batches, channels, width)` and the filter
    kernel is expected to be 1D; it is expanded with two trailing singleton axes before being
    used as the ``conv1d`` filter.

    If ``match_torch_exactly`` is truthy, the width axis is zero-padded by
    ``(kernel_size - 1) // 2`` on both sides and the convolution runs with "VALID" padding.
    Otherwise the input is left unpadded and TensorFlow pads it via "SAME".

    NOTE(review): the original docstring mentions subtle differences from Torch's output and
    TensorFlow Lite compatibility (as of v2.4.1) without making clear which branch each remark
    applies to -- confirm before relying on either.
    """
    conv_filter = filter_kernel[:, None, None]

    if not match_torch_exactly:
        conv_padding = "SAME"
        conv_input = x
    else:
        conv_padding = "VALID"
        half_width = (filter_kernel.shape[-1] - 1) // 2
        conv_input = tf.pad(x, [[0, 0], [0, 0], [half_width, half_width]])

    # conv1d works on `(n_batches, width, channels)`, so swap the last two axes going in and out.
    conv_input_nwc = tf.transpose(conv_input, [0, 2, 1])
    downsampled_nwc = tf.nn.conv1d(conv_input_nwc, conv_filter, padding=conv_padding, stride=n)
    return tf.transpose(downsampled_nwc, [0, 2, 1])


class ReflectionPad1D(tf.keras.layers.Layer):
    """
    Replica of Torch's nn.ReflectionPad1D in TF.

    Pads the last axis of a rank-3 tensor using "REFLECT" mode.

    ``padding`` is either a single integer (the same amount on both sides), a 1-tuple
    (treated like the integer) or a ``(left, right)`` pair.
    """

    def __init__(self, padding: Union[int, Tuple[int]] = 1, **kwargs: Any):
        self.padding = padding
        self.input_spec = [tf.keras.layers.InputSpec(ndim=3)]
        super(ReflectionPad1D, self).__init__(**kwargs)

    def _pad_widths(self) -> Tuple[int, int]:
        """Normalise ``self.padding`` to an explicit ``(left, right)`` pair."""
        if isinstance(self.padding, (int, np.integer)):
            return self.padding, self.padding

        widths = tuple(self.padding)
        if len(widths) == 1:
            return widths[0], widths[0]
        if len(widths) == 2:
            return widths[0], widths[1]
        raise ValueError(f"padding must be an int or a tuple of 1 or 2 ints, got {self.padding!r}")

    def compute_output_shape(self, s: List[int]) -> Tuple[int, int, int]:
        # The previous expression relied on operator precedence that made the tuple case return
        # padding[0] as the output width instead of adding the padding to the input width.
        left, right = self._pad_widths()
        padded_width = None if s[2] is None else s[2] + left + right
        return (s[0], s[1], padded_width)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        left, right = self._pad_widths()
        return tf.pad(x, [[0, 0], [0, 0], [left, right]], "REFLECT")


class ConstantPad1D(tf.keras.layers.Layer):
    """
    Replica of Torch's nn.ConstantPad1D in TF.

    Pads the last axis of a rank-3 tensor with the constant ``value``.

    ``padding`` is either a single integer (the same amount on both sides), a 1-tuple
    (treated like the integer) or a ``(left, right)`` pair.
    """

    def __init__(self, padding: Union[int, Tuple[int]] = 1, value: int = 0, **kwargs: Any):
        self.padding = padding
        self.value = value
        self.input_spec = [tf.keras.layers.InputSpec(ndim=3)]
        super(ConstantPad1D, self).__init__(**kwargs)

    def _pad_widths(self) -> Tuple[int, int]:
        """Normalise ``self.padding`` to an explicit ``(left, right)`` pair."""
        if isinstance(self.padding, (int, np.integer)):
            return self.padding, self.padding

        widths = tuple(self.padding)
        if len(widths) == 1:
            return widths[0], widths[0]
        if len(widths) == 2:
            return widths[0], widths[1]
        raise ValueError(f"padding must be an int or a tuple of 1 or 2 ints, got {self.padding!r}")

    def compute_output_shape(self, s: List[int]) -> Tuple[int, int, int]:
        # The previous expression relied on operator precedence that made the tuple case return
        # padding[0] as the output width instead of adding the padding to the input width.
        left, right = self._pad_widths()
        padded_width = None if s[2] is None else s[2] + left + right
        return (s[0], s[1], padded_width)

    def call(self, x: tf.Tensor) -> tf.Tensor:
        left, right = self._pad_widths()
        return tf.pad(x, [[0, 0], [0, 0], [left, right]], "CONSTANT", self.value)


def pad_center(data: np.ndarray, size: int, axis: int = -1, **kwargs: Any) -> np.ndarray:
    """Centre an array along one axis by padding it on both sides with ``np.pad``.

    This is the array analogue of `str.center()`. When the amount of padding is odd,
    the extra element goes on the trailing side.

    Examples
    --------
    >>> # Generate a vector
    >>> data = np.ones(5)
    >>> pad_center(data, 10, mode='constant')
    array([0., 0., 1., 1., 1., 1., 1., 0., 0., 0.])
    >>> # Pad a matrix along its first dimension
    >>> data = np.ones((3, 5))
    >>> pad_center(data, 7, axis=0)
    array([[0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.],
           [1., 1., 1., 1., 1.],
           [1., 1., 1., 1., 1.],
           [1., 1., 1., 1., 1.],
           [0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.]])
    >>> # Or its second dimension
    >>> pad_center(data, 7, axis=1)
    array([[0., 1., 1., 1., 1., 1., 0.],
           [0., 1., 1., 1., 1., 1., 0.],
           [0., 1., 1., 1., 1., 1., 0.]])

    Parameters
    ----------
    data : np.ndarray
        Array to be padded and centered
    size : int >= data.shape[axis] [scalar]
        Target length of `data` along `axis`
    axis : int
        Axis along which to pad and center the data
    kwargs : additional keyword arguments
        Passed on to `np.pad()`; ``mode`` defaults to ``"constant"`` when not given

    Returns
    -------
    data_padded : np.ndarray
        `data` centered and padded to length `size` along the specified axis

    Raises
    ------
    ValueError
        If `size < data.shape[axis]`

    See Also
    --------
    numpy.pad
    """
    pad_options = {"mode": "constant", **kwargs}

    n = data.shape[axis]
    shortfall = size - n
    leading = int(shortfall // 2)

    if leading < 0:
        raise ValueError("Target size ({:d}) must be at least input size ({:d})".format(size, n))

    trailing = int(shortfall - leading)

    # Only the chosen axis receives padding; every other axis is left untouched.
    pad_widths = [(0, 0)] * data.ndim
    pad_widths[axis] = (leading, trailing)

    return np.pad(data, pad_widths, **pad_options)


class CQT2010v2(tf.keras.layers.Layer):
    """This layer calculates the CQT of the input signal.
    Input signal should be in either of the following shapes.
    1. (len_audio)
    2. (num_audio, len_audio)
    3. (num_audio, 1, len_audio)
    The correct shape will be inferred automatically if the input follows these 3 shapes.
    Most of the arguments follow the convention from librosa.

    This layer uses about 1MB of memory per second of input audio with its default arguments.

    This algorithm uses the resampling method proposed in [1].
    Instead of convolving the STFT results with a gigantic CQT kernel covering the full frequency
    spectrum, we make a small CQT kernel covering only the top octave. Then we keep downsampling the
    input audio by a factor of 2 and convolving it with the small CQT kernel.
    Every time the input audio is downsampled, the CQT relative to the downsampled input is equivalent
    to the next lower octave.
    The kernel creation process is still the same as the 1992 algorithm. Therefore, we can reuse the
    code from the 1992 algorithm [2]
    [1] Schörkhuber, Christian. “CONSTANT-Q TRANSFORM TOOLBOX FOR MUSIC PROCESSING.” (2010).
    [2] Brown, Judith C.C. and Miller Puckette. “An efficient algorithm for the calculation of a
    constant Q transform.” (1992).
    Early downsampling downsamples the input audio up front to reduce the CQT kernel size.
    The results with and without early downsampling are more or less the same except in the very low
    frequency region where freq < 40Hz.

    Parameters
    ----------
    sr : int
        The sampling rate of the input audio. Setting the correct sampling rate is very important
        for calculating the correct frequencies.
    hop_length : int
        The hop (or stride) size. Default value is 512.
    fmin : float
        The frequency of the lowest CQT bin. Default is 32.70Hz (the note C1 with A4 = 440Hz).
    fmax : float
        Stored and returned by ``get_config`` but NOT used by this implementation: the number of
        bins always comes from ``n_bins``. Default is ``None``.
    n_bins : int
        The total number of CQT bins. Default is 84.
    filter_scale : int
        Scales the Q factor: ``Q = filter_scale / (2 ** (1 / bins_per_octave) - 1)``. Default is 1.
    bins_per_octave : int
        Number of bins per octave. Default is 12.
    norm : bool
        Stored and returned by ``get_config`` but not used by this implementation; the output is
        always normalised the "librosa" way (multiplied by the square root of the kernel lengths).
    basis_norm : int
        Normalization for the CQT kernels. ``1`` means L1 normalization, and ``2`` means L2 normalization.
        Default is ``1``, which is the same as the normalization used in librosa.
    window : str
        Stored and returned by ``get_config``. NOTE: it is not forwarded to ``create_cqt_kernels``,
        so the kernels are always built with that function's default window.
    pad_mode : str
        The padding method, either 'constant' or 'reflect'. Default value is 'reflect'.
    earlydownsample : bool
        If ``True`` (the default), the input may be downsampled before the top-octave CQT when
        the requested frequency range allows it.
    trainable : bool
        Determine if the CQT kernels are trainable or not. If ``True``, the kernels are created
        as trainable variables. Default value is ``False``
    output_format : str
        Determine the return type (case-insensitive).
        'Magnitude' will return the magnitude, shape = ``(num_samples, time_steps, freq_bins)``;
        'Complex' will return the complex result, shape = ``(num_samples, freq_bins, time_steps, 2)``;
        'Phase' will return the phase, shape = ``(num_samples, freq_bins, time_steps, 2)``.
        The complex number is stored as ``(real, imag)`` in the last axis. Default value is 'Magnitude'.
    match_torch_exactly : bool
        Forwarded to ``downsampling_by_n``; selects explicit zero padding instead of "SAME" padding.
    **kwargs
        Forwarded to the ``tf.keras.layers.Layer`` constructor (e.g. ``name``, ``dtype``).

    Returns
    -------
    spectrogram : tf.Tensor

    It returns a tensor of spectrograms.
    shape = ``(num_samples, time_steps, freq_bins)`` if ``output_format='Magnitude'``;
    shape = ``(num_samples, freq_bins, time_steps, 2)`` if ``output_format='Complex' or 'Phase'``;

    Examples
    --------
    >>> spec_layer = CQT2010v2()
    >>> specs = spec_layer(x)
    """

    def __init__(
        self,
        sr: int = 22050,
        hop_length: int = 512,
        fmin: float = 32.70,
        fmax: Optional[float] = None,
        n_bins: int = 84,
        filter_scale: int = 1,
        bins_per_octave: int = 12,
        norm: bool = True,
        basis_norm: int = 1,
        window: str = "hann",
        pad_mode: str = "reflect",
        earlydownsample: bool = True,
        trainable: bool = False,
        output_format: str = "Magnitude",
        match_torch_exactly: bool = True,
        **kwargs: Any,
    ):
        # Base-layer options such as ``name`` and ``dtype`` are accepted so that a config produced
        # by ``get_config`` can be fed back into the constructor.
        super().__init__(**kwargs)

        self.sample_rate: Union[float, int] = sr
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.n_bins = n_bins
        self.filter_scale = filter_scale
        self.bins_per_octave = bins_per_octave
        self.norm = norm
        self.basis_norm = basis_norm
        self.window = window
        self.pad_mode = pad_mode
        self.earlydownsample = earlydownsample
        self.trainable = trainable
        self.output_format = output_format
        self.match_torch_exactly = match_torch_exactly
        self.normalization_type = "librosa"

        # build() overwrites sample_rate / hop_length / earlydownsample with their
        # post-downsampling values; this flag records whether the pre-build values were saved.
        self._build_inputs_captured = False

    def get_config(self) -> Any:
        """Return the constructor configuration of this layer.

        ``sample_rate``, ``hop_length`` and ``earlydownsample`` are reported as they were before
        ``build`` replaced them with their post-downsampling values, so the config describes the
        layer as it was requested rather than its internal state.
        """
        if getattr(self, "_build_inputs_captured", False):
            sample_rate = self._requested_sample_rate
            hop_length = self._requested_hop_length
            earlydownsample = self._requested_earlydownsample
        else:
            sample_rate = self.sample_rate
            hop_length = self.hop_length
            earlydownsample = self.earlydownsample

        config = super().get_config().copy()
        config.update(
            {
                "sample_rate": sample_rate,
                "hop_length": hop_length,
                "fmin": self.fmin,
                "fmax": self.fmax,
                "n_bins": self.n_bins,
                "filter_scale": self.filter_scale,
                "bins_per_octave": self.bins_per_octave,
                "norm": self.norm,
                "basis_norm": self.basis_norm,
                "window": self.window,
                "pad_mode": self.pad_mode,
                "output_format": self.output_format,
                "earlydownsample": earlydownsample,
                "trainable": self.trainable,
                "match_torch_exactly": self.match_torch_exactly,
            }
        )
        return config

    @classmethod
    def from_config(cls, config: Any) -> "CQT2010v2":
        """Re-create a layer from the output of ``get_config``.

        ``get_config`` stores the sampling rate under ``"sample_rate"`` while the constructor
        argument is called ``sr``, so the key is translated before the layer is constructed.
        """
        config = dict(config)
        if "sample_rate" in config:
            sample_rate = config.pop("sample_rate")
            config.setdefault("sr", sample_rate)
        return super().from_config(config)

    def build(self, input_shape: tf.TensorShape) -> None:
        """Create the filters, CQT kernels, padding layer and input-reshaping function.

        Raises
        ------
        ValueError
            If ``pad_mode`` is not 'constant' or 'reflect', if the top bin exceeds the Nyquist
            frequency, or if the input rank is not 1, 2 or 3.
        """
        # Remember the values the layer was configured with the first time build() runs, and
        # always start from them, so that building twice does not downsample twice.
        if not getattr(self, "_build_inputs_captured", False):
            self._requested_sample_rate = self.sample_rate
            self._requested_hop_length = self.hop_length
            self._requested_earlydownsample = self.earlydownsample
            self._build_inputs_captured = True
        sample_rate = self._requested_sample_rate
        hop_length = self._requested_hop_length
        earlydownsample = self._requested_earlydownsample

        # Without this check an unknown pad_mode left ``self.padding`` undefined and only
        # failed later, inside call(), with an AttributeError.
        if self.pad_mode not in ("constant", "reflect"):
            raise ValueError(f"pad_mode must be 'constant' or 'reflect', got {self.pad_mode!r}")

        # This will be used to calculate filter_cutoff and creating CQT kernels
        Q = float(self.filter_scale) / (2 ** (1 / self.bins_per_octave) - 1)

        # Half-band filter used for every octave-to-octave downsampling step. It is created in
        # the layer's dtype, like the early-downsampling filter.
        self.lowpass_filter = create_lowpass_filter(
            band_center=0.5, kernel_length=256, transition_bandwidth=0.001, dtype=self.dtype
        )

        # Calculate num of filter requires for the kernel
        # n_octaves determines how many resampling requires for the CQT
        n_filters = min(self.bins_per_octave, self.n_bins)
        self.n_octaves = int(np.ceil(float(self.n_bins) / self.bins_per_octave))

        # Calculate the lowest frequency bin for the top octave kernel
        self.fmin_t = self.fmin * 2 ** (self.n_octaves - 1)
        remainder = self.n_bins % self.bins_per_octave

        if remainder == 0:
            # Calculate the top bin frequency
            fmax_t = self.fmin_t * 2 ** ((self.bins_per_octave - 1) / self.bins_per_octave)
        else:
            # Calculate the top bin frequency
            fmax_t = self.fmin_t * 2 ** ((remainder - 1) / self.bins_per_octave)

        self.fmin_t = fmax_t / 2 ** (1 - 1 / self.bins_per_octave)  # Adjusting the top minimum bins
        if fmax_t > sample_rate / 2:
            raise ValueError(
                "The top bin {}Hz has exceeded the Nyquist frequency, please reduce the n_bins".format(fmax_t)
            )

        if earlydownsample is True:  # Do early downsampling if this argument is True
            (
                sample_rate,
                hop_length,
                self.downsample_factor,
                early_downsample_filter,
                earlydownsample,
            ) = get_early_downsample_params(sample_rate, hop_length, fmax_t, Q, self.n_octaves, self.dtype)

            self.early_downsample_filter = early_downsample_filter
        else:
            self.downsample_factor = 1.0

        # From here on these attributes hold the values that apply AFTER early downsampling.
        self.sample_rate = sample_rate
        self.hop_length = hop_length
        self.earlydownsample = earlydownsample

        # Preparing CQT kernels
        basis, self.n_fft, _, _ = create_cqt_kernels(
            Q,
            self.sample_rate,
            self.fmin_t,
            n_filters,
            self.bins_per_octave,
            norm=self.basis_norm,
            topbin_check=False,
        )

        # For the normalization in the end
        # The freqs returned by create_cqt_kernels cannot be used
        # Since that returns only the top octave bins
        # We need the information for all freq bin
        freqs = self.fmin * 2.0 ** (np.r_[0 : self.n_bins] / float(self.bins_per_octave))
        self.frequencies = freqs

        self.lengths = np.ceil(Q * self.sample_rate / freqs)

        self.basis = basis
        # NOTE(psobot): this is where the implementation here starts to differ from CQT2010.

        # Split the complex kernels into real and imaginary parts and insert a singleton
        # axis in position 1.
        self.cqt_kernels_real = tf.expand_dims(basis.real.astype(self.dtype), 1)
        self.cqt_kernels_imag = tf.expand_dims(basis.imag.astype(self.dtype), 1)

        if self.trainable:
            self.cqt_kernels_real = tf.Variable(initial_value=self.cqt_kernels_real, trainable=True)
            self.cqt_kernels_imag = tf.Variable(initial_value=self.cqt_kernels_imag, trainable=True)

        # The kernels are centred, so half a kernel of padding is needed at the beginning
        # and at the end of the signal.
        if self.pad_mode == "constant":
            self.padding = ConstantPad1D(self.n_fft // 2, 0)
        else:  # "reflect" -- validated at the top of build()
            self.padding = ReflectionPad1D(self.n_fft // 2)

        # Bring every supported input rank to (num_audio, 1, len_audio).
        rank = len(input_shape)
        if rank == 2:
            self.reshape_input = lambda x: x[:, None, :]
        elif rank == 1:
            self.reshape_input = lambda x: x[None, None, :]
        elif rank == 3:
            self.reshape_input = lambda x: x
        else:
            raise ValueError(f"Input shape must be rank <= 3, found shape {input_shape}")

    def call(self, x: tf.Tensor) -> tf.Tensor:
        """Compute the CQT of ``x`` in the configured ``output_format``.

        Raises
        ------
        ValueError
            If ``output_format`` or ``normalization_type`` is not one of the supported options.
        """
        # Fail loudly on an unknown output format; previously the method fell off the end
        # and silently returned None.
        output_format = self.output_format.lower()
        if output_format not in ("magnitude", "complex", "phase"):
            raise ValueError(
                "output_format must be one of 'Magnitude', 'Complex' or 'Phase' "
                f"(case-insensitive), got {self.output_format!r}"
            )

        x = self.reshape_input(x)  # type: ignore

        if self.earlydownsample is True:
            x = downsampling_by_n(x, self.early_downsample_filter, self.downsample_factor, self.match_torch_exactly)

        hop = self.hop_length

        # Getting the top octave CQT
        CQT = get_cqt_complex(x, self.cqt_kernels_real, self.cqt_kernels_imag, hop, self.padding)

        x_down = x  # Preparing a new variable for downsampling

        # Each pass halves the signal rate and the hop, reuses the top-octave kernels, and
        # prepends the resulting (lower) octave along the bin axis.
        for _ in range(self.n_octaves - 1):
            hop = hop // 2
            x_down = downsampling_by_n(x_down, self.lowpass_filter, 2, self.match_torch_exactly)
            CQT1 = get_cqt_complex(x_down, self.cqt_kernels_real, self.cqt_kernels_imag, hop, self.padding)
            CQT = tf.concat((CQT1, CQT), axis=1)

        CQT = CQT[:, -self.n_bins :, :]  # Removing unwanted bottom bins

        # Normalizing the output with the downsampling factor, 2**(self.n_octaves-1) is make it
        # same mag as 1992
        CQT = CQT * self.downsample_factor

        # Normalize again to get same result as librosa
        if self.normalization_type == "librosa":
            CQT *= tf.math.sqrt(tf.cast(self.lengths.reshape((-1, 1, 1)), self.dtype))
        elif self.normalization_type == "convolutional":
            pass
        elif self.normalization_type == "wrap":
            CQT *= 2
        else:
            raise ValueError("The normalization_type %r is not part of our current options." % self.normalization_type)

        if output_format == "magnitude":
            # Getting CQT Amplitude, then swapping the bin and time axes to match the output
            # of the other spectrogram layers.
            return tf.transpose(tf.math.sqrt(tf.math.reduce_sum(tf.math.pow(CQT, 2), axis=-1)), [0, 2, 1])

        if output_format == "complex":
            return CQT

        # output_format == "phase": unit-magnitude (cos, sin) of the angle of each complex value.
        angle = tf.math.atan2(CQT[:, :, :, 1], CQT[:, :, :, 0])
        return tf.stack((tf.math.cos(angle), tf.math.sin(angle)), axis=-1)


# Convenience alias: ``CQT`` refers to the same class object as ``CQT2010v2``.
CQT = CQT2010v2