def forward()

in models/encoders.py [0:0]


    def forward(self, audio: th.Tensor):
        """
        :param audio: B x T x 16000 Tensor containing one second of 16 kHz audio centered around each time frame
        :return: code: B x T x latent_dim Tensor containing a latent audio code/embedding per frame
        """
        B, T = audio.shape[0], audio.shape[1]
        x = self.melspec(audio).squeeze(1)        # mel spectrogram for each 1 s audio window
        x = th.log(x.clamp(min=1e-10, max=None))  # clamp before log to avoid log(0) = -inf
        if T == 1:
            x = x.unsqueeze(1)  # re-insert the frame dimension so x is B x T x n_mels x frames

        # Convert to the right dimensionality
        x = x.view(-1, x.shape[2], x.shape[3])  # fold the T frames into the batch: (B*T) x n_mels x frames
        x = F.leaky_relu(self.convert_dimensions(x), 0.2)

        # Process stacks
        for conv in self.convs:
            x_ = F.leaky_relu(conv(x), 0.2)
            if self.training:
                x_ = F.dropout(x_, 0.2)
            # center-crop the skip path to the (shorter) conv output, then average the two paths
            l = (x.shape[2] - x_.shape[2]) // 2
            x = (x[:, :, l:-l] + x_) / 2

        x = th.mean(x, dim=-1)         # average-pool over the remaining frames of each window
        x = x.view(B, T, x.shape[-1])  # restore the B x T layout
        x = self.code(x)               # project to the latent audio code

        return {"code": x}
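
For reference, the following is a minimal, self-contained sketch (not taken from the repository) that replays the same shape transformations on dummy data. The mel-spectrogram settings, the three unpadded Conv1d layers, and latent_dim = 128 are illustrative assumptions standing in for self.melspec, self.convert_dimensions, self.convs, and self.code; the real hyperparameters are defined in the encoder's constructor in models/encoders.py, which is not shown here.

    # hypothetical shape walkthrough mirroring forward() above (dropout and the T == 1 branch omitted)
    import torch as th
    import torch.nn.functional as F
    import torchaudio

    B, T, n_mels, latent_dim = 2, 4, 80, 128

    # stand-ins for the module attributes used by forward(); all hyperparameters are assumed
    melspec = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=400,
                                                   hop_length=200, n_mels=n_mels)
    convert_dimensions = th.nn.Conv1d(n_mels, 128, kernel_size=3)
    convs = th.nn.ModuleList([th.nn.Conv1d(128, 128, kernel_size=3) for _ in range(3)])
    code = th.nn.Linear(128, latent_dim)

    audio = th.randn(B, T, 16000)                 # B x T windows of 1 s at 16 kHz
    x = melspec(audio).squeeze(1)                 # B x T x n_mels x frames
    x = th.log(x.clamp(min=1e-10))                # log-mel; clamp avoids log(0)
    x = x.view(-1, x.shape[2], x.shape[3])        # (B*T) x n_mels x frames
    x = F.leaky_relu(convert_dimensions(x), 0.2)  # (B*T) x 128 x frames'
    for conv in convs:
        x_ = F.leaky_relu(conv(x), 0.2)
        crop = (x.shape[2] - x_.shape[2]) // 2    # unpadded convs shrink the time axis
        x = (x[:, :, crop:-crop] + x_) / 2        # averaged residual over the cropped skip path
    x = th.mean(x, dim=-1)                        # pool over frames -> (B*T) x 128
    x = code(x.view(B, T, -1))                    # B x T x latent_dim
    print(x.shape)                                # torch.Size([2, 4, 128])

The printed shape matches the documented return contract of B x T x latent_dim.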