in models/encoders.py
def forward(self, audio: th.Tensor):
    """
    :param audio: B x T x 16000 Tensor containing 1 sec of audio centered around the current time frame
    :return: code: B x T x latent_dim Tensor containing a latent audio code/embedding
    """
    B, T = audio.shape[0], audio.shape[1]

    # Log-mel spectrogram of each 1 s window, clamped to avoid log(0)
    x = self.melspec(audio).squeeze(1)
    x = th.log(x.clamp(min=1e-10, max=None))
    if T == 1:
        x = x.unsqueeze(1)

    # Convert to the right dimensionality: fold batch and time together so
    # every window is encoded independently, with mel bins as channels
    x = x.view(-1, x.shape[2], x.shape[3])
    x = F.leaky_relu(self.convert_dimensions(x), .2)

    # Process stacks: the convs shrink the temporal axis, so center-crop the
    # residual branch to match before averaging it with the conv output
    for conv in self.convs:
        x_ = F.leaky_relu(conv(x), .2)
        if self.training:
            x_ = F.dropout(x_, .2)
        l = (x.shape[2] - x_.shape[2]) // 2
        x = (x[:, :, l:-l] + x_) / 2

    # Average-pool the remaining frames, restore the (B, T) layout, and
    # project each window to its latent code
    x = th.mean(x, dim=-1)
    x = x.view(B, T, x.shape[-1])
    x = self.code(x)

    return {"code": x}
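The method references several module attributes (melspec, convert_dimensions, convs, code) defined outside this excerpt. Below is a minimal constructor sketch consistent with the forward pass; the attribute names come from the method itself, but the hyperparameters (mel settings, channel widths, kernel sizes, dilations) are illustrative assumptions, not values read from the repository. Pairing the forward method above with this constructor yields a runnable module.

import torch as th
import torch.nn.functional as F
import torchaudio as ta


class AudioEncoder(th.nn.Module):
    def __init__(self, latent_dim: int = 128):
        super().__init__()
        # 80-bin mel spectrogram of 16 kHz audio; window/hop sizes here are
        # assumptions chosen so 1 s of audio yields ~100 frames
        self.melspec = ta.transforms.MelSpectrogram(
            sample_rate=16000, n_fft=2048, win_length=800,
            hop_length=160, n_mels=80)
        # 1D conv mapping mel bins (channels) to the model width
        self.convert_dimensions = th.nn.Conv1d(80, 128, kernel_size=5)
        # Unpadded dilated conv stack; each layer shrinks the temporal axis,
        # which is why forward() center-crops the residual branch
        self.convs = th.nn.ModuleList([
            th.nn.Conv1d(128, 128, kernel_size=5, dilation=2 * (i % 3 + 1))
            for i in range(6)])
        # Final projection to the latent audio code
        self.code = th.nn.Linear(128, latent_dim)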
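A hypothetical smoke test under the same assumptions, with shapes following the docstring (B x T x 16000 in, B x T x latent_dim out):

# Batch of 2 sequences, 8 time frames each, 1 s of 16 kHz audio per frame
encoder = AudioEncoder(latent_dim=128)
encoder.eval()  # disables the dropout branch in forward()

audio = th.randn(2, 8, 16000)
with th.no_grad():
    out = encoder(audio)

print(out["code"].shape)  # torch.Size([2, 8, 128])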