in models/encoders.py [0:0]
def __init__(self, latent_dim: int = 128, model_name: str = 'audio_encoder'):
    """
    Build the audio encoder: a mel-spectrogram front end followed by a
    stack of dilated 1-D convolutions and a linear projection to the
    latent code.

    :param latent_dim: size of the latent audio embedding
    :param model_name: name of the model, used to load and save the model
    """
    super().__init__(model_name)

    # 16 kHz audio -> 80 mel bands; hop of 160 samples gives 10 ms frames.
    self.melspec = ta.transforms.MelSpectrogram(
        sample_rate=16000, n_fft=2048, win_length=800, hop_length=160, n_mels=80
    )

    kernel_size = 5
    # Lift the 80 mel channels to the 128-channel working width.
    self.convert_dimensions = th.nn.Conv1d(80, 128, kernel_size=kernel_size)
    self.weights_init(self.convert_dimensions)

    # Receptive field (in frames) grows as layers are added below.
    self.receptive_field = kernel_size

    # Six dilated conv layers; dilations cycle through 2, 4, 6, 2, 4, 6.
    layers = []
    for layer_idx in range(6):
        dilation = 2 * (layer_idx % 3 + 1)
        self.receptive_field += (kernel_size - 1) * dilation
        layer = th.nn.Conv1d(128, 128, kernel_size=kernel_size, dilation=dilation)
        self.weights_init(layer)
        layers.append(layer)
    self.convs = th.nn.ModuleList(layers)

    # Final projection from the working width to the latent embedding.
    self.code = th.nn.Linear(128, latent_dim)

    # Run the initializer over every submodule.
    # NOTE(review): this re-initializes the convs already passed through
    # weights_init above — presumably intentional; confirm with weights_init.
    self.apply(self.weights_init)