in models/src/wavenet_vocoder/wavenet.py [0:0]
def forward(self, x, c=None, g=None, softmax=False):
"""Forward step
Args:
x (Tensor): One-hot encoded audio signal, shape (B x C x T)
c (Tensor): Local conditioning features,
shape (B x cin_channels x T)
g (Tensor): Global conditioning features,
shape (B x gin_channels x 1) or speaker Ids of shape (B x 1).
Note that ``self.use_speaker_embedding`` must be False when you
want to disable embedding layer and use external features
directly (e.g., one-hot vector).
Also type of input tensor must be FloatTensor, not LongTensor
in case of ``self.use_speaker_embedding`` equals False.
softmax (bool): Whether applies softmax or not.
Returns:
Tensor: output, shape B x out_channels x T
"""
    B, _, T = x.size()

    if g is not None:
        if self.embed_speakers is not None:
            # (B x 1) -> (B x 1 x gin_channels)
            g = self.embed_speakers(g.view(B, -1))
            # (B x gin_channels x 1)
            g = g.transpose(1, 2)
            assert g.dim() == 3

    # Expand global conditioning features to all time steps
    # (see the shape-contract sketch after this function)
    g_bct = _expand_global_features(B, T, g, bct=True)
    if c is not None and self.upsample_net is not None:
        c = self.upsample_net(c)
        assert c.size(-1) == x.size(-1)
    # Feed data to network
    x = self.first_conv(x)
    skips = 0
    for f in self.conv_layers:
        x, h = f(x, c, g_bct)
        skips += h
    # Scale the summed skip connections by 1/sqrt(N) so their variance
    # stays roughly constant regardless of the number of layers
    skips *= math.sqrt(1.0 / len(self.conv_layers))

    x = skips
    for f in self.last_conv_layers:
        x = f(x)

    x = F.softmax(x, dim=1) if softmax else x
    return x
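
# --- Shape-contract sketch (not from the original source) ------------------
# _expand_global_features is defined elsewhere in this file; the sketch
# below only illustrates the behavior the call above relies on: broadcasting
# a (B x gin_channels x 1) tensor across all T time steps. The name and body
# are assumptions for illustration, not the repo's implementation.
def _expand_global_features_sketch(B, T, g, bct=True):
    if g is None:
        return None
    g = g.expand(B, g.size(1), T)  # (B x gin_channels x T)
    return g.contiguous() if bct else g.transpose(1, 2).contiguous()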
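
# --- Usage sketch (not from the original source) ----------------------------
# A minimal example of driving this forward pass with dummy one-hot inputs.
# The constructor arguments are assumptions for illustration; the actual
# WaveNet signature in this file may differ.
if __name__ == "__main__":
    import torch

    model = WaveNet(out_channels=256)  # hypothetical config: 256 mu-law classes
    B, C, T = 2, 256, 100
    x = torch.zeros(B, C, T)
    # One-hot encode a random class index at every time step
    x.scatter_(1, torch.randint(0, C, (B, 1, T)), 1.0)
    with torch.no_grad():
        y = model(x, softmax=True)  # per-step class distributions
    print(y.shape)  # expected: torch.Size([2, 256, 100])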