in torchbenchmark/models/demucs/demucs/model.py [0:0]
def __init__(self,
sources: int=4,
audio_channels: int=2,
channels: int=64,
depth: int=6,
rewrite: bool=True,
glu: bool=True,
upsample: bool=False,
rescale: float=0.1,
kernel_size: int=8,
stride: int=4,
growth: float=2.,
lstm_layers: int=2,
context: int=3) -> None:
"""
Args:
sources (int): number of sources to separate
audio_channels (int): stereo or mono
channels (int): first convolution channels
depth (int): number of encoder/decoder layers
rewrite (bool): add 1x1 convolution to each encoder layer
and a convolution to each decoder layer.
For the decoder layer, `context` gives the kernel size.
glu (bool): use glu instead of ReLU
upsample (bool): use linear upsampling with convolutions
Wave-U-Net style, instead of transposed convolutions
rescale (int): rescale initial weights of convolutions
to get their standard deviation closer to `rescale`
kernel_size (int): kernel size for convolutions
stride (int): stride for convolutions
growth (float): multiply (resp divide) number of channels by that
for each layer of the encoder (resp decoder)
lstm_layers (int): number of lstm layers, 0 = no lstm
context (int): kernel size of the convolution in the
decoder before the transposed convolution. If > 1,
will provide some context from neighboring time
steps.
"""
super().__init__()
self.audio_channels = audio_channels
self.sources = sources
self.kernel_size = kernel_size
self.context = context
self.stride = stride
self.depth = depth
self.upsample = upsample
self.channels = channels
self.encoder = nn.ModuleList()
self.decoder = nn.ModuleList()
self.final = None
if upsample:
self.final = nn.Conv1d(channels + audio_channels, sources * audio_channels, 1)
stride = 1
if glu:
activation = nn.GLU(dim=1)
ch_scale = 2
else:
activation = nn.ReLU()
ch_scale = 1
in_channels = audio_channels
for index in range(depth):
encode = []
encode += [nn.Conv1d(in_channels, channels, kernel_size, stride), nn.ReLU()]
if rewrite:
encode += [nn.Conv1d(channels, ch_scale * channels, 1), activation]
self.encoder.append(nn.Sequential(*encode))
decode = []
if index > 0:
out_channels = in_channels
else:
if upsample:
out_channels = channels
else:
out_channels = sources * audio_channels
if rewrite:
decode += [nn.Conv1d(channels, ch_scale * channels, context), activation]
if upsample:
decode += [
nn.Conv1d(channels, out_channels, kernel_size, stride=1),
]
else:
decode += [nn.ConvTranspose1d(channels, out_channels, kernel_size, stride)]
if index > 0:
decode.append(nn.ReLU())
self.decoder.insert(0, nn.Sequential(*decode))
in_channels = channels
channels = int(growth * channels)
channels = in_channels
if lstm_layers:
self.lstm = BLSTM(channels, lstm_layers)
else:
self.lstm = None
if rescale:
rescale_module(self, reference=rescale)