in demucs/demucs.py [0:0]
def __init__(self,
sources,
# Channels
audio_channels=2,
channels=64,
growth=2.,
# Main structure
depth=6,
rewrite=True,
lstm_layers=0,
# Convolutions
kernel_size=8,
stride=4,
context=1,
# Activations
gelu=True,
glu=True,
# Normalization
norm_starts=4,
norm_groups=4,
# DConv residual branch
dconv_mode=1,
dconv_depth=2,
dconv_comp=4,
dconv_attn=4,
dconv_lstm=4,
dconv_init=1e-4,
# Pre/post processing
normalize=True,
resample=True,
# Weight init
rescale=0.1,
# Metadata
samplerate=44100,
segment=4 * 10):
"""
Args:
sources (list[str]): list of source names
audio_channels (int): stereo or mono
channels (int): first convolution channels
depth (int): number of encoder/decoder layers
growth (float): multiply (resp divide) number of channels by that
for each layer of the encoder (resp decoder)
depth (int): number of layers in the encoder and in the decoder.
rewrite (bool): add 1x1 convolution to each layer.
lstm_layers (int): number of lstm layers, 0 = no lstm. Deactivated
by default, as this is now replaced by the smaller and faster small LSTMs
in the DConv branches.
kernel_size (int): kernel size for convolutions
stride (int): stride for convolutions
context (int): kernel size of the convolution in the
decoder before the transposed convolution. If > 1,
will provide some context from neighboring time steps.
gelu: use GELU activation function.
glu (bool): use glu instead of ReLU for the 1x1 rewrite conv.
norm_starts: layer at which group norm starts being used.
decoder layers are numbered in reverse order.
norm_groups: number of groups for group norm.
dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
dconv_depth: depth of residual DConv branch.
dconv_comp: compression of DConv branch.
dconv_attn: adds attention layers in DConv branch starting at this layer.
dconv_lstm: adds a LSTM layer in DConv branch starting at this layer.
dconv_init: initial scale for the DConv branch LayerScale.
normalize (bool): normalizes the input audio on the fly, and scales back
the output by the same amount.
resample (bool): upsample x2 the input and downsample /2 the output.
rescale (int): rescale initial weights of convolutions
to get their standard deviation closer to `rescale`.
samplerate (int): stored as meta information for easing
future evaluations of the model.
segment (float): duration of the chunks of audio to ideally evaluate the model on.
This is used by `demucs.apply.apply_model`.
"""
super().__init__()
self.audio_channels = audio_channels
self.sources = sources
self.kernel_size = kernel_size
self.context = context
self.stride = stride
self.depth = depth
self.resample = resample
self.channels = channels
self.normalize = normalize
self.samplerate = samplerate
self.segment = segment
self.encoder = nn.ModuleList()
self.decoder = nn.ModuleList()
self.skip_scales = nn.ModuleList()
if glu:
activation = nn.GLU(dim=1)
ch_scale = 2
else:
activation = nn.ReLU()
ch_scale = 1
if gelu:
act2 = nn.GELU
else:
act2 = nn.ReLU
in_channels = audio_channels
padding = 0
for index in range(depth):
norm_fn = lambda d: nn.Identity() # noqa
if index >= norm_starts:
norm_fn = lambda d: nn.GroupNorm(norm_groups, d) # noqa
encode = []
encode += [
nn.Conv1d(in_channels, channels, kernel_size, stride),
norm_fn(channels),
act2(),
]
attn = index >= dconv_attn
lstm = index >= dconv_lstm
if dconv_mode & 1:
encode += [DConv(channels, depth=dconv_depth, init=dconv_init,
compress=dconv_comp, attn=attn, lstm=lstm)]
if rewrite:
encode += [
nn.Conv1d(channels, ch_scale * channels, 1),
norm_fn(ch_scale * channels), activation]
self.encoder.append(nn.Sequential(*encode))
decode = []
if index > 0:
out_channels = in_channels
else:
out_channels = len(self.sources) * audio_channels
if rewrite:
decode += [
nn.Conv1d(channels, ch_scale * channels, 2 * context + 1, padding=context),
norm_fn(ch_scale * channels), activation]
if dconv_mode & 2:
decode += [DConv(channels, depth=dconv_depth, init=dconv_init,
compress=dconv_comp, attn=attn, lstm=lstm)]
decode += [nn.ConvTranspose1d(channels, out_channels,
kernel_size, stride, padding=padding)]
if index > 0:
decode += [norm_fn(out_channels), act2()]
self.decoder.insert(0, nn.Sequential(*decode))
in_channels = channels
channels = int(growth * channels)
channels = in_channels
if lstm_layers:
self.lstm = BLSTM(channels, lstm_layers)
else:
self.lstm = None
if rescale:
rescale_module(self, reference=rescale)