in models/networks.py [0:0]
def __init__(self, ngf=64, input_nc=2, output_nc=2):
    """Build the layers of the audio U-Net.

    Args:
        ngf: Base number of filters; the encoder/decoder widths are
            multiples of this value.
        input_nc: Number of channels of the input passed to the first
            encoder layer.
        output_nc: Number of channels produced by the last decoder layer.
    """
    super(AudioNet, self).__init__()

    # Width of the flattened visual feature that is combined with the
    # audio bottleneck feature (value taken from the original hard-coded
    # 1296 = 784 + 512).
    # NOTE(review): presumably 8 channels (see conv1x1 below) times the
    # spatial size of the visual feature map -- confirm against forward().
    visual_feature_nc = 784
    # The audio bottleneck feature is the output of audionet_convlayer5,
    # which has ngf * 8 channels (512 for the default ngf=64). Deriving it
    # from ngf keeps the first decoder layer consistent for any ngf instead
    # of only for ngf=64.
    audio_feature_nc = ngf * 8

    # Encoder: channel width grows from input_nc up to ngf * 8.
    self.audionet_convlayer1 = unet_conv(input_nc, ngf)
    self.audionet_convlayer2 = unet_conv(ngf, ngf * 2)
    self.audionet_convlayer3 = unet_conv(ngf * 2, ngf * 4)
    self.audionet_convlayer4 = unet_conv(ngf * 4, ngf * 8)
    self.audionet_convlayer5 = unet_conv(ngf * 8, ngf * 8)

    # Decoder: the first layer consumes the audio-visual feature
    # (visual feature + audio feature).
    # NOTE(review): the remaining decoder inputs are twice the previous
    # decoder output, which looks like skip connections from the encoder --
    # confirm against forward().
    self.audionet_upconvlayer1 = unet_upconv(visual_feature_nc + audio_feature_nc, ngf * 8)
    self.audionet_upconvlayer2 = unet_upconv(ngf * 16, ngf * 4)
    self.audionet_upconvlayer3 = unet_upconv(ngf * 8, ngf * 2)
    self.audionet_upconvlayer4 = unet_upconv(ngf * 4, ngf)
    # Outermost layer uses a sigmoid to bound the mask.
    self.audionet_upconvlayer5 = unet_upconv(ngf * 2, output_nc, True)

    # Reduce the dimension of the extracted visual features (512 -> 8
    # channels, kernel size 1, no padding -- assuming create_conv's
    # positional order is (in, out, kernel, padding); TODO confirm).
    self.conv1x1 = create_conv(512, 8, 1, 0)