in model.py
# Requires at the top of model.py: import torch.nn.functional as F
def forward(self, img, spec, params=None):
    ## B: batch size
    ## N: number of chunks (one chunk per second)
    ## C: number of channels
    ## L: number of input video frames per chunk
    ## H: height
    ## W: width
    ## T: number of input audio windows per chunk
    ## S: number of spectrogram banks
    # Run backbone architectures
    # Video: B C LN H W => B H V
    img = self.video_network(img).squeeze()
    # Audio: B C S TN => B H A
    spec = self.audio_network(spec).squeeze()
    # Feature Cropping Layer
    # NCE lists are initialized before the branch so the return below
    # stays valid when params is None.
    crop_nces = [[], []]
    tcrop_nces = [[], []]
    if params is not None:
        # params = [spatial_crops, temporal_crops]
        # spatial_crops  = [[large_crop_locations], [small_crop_locations]]
        # temporal_crops = [[large_crop_locations], [small_crop_locations]]
        # spatial location = [xmin, xmax, ymin, ymax]; temporal location = [tmin, tmax]
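        # A concrete (hypothetical) example for a 4x4 spatial / 4-step temporal
        # feature map:
        #   params = [
        #       [[[0, 4, 0, 4]], [[0, 2, 0, 2], [2, 4, 2, 4]]],  # 1 large, 2 small spatial
        #       [[[0, 4]], [[0, 2], [2, 4]]],                    # 1 large, 2 small temporal
        #   ]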
        s_large_crops, s_small_crops = len(params[0][0]), len(params[0][1])
        t_large_crops, t_small_crops = len(params[1][0]), len(params[1][1])
        # Spatial crops: slice the video feature map's last two (H, W) dims,
        # then project each crop to an NCE embedding.
        for i in range(s_large_crops):
            xmin, xmax, ymin, ymax = params[0][0][i]
            crop_nces[0].append(self.feat2nce(img[..., xmin:xmax, ymin:ymax]))
        for j in range(s_small_crops):
            xmin, xmax, ymin, ymax = params[0][1][j]
            crop_nces[1].append(self.feat2nce(img[..., xmin:xmax, ymin:ymax]))
        # Temporal crops: slice the feature map's time dim (dim 2).
        for ti in range(t_large_crops):
            tmin, tmax = params[1][0][ti]
            tcrop_nces[0].append(self.feat2nce(img[:, :, tmin:tmax, :, :]))
        for tj in range(t_small_crops):
            tmin, tmax = params[1][1][tj]
            tcrop_nces[1].append(self.feat2nce(img[:, :, tmin:tmax, :, :]))
    # Temporal Pooling: B V H => B H
    img = self.video_pooling(img)
    # Reshape Layer: squeeze() above drops the batch dim when B == 1,
    # so restore it here.
    if len(spec.shape) == 1:
        spec = spec.unsqueeze(0)
    img = img.view(-1, self.encoder_dim)
    # MLP projection layer
    img = self.mlp_v(img)
    spec = self.mlp_a(spec)
    # Normalization layer
    if self.norm_feat:
        img = F.normalize(img, p=2, dim=1)
        spec = F.normalize(spec, p=2, dim=1)
    return (img, [crop_nces, tcrop_nces], spec)
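
A minimal driver sketch. The tensor sizes and the AVModel constructor are
assumptions for illustration (neither appears in this excerpt); only the
nested params layout and the return structure follow from the forward pass
above.

import torch

# Hypothetical sizes: B=2, C=3, N=4 chunks, L=8 frames/chunk, 112x112 video;
# 1-channel spectrogram with S=40 banks and T=100 windows per chunk.
img = torch.randn(2, 3, 8 * 4, 112, 112)   # B x C x L*N x H x W
spec = torch.randn(2, 1, 40, 100 * 4)      # B x C x S x T*N

params = [
    [[[0, 4, 0, 4]], [[0, 2, 0, 2]]],  # spatial: [large crops], [small crops]
    [[[0, 4]], [[0, 2]]],              # temporal: [large crops], [small crops]
]

# model = AVModel(...)  # assumed constructor; not defined in this excerpt
# emb_v, (crop_nces, tcrop_nces), emb_a = model(img, spec, params=params)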