# Excerpt from model.py — forward() method of the audio-visual model.

    def forward(self, img, spec, params=None):
        """Encode video and audio, optionally computing NCE features for crops.

        Args:
            img: video input tensor; assumed (B, C, L*N, H, W) per the
                dimension legend below — TODO confirm against caller.
            spec: audio spectrogram tensor; assumed (B, C, S, T*N).
            params: optional crop specification ``[space, time]`` where
                ``space = [[large crop locations], [small crop locations]]``;
                a spatial location is ``[xmin, xmax, ymin, ymax]`` and a
                temporal location is ``[tmin, tmax]``.

        Returns:
            Tuple ``(img_embed, [crop_nces, tcrop_nces], spec_embed)``.
            The two NCE lists are ``[[], []]`` when ``params`` is None.
        """
        ## B: batch size
        ## N: num of chunk (num of sec)
        ## C: num of channel
        ## L: num of input frames for video per chunk
        ## H: height
        ## W: width
        ## T: num of input windows for audio per chunk
        ## S: num of bank (spectrogram)

        # Run backbone architecture
        # B C LN H W => B H V
        img = self.video_network(img).squeeze()
        # B C S TN => B H A
        spec = self.audio_network(spec).squeeze()

        # Fix: initialize unconditionally. The original assigned these only
        # inside the `if params is not None` branch, so the return statement
        # raised NameError whenever params was left at its default of None.
        crop_nces = [[], []]
        tcrop_nces = [[], []]

        # Feature Cropping Layer
        if params is not None:
            s_large_crops, s_small_crops = len(params[0][0]), len(params[0][1])
            t_large_crops, t_small_crops = len(params[1][0]), len(params[1][1])
            # Spatial crops over the trailing (H, W) axes.
            # NOTE(review): the small-crop loop is nested inside the
            # large-crop loop, so every small crop is encoded once per large
            # crop (s_large * s_small entries total) — confirm this
            # duplication is intentional and not an indentation slip.
            for i in range(s_large_crops):
                xmin, xmax, ymin, ymax = params[0][0][i]
                crop_nces[0].append(self.feat2nce(img[..., xmin:xmax, ymin:ymax]))
                for j in range(s_small_crops):
                    xmin, xmax, ymin, ymax = params[0][1][j]
                    crop_nces[1].append(self.feat2nce(img[..., xmin:xmax, ymin:ymax]))
            # Temporal crops over axis 2 (same nesting/duplication pattern).
            for ti in range(t_large_crops):
                tmin, tmax = params[1][0][ti]
                tcrop_nces[0].append(self.feat2nce(img[:, :, tmin:tmax, :, :]))
                for tj in range(t_small_crops):
                    tmin, tmax = params[1][1][tj]
                    tcrop_nces[1].append(self.feat2nce(img[:, :, tmin:tmax, :, :]))

        # Temporal Pooling: B V H => B H
        img = self.video_pooling(img)

        # Reshape Layer: restore the batch dim that squeeze() drops when B == 1.
        if len(spec.shape) == 1:
            spec = spec.unsqueeze(0)
        img = img.view(-1, self.encoder_dim)

        # MLP projection layer
        img = self.mlp_v(img)
        spec = self.mlp_a(spec)

        # Optional L2 normalization of both embeddings.
        if self.norm_feat:
            img = F.normalize(img, p=2, dim=1)
            spec = F.normalize(spec, p=2, dim=1)

        return (img, [crop_nces, tcrop_nces], spec)