def __getitem__()

in datasets/AVideoDataset.py [0:0]


    def __getitem__(self, index):
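        # Remap the incoming dataloader index onto the set of valid videos
        # (valid_indices presumably filters out entries that failed indexing).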
        index_capped = index
        index = self.valid_indices[index_capped]
        if self.mode in ["train", "val"]:
            # -1 indicates random sampling.
            temporal_sample_index = -1
            spatial_sample_index = -1
            min_scale = self.train_jitter_scles[0]
            max_scale = self.train_jitter_scles[1]
            crop_size = self.train_crop_size
            if self.center_crop:
                spatial_sample_index = 1
                min_scale = self.train_crop_size
                max_scale = self.train_crop_size
                crop_size = self.train_crop_size
        elif self.mode in ["test"]:
            temporal_sample_index = (
                self._spatial_temporal_idx[index] // self.num_spatial_crops
            )
            # spatial_sample_index is in [0, 1, 2], corresponding to a left,
            # center, or right crop if width is larger than height, and a top,
            # middle, or bottom crop if height is larger than width.
            spatial_sample_index = (
                self._spatial_temporal_idx[index] % self.num_spatial_crops
            )
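            # Worked example (assuming num_spatial_crops = 3):
            # _spatial_temporal_idx = 7 -> temporal clip 7 // 3 = 2,
            # spatial crop 7 % 3 = 1 (center).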
            min_scale, max_scale, crop_size = [self.test_crop_size] * 3
            # The testing is deterministic and no jitter should be performed.
            # min_scale, max_scale, and crop_size are expected to be the same.
            assert len({min_scale, max_scale, crop_size}) == 1
        else:
            raise NotImplementedError(
                "Does not support {} mode".format(self.mode)
            )

        # Get number of clips: with dual_data, two clips are drawn from the same
        # video (e.g., to form a positive pair); otherwise a single clip is used.
        if self.mode in ["train", "val"] and self.dual_data:
            num_clips = 2
        else:
            num_clips = 1
        V = []
        A = []

        for _ in range(num_clips):
            # Try to decode and sample a clip from a video.
            video_container = get_video_container(
                self._path_to_videos[index],
                ENABLE_MULTI_THREAD_DECODE,
                DECODING_BACKEND,
            )

            # Decode video. Meta info is used to perform selective decoding.
            frames, audio = decode(
                self._path_to_videos[index],
                video_container,
                self.sample_rate,
                self.num_frames,
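                # With temporal jittering disabled, the (500, 1000) pair presumably
                # asks the decoder for a fixed clip (roughly the middle of the
                # video) rather than a randomly sampled one.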
                temporal_sample_index if self.temp_jitter else 500,
                self.num_ensemble_views if self.temp_jitter else 1000,
                video_meta=self._video_meta[index],
                target_fps=int(self.target_fps),
                backend=DECODING_BACKEND,
                max_spatial_scale=max_scale,
                decode_audio=self.decode_audio,
                aug_audio=self.aug_audio,
                num_sec=int(self.num_sec),
                aud_sample_rate=self.aud_sample_rate,
                aud_spec_type=self.aud_spec_type,
                use_volume_jittering=self.use_volume_jittering,
                use_temporal_jittering=self.use_temporal_jittering,
                z_normalize=self.z_normalize,
            )
            
            # Perform data augmentation on video clip.
            frames = clip_augmentation(
                frames,
                spatial_idx=spatial_sample_index,
                min_scale=min_scale,
                max_scale=max_scale,
                crop_size=crop_size,
                colorjitter=self.colorjitter,
                use_grayscale=self.use_grayscale,
                use_gaussian=self.use_gaussian,
            )
            V.append(frames)
            A.append(audio)

        # Concatenate the sampled clips (a single clip unless dual_data is on)
        frames = torch.cat(V, dim=0)

        # Get labels and indices
        label = self._labels[index]
        vid_idx = self._vid_indices[index]

        # Return results
        if self.decode_audio:
            audio = torch.cat(A, dim=0)
            return frames, audio, label, index_capped, vid_idx
        else:
            return frames, label, index_capped, vid_idx
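
For reference, below is a minimal sketch (not part of the repository) of how a
training loop might consume these outputs. It assumes decode_audio is enabled,
so each item is (frames, audio, label, index, vid_idx); build_loader and
train_dataset are hypothetical names used only for illustration, and the
DataLoader wrapping is plain PyTorch.

    from torch.utils.data import DataLoader

    def build_loader(dataset, batch_size=8, num_workers=4):
        # Standard PyTorch wrapping; each item comes from __getitem__ above and
        # is either (frames, audio, label, index, vid_idx) or
        # (frames, label, index, vid_idx), depending on decode_audio.
        return DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
            pin_memory=True,
            drop_last=True,
        )

    # Assumed usage with decode_audio=True (train_dataset is hypothetical):
    # for frames, audio, label, idx, vid_idx in build_loader(train_dataset):
    #     ...  # frames holds num_clips concatenated clips (two when dual_data is on)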