def __getitem__()

in datasets/AVideoDataset.py [0:0]
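
Fetches one dataset item. In "train"/"val" mode the clip is sampled randomly (or center-cropped when center_crop is set); in "test" mode the temporal and spatial crop indices are derived deterministically from self._spatial_temporal_idx. Returns the augmented video clip(s), the decoded audio when decode_audio is set, the label, the capped index, and the video index.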


    def __getitem__(self, index):
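        # Remap the incoming index onto the set of valid videos; the
        # original index is kept and returned below as index_capped.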
        index_capped = index
        index = self.valid_indices[index_capped]
        if self.mode in ["train", "val"]:
            # -1 indicates random sampling.
            temporal_sample_index = -1
            spatial_sample_index = -1
            max_scale = self.train_jitter_scales[1]
            if self.center_crop:
                spatial_sample_index = 1
                max_scale = self.train_crop_size
        elif self.mode in ["test"]:
            temporal_sample_index = (
                    self._spatial_temporal_idx[index] // self.num_spatial_crops
            )
            # spatial_sample_index is in [0, 1, 2]. Corresponding to left,
            # center, or right if width is larger than height, and top, middle,
            # or bottom if height is larger than width.
            spatial_sample_index = (
                    self._spatial_temporal_idx[index] % self.num_spatial_crops
            )
            min_scale, max_scale, crop_size = [self.test_crop_size] * 3
            # The testing is deterministic and no jitter should be performed.
            # min_scale, max_scale, and crop_size are expected to be the same.
            # assert len({min_scale, max_scale, crop_size}) == 1
        else:
            raise NotImplementedError(
                "Does not support {} mode".format(self.mode)
            )

        # Number of clips to decode per video; V and A collect the video
        # and audio clips respectively.
        num_clips = 1
        V = []
        A = []

        for i in range(num_clips):
            # Try to decode and sample a clip from a video.
            video_container = get_video_container(
                self._path_to_videos[index],
                ENABLE_MULTI_THREAD_DECODE,
                DECODING_BACKEND,
            )

            # Decode video. Meta info is used to perform selective decoding.
            frames, audio = decode(
                self._path_to_videos[index],
                video_container,
                self.sample_rate,
                self.num_frames,
                temporal_sample_index,
                self.num_ensemble_views,
                video_meta=self._video_meta[index],
                target_fps=int(self.target_fps),
                backend=DECODING_BACKEND,
                max_spatial_scale=max_scale,
                decode_audio=self.decode_audio,
                aud_sample_rate=self.aud_sample_rate,
                aud_spec_type=self.aud_spec_type,
                use_volume_jittering=self.use_volume_jittering,
                num_sec=int(self.num_sec),
                use_temporal_jittering=self.use_temporal_jittering,
                z_normalize=self.z_normalize,
            )

            # Perform data augmentation on the video clip. With multi_crop
            # enabled, two independently augmented views of the same frames
            # are produced.
            multi_crop_clips = 2 if self.multi_crop else 1
            use_random_resize_crop = self.use_random_resize_crop and self.mode == "train"
            for j in range(multi_crop_clips):
                # Note: this replaces any test-mode scales set above with
                # the training jitter scales and crop size.
                min_scale, max_scale, crop_size = (self.train_jitter_scales[0],
                                                   self.train_jitter_scales[1],
                                                   self.train_crop_size)
                if use_random_resize_crop:
                    # 0.14-1.0 is a relative scale range for the random resized crop.
                    min_scale, max_scale, crop_size = 0.14, 1.0, self.train_crop_size
                V.append(clip_augmentation(
                    frames.clone(),
                    spatial_idx=spatial_sample_index,
                    min_scale=min_scale,
                    max_scale=max_scale,
                    crop_size=crop_size,
                    colorjitter=self.colorjitter,
                    use_grayscale=self.use_grayscale,
                    use_gaussian=self.use_gaussian,
                    use_random_resize_crop=use_random_resize_crop
                ))
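            # Audio is appended once per decoded clip, not per multi-crop view.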
            A.append(audio)

        # Get the label and the video-level index.
        label = self._labels[index]
        vid_idx = self._vid_indices[index]

        # Return results. With multi_crop, V stays a list of augmented
        # views; otherwise the clips are concatenated into one tensor.
        if not self.multi_crop:
            V = torch.cat(V, dim=0)
        if self.decode_audio:
            audio = torch.cat(A, dim=0)
            return V, audio, label, index_capped, vid_idx
        else:
            return V, label, index_capped, vid_idx
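
A minimal consumption sketch, assuming an already-constructed AVideoDataset with multi_crop=False and decode_audio=True (the constructor arguments are repo-specific and omitted here; `dataset` is a placeholder):

    from torch.utils.data import DataLoader

    # Hypothetical usage: `dataset` stands in for a constructed AVideoDataset.
    loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)
    for video, audio, label, idx, vid_idx in loader:
        # video: batched clip tensor; audio: batched audio features.
        print(video.shape, audio.shape, label.shape)
        break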