def __getitem__()

in datasets/base_video_dataset.py


    def __getitem__(self, idx):
        idx = self._class_balance_data_idx(idx)  # Must be run before repeat
        idx = self._repeat_process_idx(idx)
        df_row = self.df.loc[idx, :]
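        # The two hydra-configured hooks below may rewrite the row (e.g.
        # convert it into an anticipation sample) or reject it by returning
        # None, in which case this sample is dropped.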
        if self.conv_to_anticipate_fn_runtime is not None:
            df_row = hydra.utils.call(self.conv_to_anticipate_fn_runtime,
                                      df_row, self.df, self.root,
                                      self.addl_df_proc_for_dense)
        if df_row is None:
            return None
        if self.process_df_before_read_fn is not None:
            df_row = hydra.utils.call(self.process_df_before_read_fn, df_row,
                                      self.root, self.rng, self.label_type,
                                      self.frames_per_clip, self.frame_rate,
                                      self.sample_strategy, self.dummy_label)
        if df_row is None:
            return None
        video_dict = self._get_video(df_row)
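        # _get_video returns a dict holding at least the decoded 'video'
        # tensor and 'video_frame_sec' (per-frame timestamps in seconds).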
        video = video_dict['video']
        orig_video_shape = video.shape
        if len(orig_video_shape) == 5:
            # #ncrops, C, T, H, W -- flatten first 2 dims for subclips
            video = video.flatten(0, 1)
        # #ncrops * C, T, H, W -> #clips, #ncrops * C, T', H, W
        video = self._get_subclips(video, **self.subclips_options)
        if len(orig_video_shape) == 5:
            # unflatten back
            video = video.reshape((video.size(0), ) + orig_video_shape[:2] +
                                  video.shape[-3:])
        video_dict['video'] = video
        # squeeze(1) because after _get_subclips the 0th dim is the clips
        video_dict['video_frame_sec'] = self._get_subclips(
            video_dict['video_frame_sec'].unsqueeze(0),
            **self.subclips_options).squeeze(1)
        sentence = self._get_text(df_row)  # Not used at the moment
        label_idx = self._get_labels(df_row)
        video_dict.update({
            'idx': idx,
            'text': sentence,
            'target': label_idx,
            'audio': [],  # TODO?
            'orig_vid_len': df_row.video_len if 'video_len' in df_row else -1,
            'uid': df_row.uid,
        })
        if self.load_seg_labels:
            video_dict['target_subclips'] = self._get_vidseg_labels(
                df_row, video_dict['video_frame_sec'])
        if self.load_long_term_future_labels > 0:
            # This is only really used for visualization for now
            last_frame = video_dict['video_frame_sec'][-1].item()
            gap_in_frames = (last_frame -
                             video_dict['video_frame_sec'][-2].item())
            # Extrapolate future timestamps at the same inter-frame gap
            future_frame_sec = torch.FloatTensor([
                last_frame + gap_in_frames * i
                for i in range(1, self.load_long_term_future_labels + 1)
            ]).reshape(-1, 1)
            video_dict['future_subclips'] = self._get_vidseg_labels(
                df_row, future_frame_sec)
        return video_dict
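
Since __getitem__ can return None (when either runtime processing hook rejects the row), a DataLoader built on this dataset needs a collate function that drops the None samples before batching. Below is a minimal sketch of such a consumer; the collate_filter_none helper and the loader settings are illustrative assumptions, not part of base_video_dataset.py.

from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate


def collate_filter_none(batch):
    # __getitem__ returns None when a runtime hook rejects the row; drop
    # those entries so default_collate only sees valid video_dicts.
    batch = [sample for sample in batch if sample is not None]
    if len(batch) == 0:
        return None
    return default_collate(batch)


# Hypothetical usage; the real dataset/loader are normally built via hydra.
# loader = DataLoader(dataset, batch_size=8, num_workers=4,
#                     collate_fn=collate_filter_none)
# for video_dict in loader:
#     if video_dict is None:
#         continue  # every sample in this batch was rejected
#     video = video_dict['video']    # (B, #clips, [#ncrops,] C, T', H, W)
#     target = video_dict['target']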