def __getitem__()

in datasets/video_db.py


    def __getitem__(self, index):
        if self.mode == 'clip':
            try:
                # Sample a random temporal snippet and return a single clip;
                # if loading or decoding fails, fall back to the next index.
                sample_idx = index % self.num_samples
                video_ctr, audio_ctr = self._load_sample(sample_idx)
                v_ss, v_dur, a_ss, a_dur = self._sample_snippet(video_ctr, audio_ctr)
                sample = self._get_clip(sample_idx, video_ctr, audio_ctr, v_ss, a_ss, video_clip_duration=v_dur, audio_clip_duration=a_dur)
                if sample is None:
                    return self[(index+1) % len(self)]

                return sample
            except Exception:
                return self[(index+1) % len(self)]

        else:
            # Load the whole video once and split it into `clips_per_video`
            # uniformly spaced, possibly overlapping chunks.
            video_ctr, audio_ctr = self._load_sample(index)

            # Load entire video
            vs, vf, ss, sf = self._get_time_lims(video_ctr, audio_ctr)
            start_time = vs
            final_time = vf
            if self.return_audio:
                # Reconcile the video time range with the audio stream limits.
                start_time = max(vs, ss) if ss < 0 else vs
                final_time = min(vf, sf) if ss < 0 else vf
            if final_time <= start_time:
                # Guard against an empty or inverted time range.
                final_time = start_time + max(self.video_clip_duration, self.audio_clip_duration)
            video_dur = final_time - start_time
            sample = self._get_clip(index, video_ctr, audio_ctr, start_time, start_time, video_clip_duration=video_dur, audio_clip_duration=video_dur)

            # Split video into overlapping chunks
            chunks = defaultdict(list)
            if self.return_video:
                nf = sample['frames'].shape[1]
                chunk_size = int(self.video_clip_duration * self.video_fps)
                if chunk_size >= nf:
                    # Video shorter than one chunk: repeat the full clip.
                    chunks['frames'] = torch.stack([sample['frames'] for _ in range(self.clips_per_video)])
                else:
                    # Evenly spaced chunk start frames spanning the whole video.
                    timestamps = np.linspace(0, max(nf - chunk_size, 1), self.clips_per_video).astype(int)
                    chunks['frames'] = torch.stack([sample['frames'][:, ss:ss+chunk_size] for ss in timestamps])

            if self.return_audio:
                nf = sample['audio'].shape[1]
                chunk_size = int(self.audio_clip_duration * self.audio_fps_out)
                if chunk_size >= nf:
                    chunks['audio'] = torch.stack([sample['audio'] for _ in range(self.clips_per_video)])
                else:
                    timestamps = np.linspace(0, max(nf - chunk_size, 1), self.clips_per_video).astype(int)
                    chunks['audio'] = torch.stack([sample['audio'][:, ss:ss+chunk_size] for ss in timestamps])

            if self.return_labels:
                chunks['label'] = sample['label']

            if self.return_index:
                # Pair the sample index with the start time of each chunk.
                ts = torch.from_numpy(np.linspace(start_time, final_time-self.video_clip_duration, self.clips_per_video))
                chunks['index'] = torch.stack([sample['index'][:1].repeat(self.clips_per_video), ts.float()], dim=1)

            return chunks
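
For illustration, here is a minimal, self-contained sketch of the overlapping-chunk logic used in the non-'clip' branch above. The tensor shape and the clips_per_video, video_clip_duration, and video_fps values are illustrative assumptions, not values taken from the repository's configs.

    # Standalone sketch of the chunking pattern above (assumed toy values).
    import numpy as np
    import torch

    frames = torch.randn(3, 80, 112, 112)   # (C, T, H, W): 80 decoded frames
    clips_per_video = 10                     # assumed
    video_clip_duration = 1.0                # seconds per chunk (assumed)
    video_fps = 16                           # frames per second (assumed)

    chunk_size = int(video_clip_duration * video_fps)   # 16 frames per chunk
    nf = frames.shape[1]

    if chunk_size >= nf:
        # Video shorter than one chunk: repeat it to keep a fixed output shape.
        chunks = torch.stack([frames for _ in range(clips_per_video)])
    else:
        # Evenly spaced chunk start frames covering the whole video.
        starts = np.linspace(0, max(nf - chunk_size, 1), clips_per_video).astype(int)
        chunks = torch.stack([frames[:, s:s + chunk_size] for s in starts])

    print(chunks.shape)   # torch.Size([10, 3, 16, 112, 112])

In the method itself the same pattern is applied to both 'frames' and 'audio', so every item returned in this mode carries a fixed number of chunks regardless of the video's length; callers typically fold this extra chunk dimension into the batch dimension before the forward pass.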