in datasets/base_video_dataset.py [0:0]
def __getitem__(self, idx):
idx = self._class_balance_data_idx(idx) # Must be run before repeat
idx = self._repeat_process_idx(idx)
df_row = self.df.loc[idx, :]
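    # Optional runtime hook, resolved from the config via hydra, that
    # converts this row into an anticipation sample. It may return None to
    # drop the sample; a None-aware collate must then filter it out (see the
    # loader sketch after this listing).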
if self.conv_to_anticipate_fn_runtime is not None:
df_row = hydra.utils.call(self.conv_to_anticipate_fn_runtime,
df_row, self.df, self.root,
self.addl_df_proc_for_dense)
if df_row is None:
return None
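    # Second optional hook: adjust the row before any video is decoded,
    # e.g. picking the frame window from frames_per_clip / frame_rate /
    # sample_strategy. Returning None again drops the sample.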
if self.process_df_before_read_fn is not None:
df_row = hydra.utils.call(self.process_df_before_read_fn, df_row,
self.root, self.rng, self.label_type,
self.frames_per_clip, self.frame_rate,
self.sample_strategy, self.dummy_label)
if df_row is None:
return None
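    # Decode the clip. _get_video returns a dict with at least 'video'
    # ((C, T, H, W), or (ncrops, C, T, H, W) when multiple crops are taken)
    # and 'video_frame_sec', the timestamp in seconds of each sampled frame.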
video_dict = self._get_video(df_row)
video = video_dict['video']
orig_video_shape = video.shape
if len(orig_video_shape) == 5:
# #ncrops, C, T, H, W -- flatten first 2 dims for subclips
video = video.flatten(0, 1)
# #ncrops * C, T, H, W -> #clips, #ncrops * C, T', H, W
video = self._get_subclips(video, **self.subclips_options)
if len(orig_video_shape) == 5:
# unflatten back
video = video.reshape((video.size(0), ) + orig_video_shape[:2] +
video.shape[-3:])
video_dict['video'] = video
    video_dict['video_frame_sec'] = self._get_subclips(
        # unsqueeze(0): add a fake channel dim so _get_subclips splits along
        # time; squeeze(1): drop it again, since dim 0 is now the clips dim
        video_dict['video_frame_sec'].unsqueeze(0),
        **self.subclips_options).squeeze(1)
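    # video is now (#clips, [ncrops,] C, T', H, W) and video_frame_sec is
    # (#clips, T'): one row of frame timestamps per subclip.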
sentence = self._get_text(df_row) # Not used at the moment
label_idx = self._get_labels(df_row)
    video_dict.update({
        'idx': idx,
        'text': sentence,
        'target': label_idx,
        'audio': [],  # TODO?
        'orig_vid_len': df_row.video_len if 'video_len' in df_row else -1,
        'uid': df_row.uid,
    })
    if self.load_seg_labels:
        # Dense per-subclip labels, aligned to each subclip's frame times.
        video_dict['target_subclips'] = self._get_vidseg_labels(
            df_row, video_dict['video_frame_sec'])
    if self.load_long_term_future_labels > 0:
        # Only really used for visualization for now: extrapolate timestamps
        # past the end of the clip, spaced like the last two subclips, and
        # fetch segmentation labels at those future times.
        last_frame = video_dict['video_frame_sec'][-1].item()
        gap_in_frames = (last_frame -
                         video_dict['video_frame_sec'][-2].item())
        future_frame_sec = torch.FloatTensor([
            last_frame + gap_in_frames * i
            for i in range(1, self.load_long_term_future_labels + 1)
        ]).reshape(-1, 1)
        video_dict['future_subclips'] = self._get_vidseg_labels(
            df_row, future_frame_sec)
return video_dict
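
Both runtime hooks above are plain callables supplied through the hydra config, and both follow the same contract: return a (possibly modified) row, or None to drop the sample. Below is a minimal sketch of a `process_df_before_read_fn`-style hook matching the positional signature used in the call above; the function name and the filtering rule are hypothetical, and it assumes `df_row.video_len` is stored in seconds:

def drop_short_clips(df_row, root, rng, label_type, frames_per_clip,
                     frame_rate, sample_strategy, dummy_label):
    # Hypothetical rule: drop rows too short to supply frames_per_clip
    # frames at frame_rate fps (assumes video_len is in seconds).
    min_len_sec = frames_per_clip / frame_rate
    if 'video_len' in df_row and df_row.video_len < min_len_sec:
        return None  # __getitem__ propagates the None to the loader
    return df_row

Because __getitem__ can return None, PyTorch's default collate would raise a TypeError on dropped samples. Below is a minimal sketch of a None-aware loader around a `dataset` instance of this class; this wrapper is an assumed usage pattern, not code from this repo:

from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate

def collate_skip_none(batch):
    # Filter out samples the dataset dropped at runtime before collating.
    batch = [sample for sample in batch if sample is not None]
    if not batch:
        return None  # the training loop must also skip fully-dropped batches
    return default_collate(batch)

loader = DataLoader(dataset, batch_size=8, num_workers=4,
                    collate_fn=collate_skip_none)
for video_dict in loader:
    if video_dict is None:
        continue
    video = video_dict['video']  # (B, #clips, [ncrops,] C, T', H, W)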