in pytorchvideo/data/charades.py [0:0]
def __next__(self) -> dict:
    """
    Retrieves the next clip based on the clip sampling strategy and video sampler.

    Returns:
        A dictionary with the following format.

        .. code-block:: text

            {
                'video': <video_tensor>,
                'label': <index_label>,
                'video_label': <index_label>,
                'video_name': <video_name>,
                'video_index': <video_index>,
                'clip_index': <clip_index>,
                'aug_index': <aug_index>,
            }
    """
    if not self._video_sampler_iter:
        # Set up MultiProcessSampler here - after PyTorch DataLoader workers are spawned.
        self._video_sampler_iter = iter(MultiProcessSampler(self._video_sampler))
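
    # Reuse the partially consumed video across calls: one video yields many
    # clips, so it stays cached until its last clip has been sampled.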
    if self._loaded_video:
        video, video_index = self._loaded_video
    else:
        video_index = next(self._video_sampler_iter)
        path_to_video_frames = self._path_to_videos[video_index]
        video = FrameVideo.from_frame_paths(path_to_video_frames)
        self._loaded_video = (video, video_index)
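
    # Ask the clip sampler for the next temporal window. It returns the clip
    # boundaries in seconds, the clip/augmentation indices, and whether this
    # is the final clip of the current video.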
    clip_start, clip_end, clip_index, aug_index, is_last_clip = self._clip_sampler(
        self._next_clip_start_time, video.duration, {}
    )
    # Only load the clip once and reuse the previously stored clip if there are
    # multiple views for augmentations to perform on the same clip.
    if aug_index == 0:
        self._loaded_clip = video.get_clip(clip_start, clip_end, self._frame_filter)

    frames, frame_indices = (
        self._loaded_clip["video"],
        self._loaded_clip["frame_indices"],
    )
    self._next_clip_start_time = clip_end
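
    # Once the sampler reports the final clip, drop the cached video and rewind
    # the clip start so the next call begins a fresh video from t=0.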
    if is_last_clip:
        self._loaded_video = None
        self._next_clip_start_time = 0.0

    # Gather the per-frame labels over the contiguous span of frame indices
    # covered by the sampled clip.
    labels_by_frame = [
        self._labels[video_index][i]
        for i in range(min(frame_indices), max(frame_indices) + 1)
    ]

    sample_dict = {
        "video": frames,
        "label": labels_by_frame,
        "video_label": self._video_labels[video_index],
        "video_name": str(video_index),
        "video_index": video_index,
        "clip_index": clip_index,
        "aug_index": aug_index,
    }
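
    # Apply the user-provided transform (if any) to the whole sample dict;
    # dict-level transforms such as pytorchvideo's ApplyTransformToKey are
    # commonly used here.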
    if self._transform is not None:
        sample_dict = self._transform(sample_dict)

    return sample_dict
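
For reference, a minimal sketch of how this iterator is typically consumed. Only __next__ above comes from the source; the Charades constructor arguments and the make_clip_sampler call below are assumptions based on the public pytorchvideo API, and the annotation path is a placeholder.

import torch
from pytorchvideo.data import Charades, make_clip_sampler

# Placeholder path; Charades expects a per-video frame/annotation file (assumed).
dataset = Charades(
    data_path="charades/train.csv",
    clip_sampler=make_clip_sampler("random", 2.0),  # 2-second clips (assumed duration)
)

# Assuming the enclosing class is an IterableDataset (it implements __next__),
# batch_size=None yields one sample dict per __next__ call, and with
# num_workers > 0 the MultiProcessSampler above shards videos across workers.
loader = torch.utils.data.DataLoader(dataset, batch_size=None, num_workers=2)
for sample in loader:
    frames = sample["video"]       # decoded clip frames
    clip_labels = sample["label"]  # per-frame labels for this clip
    break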