in datasets/AVideoDataset.py [0:0]
def __getitem__(self, index):
    index_capped = index
    index = self.valid_indices[index_capped]
    if self.mode in ["train", "val"]:
        # -1 indicates random sampling.
        temporal_sample_index = -1
        spatial_sample_index = -1
        max_scale = self.train_jitter_scales[1]
        if self.center_crop:
            spatial_sample_index = 1
            max_scale = self.train_crop_size
    elif self.mode in ["test"]:
        temporal_sample_index = (
            self._spatial_temporal_idx[index] // self.num_spatial_crops
        )
        # spatial_sample_index is in [0, 1, 2], corresponding to left,
        # center, or right if width is larger than height, and top, middle,
        # or bottom if height is larger than width.
        spatial_sample_index = (
            self._spatial_temporal_idx[index] % self.num_spatial_crops
        )
        min_scale, max_scale, crop_size = [self.test_crop_size] * 3
        # Testing is deterministic and no jitter should be performed;
        # min_scale, max_scale, and crop_size are expected to be the same.
        # assert len({min_scale, max_scale, crop_size}) == 1
    else:
        raise NotImplementedError(
            "Does not support {} mode".format(self.mode)
        )
    # Get number of clips
    num_clips = 1
    V = []
    A = []
    for i in range(num_clips):
        # Try to decode and sample a clip from a video.
        video_container = get_video_container(
            self._path_to_videos[index],
            ENABLE_MULTI_THREAD_DECODE,
            DECODING_BACKEND,
        )
        # Decode video. Meta info is used to perform selective decoding.
        frames, audio = decode(
            self._path_to_videos[index],
            video_container,
            self.sample_rate,
            self.num_frames,
            temporal_sample_index,
            self.num_ensemble_views,
            video_meta=self._video_meta[index],
            target_fps=int(self.target_fps),
            backend=DECODING_BACKEND,
            max_spatial_scale=max_scale,
            decode_audio=self.decode_audio,
            aud_sample_rate=self.aud_sample_rate,
            aud_spec_type=self.aud_spec_type,
            use_volume_jittering=self.use_volume_jittering,
            num_sec=int(self.num_sec),
            use_temporal_jittering=self.use_temporal_jittering,
            z_normalize=self.z_normalize,
        )
        # Perform data augmentation on video clip.
        if self.multi_crop:
            multi_crop_clips = 2
        else:
            multi_crop_clips = 1
        use_random_resize_crop = (
            self.use_random_resize_crop and self.mode == "train"
        )
        for j in range(multi_crop_clips):
            min_scale, max_scale, crop_size = (
                self.train_jitter_scales[0],
                self.train_jitter_scales[1],
                self.train_crop_size,
            )
            if use_random_resize_crop:
                min_scale, max_scale, crop_size = 0.14, 1.0, self.train_crop_size
            V.append(clip_augmentation(
                frames.clone(),
                spatial_idx=spatial_sample_index,
                min_scale=min_scale,
                max_scale=max_scale,
                crop_size=crop_size,
                colorjitter=self.colorjitter,
                use_grayscale=self.use_grayscale,
                use_gaussian=self.use_gaussian,
                use_random_resize_crop=use_random_resize_crop,
            ))
            A.append(audio)
    # Get labels and indices
    label = self._labels[index]
    vid_idx = self._vid_indices[index]
    # return results
    if not self.multi_crop:
        V = torch.cat(V, dim=0)
    if self.decode_audio:
        audio = torch.cat(A, dim=0)
        return V, audio, label, index_capped, vid_idx
    else:
        return V, label, index_capped, vid_idx
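

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): one plausible way a batch
# produced by __getitem__ could be consumed. It assumes `dataset` is an
# already-constructed AVideoDataset instance with decode_audio=True and
# multi_crop=False, so each item is the 5-tuple returned on the decode_audio
# branch above; batch size and worker count are arbitrary choices.
# ---------------------------------------------------------------------------
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)
for frames, audio, labels, indices, vid_indices in loader:
    # frames:      augmented video clips (concatenated, since multi_crop=False)
    # audio:       decoded audio (returned because decode_audio=True)
    # labels:      class labels taken from self._labels
    # indices:     the capped dataset indices (index_capped)
    # vid_indices: per-video indices taken from self._vid_indices
    break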