in slowfast/datasets/epickitchens.py [0:0]
def __getitem__(self, index):
"""
Given the video index, return the list of frames, label, and video
index if the video can be fetched and decoded successfully, otherwise
repeatly find a random video that can be decoded as a replacement.
Args:
index (int): the video index provided by the pytorch sampler.
Returns:
frames (tensor): the frames of sampled from the video. The dimension
is `channel` x `num frames` x `height` x `width`.
label (int): the label of the current video.
index (int): if the video provided by pytorch sampler can be
decoded, then return the index of the video. If not, return the
index of the video replacement that can be decoded.
"""
if self.mode in ["train", "val", "train+val"]:
# -1 indicates random sampling.
temporal_sample_index = -1
spatial_sample_index = -1
min_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[0]
max_scale = self.cfg.DATA.TRAIN_JITTER_SCALES[1]
crop_size = self.cfg.DATA.TRAIN_CROP_SIZE
elif self.mode in ["test"]:
temporal_sample_index = (
self._spatial_temporal_idx[index]
// self.cfg.TEST.NUM_SPATIAL_CROPS
)
# spatial_sample_index is in [0, 1, 2]. It corresponds to the left,
# center, or right crop if width is larger than height, and the top,
# middle, or bottom crop if height is larger than width.
if self.cfg.TEST.NUM_SPATIAL_CROPS == 3:
spatial_sample_index = (
self._spatial_temporal_idx[index]
% self.cfg.TEST.NUM_SPATIAL_CROPS
)
elif self.cfg.TEST.NUM_SPATIAL_CROPS == 1:
spatial_sample_index = 1
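# With a single spatial crop, index 1 selects the center/middle crop.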
min_scale, max_scale, crop_size = [self.cfg.DATA.TEST_CROP_SIZE] * 3
# Testing is deterministic and no jitter should be performed.
# min_scale, max_scale, and crop_size are expected to be the same.
assert len({min_scale, max_scale, crop_size}) == 1
else:
raise NotImplementedError(
"Does not support {} mode".format(self.mode)
)
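# pack_frames_to_video_clip loads the frames for this record and temporally
# samples them into a clip; the result is a uint8 tensor in T x H x W x C
# order (see the permute to C x T x H x W below).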
frames = pack_frames_to_video_clip(
self.cfg,
self._video_records[index],
temporal_sample_index,
target_fps=self.target_fps,
)
if self.cfg.DATA.USE_RAND_AUGMENT and self.mode in ["train"]:
# Transform to PIL Image
frames = [transforms.ToPILImage()(frame.squeeze().numpy()) for frame in frames]
# Perform RandAugment
img_size_min = crop_size
auto_augment_desc = "rand-m15-mstd0.5-inc1"
aa_params = dict(
translate_const=int(img_size_min * 0.45),
img_mean=tuple([min(255, round(255 * x)) for x in self.cfg.DATA.MEAN]),
)
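# The policy string follows the timm-style RandAugment convention:
# magnitude 15, magnitude std 0.5, "increasing severity" op set.
# translate_const bounds the translation ops relative to the crop size, and
# img_mean (DATA.MEAN rescaled to 0-255) is used as the fill color.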
seed = random.randint(0, 100000000)
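# The same seed is reused for every frame so the sampled ops are identical
# across the clip, keeping the augmentation temporally consistent.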
frames = [autoaugment.rand_augment_transform(
auto_augment_desc, aa_params, seed)(frame) for frame in frames]
# To Tensor: T H W C
frames = [torch.tensor(np.array(frame)) for frame in frames]
frames = torch.stack(frames)
# Perform color normalization.
frames = utils.tensor_normalize(
frames, self.cfg.DATA.MEAN, self.cfg.DATA.STD
)
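# tensor_normalize converts the uint8 frames to floats in [0, 1] and then
# normalizes them per channel with DATA.MEAN and DATA.STD.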
# T H W C -> C T H W.
frames = frames.permute(3, 0, 1, 2)
# Perform data augmentation.
use_random_resize_crop = self.cfg.DATA.USE_RANDOM_RESIZE_CROPS
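# When USE_RANDOM_RESIZE_CROPS is enabled, training/validation clips go
# through a RandomResizedCrop-style augmentation plus horizontal flip;
# otherwise the short-side scale jitter path below is used.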
if use_random_resize_crop:
if self.mode in ["train", "val"]:
frames = transform.random_resize_crop_video(frames, crop_size, interpolation_mode="bilinear")
frames, _ = transform.horizontal_flip(0.5, frames)
else:
assert len({min_scale, max_scale, crop_size}) == 1
frames, _ = transform.random_short_side_scale_jitter(
frames, min_scale, max_scale
)
frames, _ = transform.uniform_crop(frames, crop_size, spatial_sample_index)
else:
# Perform data augmentation.
frames = utils.spatial_sampling(
frames,
spatial_idx=spatial_sample_index,
min_scale=min_scale,
max_scale=max_scale,
crop_size=crop_size,
random_horizontal_flip=self.cfg.DATA.RANDOM_FLIP,
inverse_uniform_sampling=self.cfg.DATA.INV_UNIFORM_SAMPLE,
)
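# spatial_sampling applies random short-side scale jitter, random cropping,
# and optional horizontal flipping when spatial_idx is -1 (train/val), and a
# deterministic scale plus positional crop when spatial_idx is 0, 1, or 2 (test).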
if self.mode in ["train", "val"]:
# C T H W -> T C H W for per-frame augmentation.
frames = frames.permute(1, 0, 2, 3)
frames = utils.frames_augmentation(
frames,
colorjitter=self.cfg.DATA.COLORJITTER,
use_grayscale=self.cfg.DATA.GRAYSCALE,
use_gaussian=self.cfg.DATA.GAUSSIAN
)
label = self._video_records[index].label
frames = utils.pack_pathway_output(self.cfg, frames)
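# pack_pathway_output arranges the clip into the per-pathway list expected
# by the model (e.g. a temporally subsampled slow pathway plus a fast
# pathway for SlowFast architectures, or a single-element list otherwise).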
metadata = self._video_records[index].metadata
return frames, label, index, metadata