slowfast/datasets/ava_dataset.py
def _images_and_boxes_preprocessing(self, imgs, boxes):
    """
    Perform preprocessing on the input images and the corresponding
    boxes for one clip.

    Args:
        imgs (tensor): the images.
        boxes (ndarray): the boxes for the current clip.

    Returns:
        imgs (tensor): preprocessed images.
        boxes (ndarray): preprocessed boxes.
    """
    # Image [0, 255] -> [0, 1].
    imgs = imgs.float()
    imgs = imgs / 255.0
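    # `imgs` is laid out as (num_frames, channels, height, width).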
    height, width = imgs.shape[2], imgs.shape[3]
    # The format of boxes is [x1, y1, x2, y2]. The input boxes are in the
    # range of [0, 1].
    boxes[:, [0, 2]] *= width
    boxes[:, [1, 3]] *= height
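    # Boxes are now in absolute pixel coordinates.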
    boxes = transform.clip_boxes_to_image(boxes, height, width)
    if self._split == "train":
        # Train split.
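        # Scale jitter: resize the short side to a random length in
        # [jitter_min_scale, jitter_max_scale]; boxes are rescaled to match.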
        imgs, boxes = transform.random_short_side_scale_jitter(
            imgs,
            min_size=self._jitter_min_scale,
            max_size=self._jitter_max_scale,
            boxes=boxes,
        )
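        # Take a random crop_size x crop_size crop; boxes are shifted into
        # the crop's coordinate frame.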
        imgs, boxes = transform.random_crop(
            imgs, self._crop_size, boxes=boxes
        )
        # Random horizontal flip with probability 0.5; box x-coordinates
        # are mirrored to match.
        imgs, boxes = transform.horizontal_flip(0.5, imgs, boxes=boxes)
    elif self._split == "val":
        # Val split.
        # Resize the short side to crop_size. Non-local and STRG use 256.
        imgs, boxes = transform.random_short_side_scale_jitter(
            imgs,
            min_size=self._crop_size,
            max_size=self._crop_size,
            boxes=boxes,
        )
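        # With min_size == max_size the jitter is deterministic: the short
        # side is resized to exactly crop_size.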
        # Apply a center crop for the val split.
        imgs, boxes = transform.uniform_crop(
            imgs, size=self._crop_size, spatial_idx=1, boxes=boxes
        )
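        # spatial_idx=1 selects the center crop (0 and 2 select the two
        # side crops along the longer spatial dimension).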
        if self._test_force_flip:
            imgs, boxes = transform.horizontal_flip(1, imgs, boxes=boxes)
    elif self._split == "test":
        # Test split.
        # Resize the short side to crop_size. Non-local and STRG use 256.
        imgs, boxes = transform.random_short_side_scale_jitter(
            imgs,
            min_size=self._crop_size,
            max_size=self._crop_size,
            boxes=boxes,
        )
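        # No spatial crop is taken at test time; the full resized frames
        # are kept.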
        if self._test_force_flip:
            imgs, boxes = transform.horizontal_flip(1, imgs, boxes=boxes)
    else:
        raise NotImplementedError(
            "{} split not supported yet!".format(self._split)
        )
    # Do color augmentation (after dividing by 255.0).
    if self._split == "train" and self._use_color_augmentation:
        if not self._pca_jitter_only:
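            # Jitter brightness, contrast, and saturation, each with
            # strength 0.4, applied in a random order.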
            imgs = transform.color_jitter(
                imgs,
                img_brightness=0.4,
                img_contrast=0.4,
                img_saturation=0.4,
            )
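        # AlexNet-style PCA lighting noise (Krizhevsky et al., 2012), using
        # the configured PCA eigenvalues and eigenvectors.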
        imgs = transform.lighting_jitter(
            imgs,
            alphastd=0.1,
            eigval=np.array(self._pca_eigval).astype(np.float32),
            eigvec=np.array(self._pca_eigvec).astype(np.float32),
        )
    # Normalize images by mean and std.
    imgs = transform.color_normalization(
        imgs,
        np.array(self._data_mean, dtype=np.float32),
        np.array(self._data_std, dtype=np.float32),
    )
    if not self._use_bgr:
        # Convert image format from BGR to RGB.
        # Note that Kinetics pre-training uses RGB!
        imgs = imgs[:, [2, 1, 0], ...]
    boxes = transform.clip_boxes_to_image(
        boxes, self._crop_size, self._crop_size
    )
    return imgs, boxes
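
Usage sketch (not from the repository): a minimal illustration of the expected input and output shapes, assuming `ds` is an already-constructed AVA dataset instance (e.g. `Ava(cfg, "val")`) so that `_split`, `_crop_size`, and the normalization attributes are configured.

import numpy as np
import torch

# A fake 8-frame RGB clip in (num_frames, channels, height, width) layout,
# with uint8 pixel values in [0, 255].
clip = torch.randint(0, 256, (8, 3, 360, 480), dtype=torch.uint8)

# One box in normalized [x1, y1, x2, y2] format, values in [0, 1].
boxes = np.array([[0.1, 0.2, 0.6, 0.9]], dtype=np.float32)

imgs, out_boxes = ds._images_and_boxes_preprocessing(clip, boxes)
# imgs: float tensor of shape (8, 3, crop_size, crop_size) for train/val;
# out_boxes: pixel coordinates clipped to the crop_size x crop_size frame.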