in mask2former_video/data_video/dataset_mapper.py [0:0]
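This excerpt relies on module-level imports and two local helpers (_get_dummy_anno, filter_empty_instances) that are not shown. Below is a minimal sketch of what the method assumes: the import list is inferred from the calls in the body, and the _get_dummy_anno shape is inferred from how its output is consumed, so the actual module may differ.

import copy
import random

import numpy as np
import torch

from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.structures import BitMasks, BoxMode

# Hypothetical placeholder annotation, inferred from how sorted_annos is used below:
# every field that annotations_to_instances reads must be present, and the sentinel
# id -1 marks slots that have no real annotation in a given frame.
def _get_dummy_anno(num_classes):
    return {
        "iscrowd": 0,
        "category_id": num_classes,  # an extra "no object" class index
        "id": -1,
        "bbox": np.array([0, 0, 0, 0]),
        "bbox_mode": BoxMode.XYXY_ABS,
        "segmentation": [np.array([0.0] * 6)],
    }

# filter_empty_instances is likewise a local helper defined elsewhere in this module.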
def __call__(self, dataset_dict):
    """
    Args:
        dataset_dict (dict): Metadata of one video, in YTVIS Dataset format.

    Returns:
        dict: a format that builtin models in detectron2 accept
    """
    # TODO: consider revisiting this deepcopy, as it is computationally expensive.
    dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
    video_length = dataset_dict["length"]
    if self.is_train:
        ref_frame = random.randrange(video_length)

        start_idx = max(0, ref_frame - self.sampling_frame_range)
        end_idx = min(video_length, ref_frame + self.sampling_frame_range + 1)

        selected_idx = np.random.choice(
            np.array(list(range(start_idx, ref_frame)) + list(range(ref_frame + 1, end_idx))),
            self.sampling_frame_num - 1,
        )
        selected_idx = selected_idx.tolist() + [ref_frame]
        selected_idx = sorted(selected_idx)
        if self.sampling_frame_shuffle:
            random.shuffle(selected_idx)
    else:
        selected_idx = range(video_length)
    video_annos = dataset_dict.pop("annotations", None)
    file_names = dataset_dict.pop("file_names", None)

    if self.is_train:
        # Map each instance id that appears in any selected frame to a contiguous index.
        _ids = set()
        for frame_idx in selected_idx:
            _ids.update([anno["id"] for anno in video_annos[frame_idx]])
        ids = dict()
        for i, _id in enumerate(_ids):
            ids[_id] = i

    dataset_dict["image"] = []
    dataset_dict["instances"] = []
    dataset_dict["file_names"] = []
    for frame_idx in selected_idx:
        dataset_dict["file_names"].append(file_names[frame_idx])

        # Read image
        image = utils.read_image(file_names[frame_idx], format=self.image_format)
        utils.check_image_size(dataset_dict, image)

        aug_input = T.AugInput(image)
        transforms = self.augmentations(aug_input)
        image = aug_input.image

        image_shape = image.shape[:2]  # h, w
        # PyTorch's dataloader is efficient on torch.Tensor due to shared memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"].append(torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))))

        if (video_annos is None) or (not self.is_train):
            continue
        # NOTE: copy() prevents the annotations from being modified when the augmentations are applied.
        _frame_annos = []
        for anno in video_annos[frame_idx]:
            _anno = {}
            for k, v in anno.items():
                _anno[k] = copy.deepcopy(v)
            _frame_annos.append(_anno)

        # USER: Implement additional transformations if you have other types of data
        annos = [
            utils.transform_instance_annotations(obj, transforms, image_shape)
            for obj in _frame_annos
            if obj.get("iscrowd", 0) == 0
        ]
        # Align annotations across frames: every instance id keeps a fixed slot,
        # and slots without an annotation in this frame hold a dummy annotation.
        sorted_annos = [_get_dummy_anno(self.num_classes) for _ in range(len(ids))]

        for _anno in annos:
            idx = ids[_anno["id"]]
            sorted_annos[idx] = _anno
        _gt_ids = [_anno["id"] for _anno in sorted_annos]

        instances = utils.annotations_to_instances(sorted_annos, image_shape, mask_format="bitmask")
        instances.gt_ids = torch.tensor(_gt_ids)
        if instances.has("gt_masks"):
            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
            instances = filter_empty_instances(instances)
        else:
            instances.gt_masks = BitMasks(torch.empty((0, *image_shape)))
        dataset_dict["instances"].append(instances)

    return dataset_dict
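For context, a hedged sketch of how such a mapper is typically wired into a detectron2 training loop. The class name YTVISDatasetMapper and its constructor signature are assumptions about the surrounding repository (they do not appear in this excerpt); build_detection_train_loader is the standard detectron2 API.

from detectron2.data import build_detection_train_loader

def build_train_loader(cfg):
    # Assumed: the mapper class owning __call__ above is constructed from a cfg
    # and an is_train flag, mirroring detectron2's DatasetMapper convention.
    mapper = YTVISDatasetMapper(cfg, is_train=True)
    return build_detection_train_loader(cfg, mapper=mapper)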