in vilbert/datasets/visual7w_pointing_dataset.py
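# Module-level imports assumed to be present at the top of this file, as is
# usual for the ViLBERT repo: `import json`, `import os`, `import numpy as np`.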
def _load_annotations(self, clean_datasets):
    # Build a list of entries, one per QA pair, each carrying the question,
    # the ground-truth answer box, and the indices of its multiple-choice
    # candidate boxes.
    entries = []
    remove_ids = []
    if clean_datasets or self.split == "mteval":
        remove_ids = np.load(
            os.path.join(self.dataroot, "cache", "genome_test_ids.npy")
        )
        remove_ids = [int(x) for x in remove_ids]

    with open(os.path.join(self.dataroot, "dataset_v7w_pointing.json"), "rb") as f:
        visual7w = json.load(f)

    # Map each box_id to [x1, y1, x2, y2] corner coordinates; the JSON
    # stores boxes as (x, y, width, height).
    boxes_dict = {}
    for b in visual7w["boxes"]:
        boxes_dict[b["box_id"]] = [
            b["x"],
            b["y"],
            b["x"] + b["width"],
            b["y"] + b["height"],
        ]

    # "mteval" reads the train split but keeps only images that overlap
    # the Visual Genome test set (the inverse of the cleaning filter).
    if self.split == "mteval":
        split = "train"
    else:
        split = self.split

    for img in visual7w["images"]:
        if img["split"] == split:
            # When training, skip images that leak into the Visual Genome
            # test set; for "mteval", keep exactly those images.
            if self.split == "train" and int(img["image_id"]) in remove_ids:
                continue
            elif self.split == "mteval" and int(img["image_id"]) not in remove_ids:
                continue
            # Gather every box_id referenced by this image's QA pairs,
            # deduplicated and sorted so indices are stable.
            bboxes = []
            for qa in img["qa_pairs"]:
                bboxes.extend(qa["multiple_choices"])
                bboxes.append(qa["answer"])
            bboxes = sorted(set(bboxes))
            for qa in img["qa_pairs"]:
                # Index of each of the four candidates (three distractors
                # plus the answer) into the image-level box list.
                bbox_idx = []
                for a in sorted(qa["multiple_choices"] + [qa["answer"]]):
                    bbox_idx.append(bboxes.index(a))
                entries.append(
                    {
                        "caption": qa["question"],
                        "sent_id": qa["qa_id"],
                        "image_id": img["image_id"],
                        "refBox": boxes_dict[qa["answer"]],
                        "ref_id": qa["answer"],
                        "mc_idx": bbox_idx,
                    }
                )
    return entries
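
# Illustrative only: a minimal sketch of what one returned entry looks like,
# using made-up ids and coordinates (not taken from the actual dataset).
#
# Suppose an image has one QA pair whose answer is box_id 12 with distractor
# box_ids [7, 3, 25], and box 12 is stored as x=10, y=20, width=30, height=40.
# Then bboxes == [3, 7, 12, 25], and the appended entry would be:
#
#     {
#         "caption": "Which hand is holding the cup?",
#         "sent_id": 991,
#         "image_id": 42,
#         "refBox": [10, 20, 40, 60],  # [x1, y1, x1 + w, y1 + h]
#         "mc_idx": [0, 1, 2, 3],      # positions of sorted([7, 3, 25] + [12]) in bboxes
#         "ref_id": 12,
#     }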