in vilbert/datasets/visual_entailment_dataset.py [0:0]
def _load_dataset(dataroot, name, clean_datasets):
    """Load SNLI-VE entries for one dataset split.

    Args:
        dataroot: root path of the dataset; expects ``snli_ve_<name>.jsonl``
            inside it, and ``cache/flickr_test_ids.npy`` when cleaning.
        name: dataset split, one of ``'train'``, ``'dev'``, ``'test'``.
        clean_datasets: if True, drop *train* items whose image id appears in
            the cached Flickr test-id list (avoids test-set leakage).

    Returns:
        A list of entries, one per kept annotation, built by ``_create_entry``.

    Raises:
        ValueError: if ``name`` is not a recognized split.
    """
    # Validate eagerly with a real exception: the original `assert False`
    # branch is stripped under `python -O`, silently letting a bad split
    # fall through with `items` undefined.
    if name not in ("train", "dev", "test"):
        raise ValueError("data split is not recognized.")

    # Image ids to exclude from the training split. A set gives O(1)
    # membership tests in the per-annotation loop below (the original list
    # made each test O(len(remove_ids))).
    remove_ids = set()
    if clean_datasets:
        remove_ids = {
            int(x)
            for x in np.load(os.path.join(dataroot, "cache", "flickr_test_ids.npy"))
        }

    annotations_path = os.path.join(dataroot, "snli_ve_%s.jsonl" % name)
    items = []
    # `count` becomes the question_id and must advance only for *kept* items,
    # so filtered-out train annotations do not consume an id.
    count = 0
    with jsonlines.open(annotations_path) as reader:
        for annotation in reader:
            dictionary = {}
            # Image id is the numeric stem of the Flickr30k filename
            # (note: the upstream field name really is misspelled "Flikr30kID").
            dictionary["image_id"] = int(annotation["Flikr30kID"].split(".")[0])
            if name == "train" and dictionary["image_id"] in remove_ids:
                continue
            dictionary["question_id"] = count
            dictionary["hypothesis"] = str(annotation["sentence2"])
            gold_label = str(annotation["gold_label"])
            if gold_label == "-":
                # Annotators disagreed: keep the item but attach no label.
                dictionary["labels"] = []
                dictionary["scores"] = []
            else:
                dictionary["labels"] = [int(LABEL_MAP[gold_label])]
                dictionary["scores"] = [1.0]
            items.append(dictionary)
            count += 1

    return [_create_entry(item) for item in items]