def _load_dataset()

in vilbert/datasets/visual_entailment_dataset.py


import os

import jsonlines
import numpy as np

# LABEL_MAP and _create_entry are defined elsewhere in this module.


def _load_dataset(dataroot, name, clean_datasets):
    """Load SNLI-VE annotation entries for one split.

    dataroot: root path of the dataset
    name: dataset split, one of 'train', 'dev', 'test'
    clean_datasets: if True, filter out training images whose ids appear
        in the cached Flickr30k test-id list
    """
    if name in ("train", "dev", "test"):
        annotations_path = os.path.join(dataroot, "snli_ve_%s.jsonl" % name)
        with jsonlines.open(annotations_path) as reader:

            remove_ids = []
            if clean_datasets:
                # Image ids that overlap the Flickr30k test split; they are
                # filtered out of the training set below.
                remove_ids = np.load(
                    os.path.join(dataroot, "cache", "flickr_test_ids.npy")
                )
                remove_ids = [int(x) for x in remove_ids]
            # Build one entry per hypothesis annotation.
            items = []
            count = 0
            for annotation in reader:
                dictionary = {}
                # The image id is the numeric stem of the Flickr30k
                # filename, e.g. "2248275918.jpg" -> 2248275918.
                dictionary["image_id"] = int(annotation["Flikr30kID"].split(".")[0])
                if name == "train" and dictionary["image_id"] in remove_ids:
                    continue
                dictionary["question_id"] = count
                dictionary["hypothesis"] = str(annotation["sentence2"])
                if str(annotation["gold_label"]) == "-":
                    # "-" marks examples where annotators reached no
                    # consensus; keep them with empty label/score lists.
                    dictionary["labels"] = []
                    dictionary["scores"] = []
                else:
                    dictionary["labels"] = [
                        int(LABEL_MAP[str(annotation["gold_label"])])
                    ]
                    dictionary["scores"] = [1.0]
                items.append(dictionary)
                count += 1
    else:
        # Raise rather than assert so the check survives python -O.
        raise ValueError("data split %s is not recognized." % name)

    entries = [_create_entry(item) for item in items]
    return entries
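
For reference, a minimal usage sketch follows. It is an illustration, not part of the module: the `data/snli_ve` path is hypothetical, and the sample record shape is inferred from the fields the function reads above (`snli_ve_%s.jsonl`, `Flikr30kID`, `sentence2`, `gold_label`), not confirmed from the dataset release.

# Hypothetical usage sketch; "data/snli_ve" is an assumed path. Each jsonl
# record is expected to carry the fields read above, e.g.:
#   {"Flikr30kID": "2248275918.jpg", "sentence2": "A dog runs.",
#    "gold_label": "entailment"}
entries = _load_dataset(
    dataroot="data/snli_ve",
    name="dev",
    clean_datasets=False,  # False: the cached Flickr30k test-id file is not read
)
print(len(entries))   # one entry per hypothesis in snli_ve_dev.jsonl
print(entries[0])     # whatever _create_entry builds from a single item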