def load()

in vissl/utils/instance_retrieval_utils/data_util.py [0:0]
61 lines of code
19 McCabe index (conditional complexity)

    def load(self, num_samples=None):
        """
        Load the data ground truth and parse the data so it's ready to be used.
        """
        # Load the dataset GT
        self.lab_root = f"{self.path}/lab/"
        self.img_root = f"{self.path}/jpg/"
        logging.info(f"Loading data: {self.path}")
        lab_filenames = np.sort(g_pathmgr.ls(self.lab_root))
        # Get the filenames without the extension
        self.img_filenames = [
            e[:-4]
            for e in np.sort(g_pathmgr.ls(self.img_root))
            if e[:-4] not in self.blacklisted
        ]

        # Parse the label files. Some challenges as filenames do not correspond
        # exactly to query names. Go through all the labels to:
        # i) map names to filenames and vice versa
        # ii) get the relevant regions of interest of the queries,
        # iii) get the indexes of the dataset images that are queries
        # iv) get the relevants / non-relevants list
        self.relevants = {}
        self.junk = {}
        self.non_relevants = {}

        self.filename_to_name = {}
        self.name_to_filename = OrderedDict()
        self.q_roi = {}
        for e in lab_filenames:
            if e.endswith("_query.txt"):
                q_name = e[: -len("_query.txt")]
                with g_pathmgr.open(f"{self.lab_root}/{e}") as fopen:
                    q_data = fopen.readline().split(" ")
                if q_data[0].startswith("oxc1_"):
                    q_filename = q_data[0][5:]
                else:
                    q_filename = q_data[0]
                self.filename_to_name[q_filename] = q_name
                self.name_to_filename[q_name] = q_filename
                with g_pathmgr.open(f"{self.lab_root}/{q_name}_ok.txt") as fopen:
                    good = {e.strip() for e in fopen}
                with g_pathmgr.open(f"{self.lab_root}/{q_name}_good.txt") as fopen:
                    good = good.union({e.strip() for e in fopen})
                with g_pathmgr.open(f"{self.lab_root}/{q_name}_junk.txt") as fopen:
                    junk = {e.strip() for e in fopen}
                good_plus_junk = good.union(junk)
                self.relevants[q_name] = [
                    i
                    for i in range(len(self.img_filenames))
                    if self.img_filenames[i] in good
                ]
                self.junk[q_name] = [
                    i
                    for i in range(len(self.img_filenames))
                    if self.img_filenames[i] in junk
                ]
                self.non_relevants[q_name] = [
                    i
                    for i in range(len(self.img_filenames))
                    if self.img_filenames[i] not in good_plus_junk
                ]
                self.q_roi[q_name] = np.array(
                    [float(q) for q in q_data[1:]], dtype=np.float32
                )

        self.q_names = list(self.name_to_filename.keys())
        self.q_index = np.array(
            [self.img_filenames.index(self.name_to_filename[qn]) for qn in self.q_names]
        )

        self.N_images = len(self.img_filenames)
        self.N_queries = len(self.q_index)

        if num_samples is not None:
            self.N_queries = min(self.N_queries, num_samples)
            self.N_images = min(self.N_images, num_samples)