in vissl/utils/instance_retrieval_utils/data_util.py [0:0]
def load(self, num_samples=None):
    """
    Load the data ground truth and parse the data so it's ready to be used.

    Lists the label files under ``{self.path}/lab/`` and the images under
    ``{self.path}/jpg/`` and populates the following attributes:

    - ``lab_root`` / ``img_root``: label and image directories.
    - ``img_filenames``: sorted image names with their last 4 characters
      (the extension) removed, excluding names in ``self.blacklisted``.
    - ``filename_to_name`` / ``name_to_filename``: mappings between the
      image filename of a query and the query name.
    - ``q_roi``: query name -> float32 array built from the fields that
      follow the filename on the query line.
    - ``relevants`` / ``junk`` / ``non_relevants``: query name -> list of
      indexes into ``img_filenames``.
    - ``q_names`` / ``q_index``: query names and, for each of them, the
      index of its image in ``img_filenames``.
    - ``N_images`` / ``N_queries``: number of images / queries.

    Args:
        num_samples: if not None, upper bound applied to both
            ``N_queries`` and ``N_images``.

    Raises:
        ValueError: if the first line of a query file is blank, or if the
            image of a query is not present in ``img_filenames``.
    """
    # Load the dataset GT
    self.lab_root = f"{self.path}/lab/"
    self.img_root = f"{self.path}/jpg/"
    logging.info(f"Loading data: {self.path}")
    lab_filenames = np.sort(g_pathmgr.ls(self.lab_root))
    # Get the filenames without the extension: the last 4 characters are
    # dropped (assumes a 3-letter extension such as ".jpg").
    img_stems = (fname[:-4] for fname in np.sort(g_pathmgr.ls(self.img_root)))
    self.img_filenames = [
        stem for stem in img_stems if stem not in self.blacklisted
    ]

    def read_names(label_path):
        # One image name per line; surrounding whitespace is stripped.
        with g_pathmgr.open(label_path) as fopen:
            return {line.strip() for line in fopen}

    # Parse the label files. Some challenges as filenames do not correspond
    # exactly to query names. Go through all the labels to:
    # i) map names to filenames and vice versa
    # ii) get the relevant regions of interest of the queries,
    # iii) get the indexes of the dataset images that are queries
    # iv) get the relevants / non-relevants list
    self.relevants = {}
    self.junk = {}
    self.non_relevants = {}
    self.filename_to_name = {}
    self.name_to_filename = OrderedDict()
    self.q_roi = {}

    query_suffix = "_query.txt"
    filename_prefix = "oxc1_"
    for lab_file in lab_filenames:
        if not lab_file.endswith(query_suffix):
            continue
        q_name = lab_file[: -len(query_suffix)]

        # lab_root already ends with "/", so no extra separator is added.
        query_path = f"{self.lab_root}{lab_file}"
        with g_pathmgr.open(query_path) as fopen:
            # Split on any whitespace run: this strips the trailing newline
            # (which would otherwise stay attached to the last field, or to
            # the filename itself when no ROI follows) and tolerates
            # repeated or trailing spaces.
            q_data = fopen.readline().split()
        if not q_data:
            raise ValueError(f"Empty query file: {query_path}")

        # Some query files prefix the image filename with "oxc1_".
        q_filename = q_data[0]
        if q_filename.startswith(filename_prefix):
            q_filename = q_filename[len(filename_prefix):]
        self.filename_to_name[q_filename] = q_name
        self.name_to_filename[q_name] = q_filename

        # "ok" and "good" images are both counted as relevant.
        good = read_names(f"{self.lab_root}{q_name}_ok.txt")
        good |= read_names(f"{self.lab_root}{q_name}_good.txt")
        junk = read_names(f"{self.lab_root}{q_name}_junk.txt")
        good_plus_junk = good | junk

        self.relevants[q_name] = [
            idx for idx, name in enumerate(self.img_filenames) if name in good
        ]
        self.junk[q_name] = [
            idx for idx, name in enumerate(self.img_filenames) if name in junk
        ]
        self.non_relevants[q_name] = [
            idx
            for idx, name in enumerate(self.img_filenames)
            if name not in good_plus_junk
        ]
        # Remaining fields of the query line are the region of interest.
        self.q_roi[q_name] = np.array(
            [float(q) for q in q_data[1:]], dtype=np.float32
        )

    self.q_names = list(self.name_to_filename.keys())
    # list.index raises ValueError when a query image is not in the dataset
    # listing (e.g. because it is blacklisted).
    self.q_index = np.array(
        [self.img_filenames.index(self.name_to_filename[qn]) for qn in self.q_names]
    )
    self.N_images = len(self.img_filenames)
    self.N_queries = len(self.q_index)
    if num_samples is not None:
        # Only the reported counts are capped; the parsed lists are kept whole.
        self.N_queries = min(self.N_queries, num_samples)
        self.N_images = min(self.N_images, num_samples)