in src/pixparse/data/datasets_utils.py [0:0]
def __init__(self, root_dir, split, transform=None):
    """Load the DocVQA annotation JSON for one split and list its keys.

    The annotation file is looked up inside ``<root_dir>/<split>/``. For
    the "train" split it is ``processed_train_v1.0.json``; for "test" and
    "val" it is ``<split>_v1.0.json``.

    Args:
        root_dir: Directory containing one sub-directory per split.
        split: One of "train", "test" or "val".
        transform: Optional object stored as ``self.transform``. It is not
            called here.

    Raises:
        ValueError: If ``split`` is not "train", "test" or "val".
        FileNotFoundError: If ``root_dir`` is not an existing directory or
            the annotation file for ``split`` does not exist.
    """
    # NOTE(review): presumably special tokens consumed elsewhere (e.g. by a
    # tokenizer); nothing in this method uses them -- confirm against callers.
    self.extra_tokens = ['<s_answer>', '</s_answer>', '</s_question>', '<s_question>']
    self.root_dir = root_dir
    self.split = split

    # Explicit raises instead of `assert`: assertions are stripped when
    # Python runs with -O, which would silently skip the validation.
    if split not in ("train", "test", "val"):
        raise ValueError("split is not train, test or val.")

    # Only the train split uses the "processed_" annotation file.
    prefix = "processed_" if split == "train" else ""
    split_dir = os.path.join(root_dir, split)
    json_path = os.path.join(split_dir, f"{prefix}{split}_v1.0.json")

    if not os.path.isdir(root_dir):
        raise FileNotFoundError(
            f"Can't find {root_dir}. Make sure you have DocVQA files locally."
        )
    if not os.path.isfile(json_path):
        raise FileNotFoundError(
            f"{json_path} not found. Make sure you have the processed dataset."
        )

    self.img_dir = split_dir

    # Decode as UTF-8 explicitly so loading does not depend on the
    # platform's default locale encoding.
    with open(json_path, 'r', encoding="utf-8") as f:
        self.data_dict = json.load(f)

    # Assumes the JSON top level is an object (dict); its keys are
    # presumably image identifiers -- TODO confirm against the data files.
    self.all_images = list(self.data_dict.keys())
    self.transform = transform