in download_imgur5k.py [0:0]
import json
import os

import numpy as np
import requests


def main():
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    # Build a dictionary mapping each image index to its corresponding ground-truth hash
    with open(f"{args.dataset_info_dir}/imgur5k_hashes.lst", "r", encoding="utf-8") as _H:
        hashes = _H.readlines()

    hash_dict = {}
    for line in hashes:
        index, gt_hash = line.split()[:2]
        hash_dict[index] = gt_hash

    tot_evals = 0
    num_match = 0
    invalid_urls = []
    # Download the URLs and save only the images with a valid hash, to ensure the
    # underlying image has not changed since the annotations were collected.
    for index in list(hash_dict.keys()):
        image_url = f'https://i.imgur.com/{index}.jpg'
        img_data = requests.get(image_url).content

        # A very small response means the image could not be retrieved.
        if len(img_data) < 100:
            print(f"URL retrieval for {index} failed!!\n")
            invalid_urls.append(image_url)
            continue

        with open(f'{args.output_dir}/{index}.jpg', 'wb') as handler:
            handler.write(img_data)

        tot_evals += 1
        # Compute the hash once and compare it against the reference hash.
        cur_hash = compute_image_hash(f'{args.output_dir}/{index}.jpg')
        if hash_dict[index] != cur_hash:
            print(f"For IMG: {index}, ref hash: {hash_dict[index]} != cur hash: {cur_hash}")
            os.remove(f'{args.output_dir}/{index}.jpg')
            invalid_urls.append(image_url)
            continue
        num_match += 1
    # Generate the final annotations file.
    # Format: { "index_id": { image indexes }, "index_to_ann_map": { annotation ids for an index }, "ann_id": { each annotation's info } }
    # Bounding boxes with '.' mean the annotations were not done for various reasons
    _F = np.loadtxt(f'{args.dataset_info_dir}/imgur5k_data.lst', delimiter="\t", dtype=str, encoding="utf-8")

    anno_json = {}
    anno_json['index_id'] = {}
    anno_json['index_to_ann_map'] = {}
    anno_json['ann_id'] = {}

    cur_index = ''
    for cnt, image_url in enumerate(_F[:, 0]):
        if image_url in invalid_urls:
            continue

        index = image_url.split('/')[-1][:-4]
        # Register the image the first time one of its annotations is seen.
        if index != cur_index:
            anno_json['index_id'][index] = {'image_url': image_url, 'image_path': f'{args.output_dir}/{index}.jpg', 'image_hash': hash_dict[index]}
            anno_json['index_to_ann_map'][index] = []

        ann_id = f"{index}_{len(anno_json['index_to_ann_map'][index])}"
        anno_json['index_to_ann_map'][index].append(ann_id)
        anno_json['ann_id'][ann_id] = {'word': _F[cnt, 2], 'bounding_box': _F[cnt, 1]}
        cur_index = index

    with open(f'{args.dataset_info_dir}/imgur5k_annotations.json', 'w') as f:
        json.dump(anno_json, f, indent=4)
    # Now split the annotations json into train, validation and test jsons
    splits = ['train', 'val', 'test']
    for split in splits:
        _split_idx = np.loadtxt(f'{args.dataset_info_dir}/{split}_index_ids.lst', delimiter="\n", dtype=str)
        split_json = _create_split_json(anno_json, _split_idx)
        with open(f'{args.dataset_info_dir}/imgur5k_annotations_{split}.json', 'w') as f:
            json.dump(split_json, f, indent=4)

    print(f"MATCHES: {num_match}/{tot_evals}\n")