def main()

in download_imgur5k.py [0:0]


def main():
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    # Create a hash dictionary mapping each image index to its corresponding ground-truth hash
    with open(f"{args.dataset_info_dir}/imgur5k_hashes.lst", "r", encoding="utf-8") as _H:
        hashes = _H.readlines()

    hash_dict = {}
    for line in hashes:
        index, gt_hash = line.split()[:2]
        hash_dict[index] = gt_hash


    tot_evals = 0
    num_match = 0
    invalid_urls = []
    # Download the URLs and keep only the images whose hash matches, to ensure the underlying image has not changed
    for index in hash_dict:
        image_url = f'https://i.imgur.com/{index}.jpg'
        img_data = requests.get(image_url).content
        if len(img_data) < 100:
            print(f"URL retrieval for {index} failed!!\n")
            invalid_urls.append(image_url)
            continue
        with open(f'{args.output_dir}/{index}.jpg', 'wb') as handler:
            handler.write(img_data)

        # Verify the download against the reference hash (compute_image_hash is sketched after this listing)
        cur_hash = compute_image_hash(f'{args.output_dir}/{index}.jpg')
        tot_evals += 1
        if hash_dict[index] != cur_hash:
            print(f"For IMG: {index}, ref hash: {hash_dict[index]} != cur hash: {cur_hash}")
            os.remove(f'{args.output_dir}/{index}.jpg')
            invalid_urls.append(image_url)
        else:
            num_match += 1

    # Generate the final annotations file
    # Format: { "index_id": { per-image info }, "index_to_ann_map": { annotation ids for each index }, "ann_id": { each annotation's info } }
    # Bounding boxes given as '.' mean the annotation was not done, for various reasons

    _F = np.loadtxt(f'{args.dataset_info_dir}/imgur5k_data.lst', delimiter="\t", dtype=str, encoding="utf-8")
    anno_json = {}

    anno_json['index_id'] = {}
    anno_json['index_to_ann_map'] = {}
    anno_json['ann_id'] = {}

    cur_index = ''
    for cnt, image_url in enumerate(_F[:,0]):
        if image_url in invalid_urls:
            continue

        index = image_url.split('/')[-1][:-4]
        if index != cur_index:
            anno_json['index_id'][index] = {'image_url': image_url, 'image_path': f'{args.output_dir}/{index}.jpg', 'image_hash': hash_dict[index]}
            anno_json['index_to_ann_map'][index] = []

        ann_id = f"{index}_{len(anno_json['index_to_ann_map'][index])}"
        anno_json['index_to_ann_map'][index].append(ann_id)
        anno_json['ann_id'][ann_id] = {'word': _F[cnt,2], 'bounding_box': _F[cnt,1]}

        cur_index = index

    with open(f'{args.dataset_info_dir}/imgur5k_annotations.json', 'w') as f:
        json.dump(anno_json, f, indent=4)

    # Now split the annotations JSON into train, validation and test JSONs (_create_split_json is sketched after this listing)
    splits = ['train', 'val', 'test']
    for split in splits:
        _split_idx = np.loadtxt(f'{args.dataset_info_dir}/{split}_index_ids.lst', delimiter="\n", dtype=str)
        split_json = _create_split_json(anno_json, _split_idx)
        with open(f'{args.dataset_info_dir}/imgur5k_annotations_{split}.json', 'w') as f:
            json.dump(split_json, f, indent=4)

    print(f"MATCHES: {num_match}/{tot_evals}\n")