in data_management/databases/sanity_check_json_db.py [0:0]
def sanity_check_json_db(jsonFile, options=None):
    '''
    Validate the internal consistency of a .json image database and print
    summary statistics about it.

    The database is a dict with 'images', 'annotations', 'categories' and
    'info' keys.  The following checks are performed:

    * every category has an int 'id' and a str 'name', and IDs are unique
    * every image has a str 'id' and a str 'file_name', IDs are unique,
      'width' and 'height' appear together or not at all, 'location' (if
      present, or if required by options.bRequireLocation) is a str, and the
      only sequence identifier used is 'seq_id'
    * every annotation has a str 'id', a str 'image_id' and an int
      'category_id', IDs are unique, and the referenced image and category
      exist
    * optionally (options.bFindUnusedImages with a non-empty options.baseDir),
      image files under baseDir that the database does not reference are listed
    * optionally (options.bCheckImageExistence / options.bCheckImageSizes),
      every image is passed to check_image_existence_and_size

    Args:
        jsonFile: a filename or an already-loaded json database (dict).
        options: a SanityCheckOptions-like object; a default-constructed
            SanityCheckOptions is used when None.  Fields read here: baseDir,
            bCheckImageSizes, bCheckImageExistence, bFindUnusedImages,
            bRequireLocation, iMaxNumImages.

    Returns:
        sortedCategories, data, errorInfo

        sortedCategories is the category list sorted by descending annotation
        count, data is the (loaded) database, and errorInfo is a dict with
        'unusedFiles' (paths relative to baseDir) and 'validationErrors'
        (baseDir-joined paths of images that failed the existence/size check).

    Raises:
        ValueError: if jsonFile is neither a dict nor a str.
        AssertionError: if any consistency check fails.

    Side effects:
        * options.bCheckImageExistence is forced to True when
          options.bCheckImageSizes is set.
        * The module-level defaultOptions object is overwritten with the
          base directory and check flags from options.
        * The database is modified in place: each image's 'file_name' is
          replaced by its os.path.normpath form, and a '_count' field
          (number of referencing annotations) is added to every category and
          every checked image.  This also applies to a dict passed by the caller.
    '''

    if options is None:
        options = SanityCheckOptions()

    # A size check is treated as implying an existence check
    if options.bCheckImageSizes:
        options.bCheckImageExistence = True

    print(options.__dict__)

    baseDir = options.baseDir


    ##%% Read .json file if necessary, sanity-check fields

    if isinstance(jsonFile, dict):
        data = jsonFile
    elif isinstance(jsonFile, str):
        assert os.path.isfile(jsonFile), '.json file {} does not exist'.format(jsonFile)
        print('Reading .json {} with base dir [{}]...'.format(
            jsonFile, baseDir))
        with open(jsonFile, 'r') as f:
            data = json.load(f)
    else:
        raise ValueError('Illegal value for jsonFile')

    images = data['images']
    annotations = data['annotations']
    categories = data['categories']
    assert 'info' in data, 'Database has no "info" field'

    bHaveBaseDir = len(baseDir) > 0
    if bHaveBaseDir:
        assert os.path.isdir(baseDir), 'Base directory {} does not exist'.format(baseDir)


    ##%% Build dictionaries, checking ID uniqueness and internal validity as we go

    imageIdToImage = {}
    annIdToAnn = {}
    catIdToCat = {}
    imageLocationSet = set()

    print('Checking categories...')

    for cat in tqdm(categories):

        # Confirm that required fields are present
        assert 'name' in cat, 'Category without a name'
        assert 'id' in cat, 'Category without an ID'

        assert isinstance(cat['id'], int), 'Illegal category ID type'
        assert isinstance(cat['name'], str), 'Illegal category name type'

        catId = cat['id']

        # Confirm ID uniqueness
        assert catId not in catIdToCat, 'Duplicate category ID {}'.format(catId)
        catIdToCat[catId] = cat
        cat['_count'] = 0

    # ...for each category

    print('\nChecking images...')

    # Only the local list is trimmed; data['images'] keeps every entry.  Annotations
    # that refer to an image beyond the trim point will fail the image_id check below.
    if options.iMaxNumImages > 0 and len(images) > options.iMaxNumImages:
        print('Trimming image list to {}'.format(options.iMaxNumImages))
        images = images[0:options.iMaxNumImages]

    imagePathsInJson = set()
    sequences = set()

    for image in tqdm(images):

        image['_count'] = 0

        # Confirm that required fields are present
        assert 'file_name' in image, 'Image without a file_name'
        assert 'id' in image, 'Image without an ID'

        # Check types before touching the values, so that a bad type produces
        # the intended assertion rather than a TypeError from os.path.normpath
        assert isinstance(image['file_name'], str), 'Illegal image filename type'
        assert isinstance(image['id'], str), 'Illegal image ID type'

        image['file_name'] = os.path.normpath(image['file_name'])
        imagePathsInJson.add(image['file_name'])

        imageId = image['id']

        # Confirm ID uniqueness
        assert imageId not in imageIdToImage, 'Duplicate image ID {}'.format(imageId)
        imageIdToImage[imageId] = image

        if 'height' in image:
            assert 'width' in image, 'Image with height but no width: {}'.format(image['id'])

        if 'width' in image:
            assert 'height' in image, 'Image with width but no height: {}'.format(image['id'])

        if options.bRequireLocation:
            assert 'location' in image, 'No location available for: {}'.format(image['id'])

        if 'location' in image:
            # We previously supported ints here; this should be strings now
            assert isinstance(image['location'], str), 'Illegal image location type'
            imageLocationSet.add(image['location'])

        if 'seq_id' in image:
            sequences.add(image['seq_id'])

        assert not ('sequence_id' in image or 'sequence' in image), 'Illegal sequence identifier'

    # ...for each image

    unusedFiles = []

    # Are we checking for unused images?
    if bHaveBaseDir and options.bFindUnusedImages:

        print('Enumerating images...')

        # Recursively enumerate images, as normalized paths relative to baseDir
        imagePaths = []
        for root, dirs, files in os.walk(baseDir):
            relDir = os.path.relpath(root, baseDir)
            for file in files:
                if file.lower().endswith(('.jpeg', '.jpg', '.png')):
                    # normpath also removes the leading "./" (or ".\") that
                    # results from joining with a relDir of "."
                    imagePaths.append(os.path.normpath(os.path.join(relDir, file)))

        unusedFiles = [p for p in imagePaths if p not in imagePathsInJson]

    validationErrors = []

    # Are we checking file existence and/or image size?
    if options.bCheckImageSizes or options.bCheckImageExistence:

        if not bHaveBaseDir:
            print('Warning: checking image sizes without a base directory, assuming "."')

        print('Checking image existence and/or image sizes...')

        # The context manager shuts the worker threads down when we're done, including
        # when a worker raises; by then every result has been consumed.
        with ThreadPool(nThreads) as pool:

            # NOTE(review): check_image_existence_and_size only receives the image,
            # so it presumably reads these settings from the module-level
            # defaultOptions - confirm against its definition.
            defaultOptions.baseDir = options.baseDir
            defaultOptions.bCheckImageSizes = options.bCheckImageSizes
            defaultOptions.bCheckImageExistence = options.bCheckImageExistence

            # imap() yields results in input order, so iImage indexes into images
            results = tqdm(pool.imap(check_image_existence_and_size, images), total=len(images))

            for iImage, r in enumerate(results):
                if not r:
                    print('Image validation error for image {}'.format(iImage))
                    validationErrors.append(
                        os.path.join(options.baseDir, images[iImage]['file_name']))

    print('Checking annotations...')

    nBoxes = 0

    for ann in tqdm(annotations):

        # Confirm that required fields are present
        assert 'image_id' in ann, 'Annotation without an image ID'
        assert 'id' in ann, 'Annotation without an ID'
        assert 'category_id' in ann, 'Annotation without a category ID'

        assert isinstance(ann['id'], str), 'Illegal annotation ID type'
        assert isinstance(ann['category_id'], int), 'Illegal annotation category ID type'
        assert isinstance(ann['image_id'], str), 'Illegal annotation image ID type'

        if 'bbox' in ann:
            nBoxes += 1

        annId = ann['id']

        # Confirm ID uniqueness
        assert annId not in annIdToAnn, 'Duplicate annotation ID {}'.format(annId)
        annIdToAnn[annId] = ann

        # Confirm validity
        assert ann['category_id'] in catIdToCat, \
            'Annotation {} refers to unknown category {}'.format(annId, ann['category_id'])
        assert ann['image_id'] in imageIdToImage, \
            'Annotation {} refers to unknown image {}'.format(annId, ann['image_id'])

        imageIdToImage[ann['image_id']]['_count'] += 1
        catIdToCat[ann['category_id']]['_count'] += 1

    # ...for each annotation


    ##%% Print statistics

    # Find un-annotated images and multi-annotation images
    nUnannotated = sum(1 for im in images if im['_count'] == 0)
    nMultiAnnotated = sum(1 for im in images if im['_count'] > 1)

    print('Found {} unannotated images, {} images with multiple annotations'.format(
        nUnannotated, nMultiAnnotated))

    if bHaveBaseDir and options.bFindUnusedImages:
        print('Found {} unused image files'.format(len(unusedFiles)))

    # Find unused categories
    nUnusedCategories = 0
    for cat in categories:
        if cat['_count'] == 0:
            print('Unused category: {}'.format(cat['name']))
            nUnusedCategories += 1

    print('Found {} unused categories'.format(nUnusedCategories))

    sequenceString = 'no sequence info'
    if len(sequences) > 0:
        sequenceString = '{} sequences'.format(len(sequences))

    print('\nDB contains {} images, {} annotations, {} bboxes, {} categories, {}\n'.format(
        len(images), len(annotations), nBoxes, len(categories), sequenceString))

    if len(imageLocationSet) > 0:
        print('DB contains images from {} locations\n'.format(len(imageLocationSet)))

    # Prints a list of categories sorted by count
    # https://stackoverflow.com/questions/72899/how-do-i-sort-a-list-of-dictionaries-by-a-value-of-the-dictionary
    sortedCategories = sorted(categories, key=itemgetter('_count'), reverse=True)

    print('Categories and counts:\n')

    for cat in sortedCategories:
        print('{:6} {}'.format(cat['_count'], cat['name']))

    print('')

    errorInfo = {}
    errorInfo['unusedFiles'] = unusedFiles
    errorInfo['validationErrors'] = validationErrors

    return sortedCategories, data, errorInfo