in api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py
def find_repeat_detections(inputFilename, outputFilename=None, options=None):

    ##%% Input handling

    if options is None:
        options = RepeatDetectionOptions()

    # Validate some options
    if options.customDirNameFunction is not None:
        assert options.nDirLevelsFromLeaf == 0, \
            'Cannot mix custom dir name functions with nDirLevelsFromLeaf'
    if options.nDirLevelsFromLeaf != 0:
        assert options.customDirNameFunction is None, \
            'Cannot mix custom dir name functions with nDirLevelsFromLeaf'
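    # A custom dir name function maps a relative image path to the "directory"
    # used for grouping detections. As a purely illustrative (hypothetical)
    # example, grouping by the grandparent folder might look like:
    #
    # def custom_dir_name(relative_path):
    #     return os.path.dirname(os.path.dirname(relative_path.replace('\\', '/')))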
    if options.filterFileToLoad is not None and len(options.filterFileToLoad) > 0:

        print('Bypassing detection-finding, loading from {}'.format(options.filterFileToLoad))

        # Load the filtering file
        detectionIndexFileName = options.filterFileToLoad
        with open(detectionIndexFileName, 'r') as f:
            sIn = f.read()
        detectionInfo = jsonpickle.decode(sIn)
        filteringBaseDir = os.path.dirname(options.filterFileToLoad)
        suspiciousDetections = detectionInfo['suspiciousDetections']

        # Load the same options we used when finding repeat detections...
        options = detectionInfo['options']

        # ...except for the fields that explicitly tell this function not to
        # find repeat detections again.
        options.filterFileToLoad = detectionIndexFileName
        options.bWriteFilteringFolder = False
        options.bRenderHtml = False

    # ...if we're loading from an existing filtering file
    toReturn = RepeatDetectionResults()

    # Check early to avoid problems with the output folder
    if options.bWriteFilteringFolder or options.bRenderHtml:
        assert options.outputBase is not None and len(options.outputBase) > 0
        os.makedirs(options.outputBase, exist_ok=True)

    # Load detection results
    detectionResults, otherFields = load_api_results(
        inputFilename, normalize_paths=True,
        filename_replacements=options.filenameReplacements)
    toReturn.detectionResults = detectionResults
    toReturn.otherFields = otherFields
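    # detectionResults is a DataFrame with one row per image; the 'file' column
    # holds each image's relative path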
    # detectionResults[detectionResults['failure'].notna()]

    # Before doing any real work, make sure we can *probably* access images.
    # This is just a cursory check on the first image, but it heads off most
    # problems related to incorrect mount points, etc. Better to do this before
    # spending 20 minutes finding repeat detections.
    if options.bWriteFilteringFolder or options.bRenderHtml:
        if not is_sas_url(options.imageBase):
            row = detectionResults.iloc[0]
            relativePath = row['file']
            for s in options.filenameReplacements.keys():
                relativePath = relativePath.replace(s, options.filenameReplacements[s])
            absolutePath = os.path.join(options.imageBase, relativePath)
            assert os.path.isfile(absolutePath), 'Could not find file {}'.format(absolutePath)
    ##%% Separate files into directories

    # This will be a map from a directory name to smaller data frames
    rowsByDirectory = {}

    # This is a mapping back into the rows of the original table
    filenameToRow = {}

    # TODO: in the case where we're loading an existing set of FPs after manual filtering,
    # we should load these data frames too, rather than re-building them from the input.

    print('Separating files into directories...')

    nCustomDirReplacements = 0

    # iRow = 0; row = detectionResults.iloc[0]
    for iRow, row in detectionResults.iterrows():

        relativePath = row['file']

        if options.customDirNameFunction is not None:
            basicDirName = os.path.dirname(relativePath.replace('\\', '/'))
            dirName = options.customDirNameFunction(relativePath)
            if basicDirName != dirName:
                nCustomDirReplacements += 1
        else:
            dirName = os.path.dirname(relativePath)

        if len(dirName) == 0:
            assert options.nDirLevelsFromLeaf == 0, \
                "Can't use the nDirLevelsFromLeaf option with flat filenames"
        else:
            if options.nDirLevelsFromLeaf > 0:
                iLevel = 0
                while iLevel < options.nDirLevelsFromLeaf:
                    iLevel += 1
                    dirName = os.path.dirname(dirName)
            assert len(dirName) > 0
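        # Illustration (hypothetical path): with nDirLevelsFromLeaf == 1, an image
        # at 'site1/cam3/img0001.jpg' is grouped under 'site1' rather than
        # 'site1/cam3'; each additional level strips one more directory.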
        if dirName not in rowsByDirectory:
            # Start a new list of rows for this directory; we convert these
            # lists to DataFrames after the loop
            rowsByDirectory[dirName] = []

        rowsByDirectory[dirName].append(row)

        assert relativePath not in filenameToRow
        filenameToRow[relativePath] = iRow

    # ...for each image
    if options.customDirNameFunction is not None:
        print('Custom dir name function made {} replacements (of {} images)'.format(
            nCustomDirReplacements, len(detectionResults)))

    # Convert lists of rows to proper DataFrames
    dirs = list(rowsByDirectory.keys())
    for d in dirs:
        rowsByDirectory[d] = pd.DataFrame(rowsByDirectory[d])

    toReturn.rowsByDirectory = rowsByDirectory
    toReturn.filenameToRow = filenameToRow

    print('Finished separating {} files into {} directories'.format(
        len(detectionResults), len(rowsByDirectory)))
    ##%% Look for matches (or load them from file)

    dirsToSearch = list(rowsByDirectory.keys())
    if options.debugMaxDir > 0:
        dirsToSearch = dirsToSearch[0:options.debugMaxDir]

    # Are we actually looking for matches, or just loading from a file?
    if options.filterFileToLoad is None or len(options.filterFileToLoad) == 0:

        # length-nDirs list of lists of DetectionLocation objects
        suspiciousDetections = [None] * len(dirsToSearch)

        # We're actually looking for matches...
        print('Finding similar detections...')

        allCandidateDetections = [None] * len(dirsToSearch)

        if not options.bParallelizeComparisons:
            options.pbar = None
            # iDir = 0; dirName = dirsToSearch[iDir]
            # for iDir, dirName in enumerate(tqdm(dirsToSearch)):
            for iDir, dirName in enumerate(dirsToSearch):
                print('Processing dir {} of {}: {}'.format(iDir, len(dirsToSearch), dirName))
                allCandidateDetections[iDir] = \
                    find_matches_in_directory(dirName, options, rowsByDirectory)
        else:
            options.pbar = tqdm(total=len(dirsToSearch))
            allCandidateDetections = Parallel(n_jobs=options.nWorkers, prefer='threads')(
                delayed(find_matches_in_directory)(dirName, options, rowsByDirectory)
                for dirName in tqdm(dirsToSearch))

        print('\nFinished looking for similar bounding boxes')
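        # allCandidateDetections now holds, for each directory, a list of candidate
        # DetectionLocation objects; each candidate carries the file/detection
        # instances that matched it, which we threshold below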
        ##%% Find suspicious locations based on match results

        print('Filtering out repeat detections...')

        nImagesWithSuspiciousDetections = 0
        nSuspiciousDetections = 0

        # For each directory...
        #
        # iDir = 51
        for iDir in range(len(dirsToSearch)):

            # The candidates we decide are suspicious (DetectionLocation objects)
            suspiciousDetectionsThisDir = []

            # All candidates for this directory (DetectionLocation objects)
            candidateDetectionsThisDir = allCandidateDetections[iDir]

            for iLocation, candidateLocation in enumerate(candidateDetectionsThisDir):

                # candidateLocation.instances is a list of file/detection pairs
                nOccurrences = len(candidateLocation.instances)

                if nOccurrences < options.occurrenceThreshold:
                    continue

                nImagesWithSuspiciousDetections += nOccurrences
                nSuspiciousDetections += 1

                suspiciousDetectionsThisDir.append(candidateLocation)

                # Find the images corresponding to this bounding box, render boxes

            suspiciousDetections[iDir] = suspiciousDetectionsThisDir

        print('Finished searching for repeat detections\n'
              'Found {} unique detections on {} images that are suspicious'.format(
                  nSuspiciousDetections, nImagesWithSuspiciousDetections))
    else:

        assert len(suspiciousDetections) == len(dirsToSearch)

        nDetectionsRemoved = 0
        nDetectionsLoaded = 0

        # We're skipping detection-finding, but to determine which images are actually
        # legitimate false positives, we may be looking for physical files or loading
        # from a text file.
        fileList = None
        if options.filteredFileListToLoad is not None:
            with open(options.filteredFileListToLoad) as f:
                fileList = f.readlines()
            fileList = [x.strip() for x in fileList]
            nSuspiciousDetections = sum([len(x) for x in suspiciousDetections])
            print('Loaded false positive list from file, will remove {} of {} suspicious detections'.format(
                len(fileList), nSuspiciousDetections))
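            # Entries in fileList are compared against each detection's
            # sampleImageRelativeFileName below, so the file is expected to
            # contain one sample-image filename per line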
        # For each directory...
        #
        # iDir = 0; detections = suspiciousDetections[0]
        #
        # suspiciousDetections is a list of lists of DetectionLocation objects,
        # one list per directory.
        for iDir, detections in enumerate(suspiciousDetections):

            bValidDetection = [True] * len(detections)
            nDetectionsLoaded += len(detections)

            # For each detection that was present before filtering...
            #
            # iDetection = 0; detection = detections[iDetection]
            for iDetection, detection in enumerate(detections):

                # Are we checking the directory to see whether detections were actually
                # false positives, or reading from a list?
                if fileList is None:

                    # Is the image still there?
                    imageFullPath = os.path.join(filteringBaseDir, detection.sampleImageRelativeFileName)

                    # If not, remove this from the list of suspicious detections
                    if not os.path.isfile(imageFullPath):
                        nDetectionsRemoved += 1
                        bValidDetection[iDetection] = False

                else:

                    if detection.sampleImageRelativeFileName not in fileList:
                        nDetectionsRemoved += 1
                        bValidDetection[iDetection] = False

            # ...for each detection

            nRemovedThisDir = len(bValidDetection) - sum(bValidDetection)
            if nRemovedThisDir > 0:
                print('Removed {} of {} detections from directory {}'.format(
                    nRemovedThisDir, len(detections), iDir))

            detectionsFiltered = list(compress(detections, bValidDetection))
            suspiciousDetections[iDir] = detectionsFiltered

        # ...for each directory

        print('Removed {} of {} total detections via manual filtering'.format(
            nDetectionsRemoved, nDetectionsLoaded))

    # ...if we are/aren't finding detections (vs. loading from file)
    toReturn.suspiciousDetections = suspiciousDetections

    if options.bRenderHtml:

        # Render problematic locations to html
        print('Rendering html')

        nDirs = len(dirsToSearch)
        directoryHtmlFiles = [None] * nDirs

        if options.bParallelizeRendering:

            # options.pbar = tqdm(total=nDirs)
            options.pbar = None

            directoryHtmlFiles = Parallel(n_jobs=options.nWorkers, prefer='threads')(
                delayed(render_images_for_directory)(iDir, directoryHtmlFiles, suspiciousDetections, options)
                for iDir in tqdm(range(nDirs)))

        else:

            options.pbar = None

            # For each directory...
            #
            # iDir = 51
            for iDir in range(nDirs):
                # Add this directory to the master list of html files
                directoryHtmlFiles[iDir] = render_images_for_directory(
                    iDir, directoryHtmlFiles, suspiciousDetections, options)

            # ...for each directory

        # Write master html file
        masterHtmlFile = os.path.join(options.outputBase, 'index.html')
        os.makedirs(options.outputBase, exist_ok=True)
        toReturn.masterHtmlFile = masterHtmlFile

        with open(masterHtmlFile, 'w') as fHtml:

            fHtml.write('<html><body>\n')
            fHtml.write('<h2><b>Repeat detections by directory</b></h2><br/>\n')

            for iDir, dirHtmlFile in enumerate(directoryHtmlFiles):

                if dirHtmlFile is None:
                    continue

                relPath = os.path.relpath(dirHtmlFile, options.outputBase)
                dirName = dirsToSearch[iDir]

                # Remove non-ASCII characters before writing to the html file
                relPath = relPath.encode('ascii', 'ignore').decode('ascii')
                dirName = dirName.encode('ascii', 'ignore').decode('ascii')

                fHtml.write('<a href="{}">{}</a><br/>\n'.format(relPath, dirName))

            fHtml.write('</body></html>\n')

    # ...if we're rendering html
    toReturn.allRowsFiltered = update_detection_table(toReturn, options, outputFilename)

    # Create filtering directory
    if options.bWriteFilteringFolder:

        print('Creating filtering folder...')

        dateString = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
        filteringDir = os.path.join(options.outputBase, 'filtering_' + dateString)
        os.makedirs(filteringDir, exist_ok=True)

        # iDir = 0; suspiciousDetectionsThisDir = suspiciousDetections[iDir]
        for iDir, suspiciousDetectionsThisDir in enumerate(tqdm(suspiciousDetections)):

            # suspiciousDetectionsThisDir is a list of DetectionLocation objects
            #
            # iDetection = 0; detection = suspiciousDetectionsThisDir[0]
            for iDetection, detection in enumerate(suspiciousDetectionsThisDir):

                instance = detection.instances[0]
                relativePath = instance.filename

                outputRelativePath = 'dir{:0>4d}_det{:0>4d}_n{:0>4d}.jpg'.format(
                    iDir, iDetection, len(detection.instances))
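                # e.g. 'dir0012_det0003_n0005.jpg' for the fourth detection in the
                # thirteenth directory, matched in five images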
                outputFullPath = os.path.join(filteringDir, outputRelativePath)

                if is_sas_url(options.imageBase):
                    inputFullPath = relative_sas_url(options.imageBase, relativePath)
                else:
                    inputFullPath = os.path.join(options.imageBase, relativePath)
                    assert os.path.isfile(inputFullPath), 'Not a file: {}'.format(inputFullPath)

                try:
                    render_bounding_box(detection, inputFullPath, outputFullPath,
                                        lineWidth=options.lineThickness,
                                        expansion=options.boxExpansion)
                except Exception as e:
                    print('Warning: error rendering bounding box from {} to {}: {}'.format(
                        inputFullPath, outputFullPath, e))
                    if options.bFailOnRenderError:
                        raise

                detection.sampleImageRelativeFileName = outputRelativePath
        # Write out the detection index
        detectionIndexFileName = os.path.join(filteringDir, DETECTION_INDEX_FILE_NAME)
        jsonpickle.set_encoder_options('json', sort_keys=True, indent=4)
        detectionInfo = {}
        detectionInfo['suspiciousDetections'] = suspiciousDetections

        # Don't try to serialize the progress bar
        options.pbar = None
        detectionInfo['options'] = options

        s = jsonpickle.encode(detectionInfo, make_refs=False)
        with open(detectionIndexFileName, 'w') as f:
            f.write(s)
        toReturn.filterFile = detectionIndexFileName
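        # A later run can point options.filterFileToLoad at this index file (after
        # manual review of the filtering folder) to skip detection-finding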
        print('Done')

    # ...if we're writing filtering info

    return toReturn