def find_repeat_detections()

in api/batch_processing/postprocessing/repeat_detection_elimination/repeat_detections_core.py
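
# Module-level imports used by this excerpt; in the source file these appear
# at the top of repeat_detections_core.py.  Helpers referenced below
# (load_api_results, is_sas_url, relative_sas_url, find_matches_in_directory,
# render_images_for_directory, render_bounding_box, update_detection_table,
# RepeatDetectionOptions, RepeatDetectionResults, DETECTION_INDEX_FILE_NAME)
# are defined or imported elsewhere in the module.

import os
import jsonpickle
import pandas as pd

from datetime import datetime
from itertools import compress
from joblib import Parallel, delayed
from tqdm import tqdm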


def find_repeat_detections(inputFilename, outputFilename=None, options=None):
    
    ##%% Input handling

    if options is None:
        
        options = RepeatDetectionOptions()

    # Validate some options
    
    if options.customDirNameFunction is not None:
        assert options.nDirLevelsFromLeaf == 0, 'Cannot mix custom dir name functions with nDirLevelsFromLeaf'
        
    if options.nDirLevelsFromLeaf != 0:
        assert options.customDirNameFunction is None, 'Cannot mix custom dir name functions with nDirLevelsFromLeaf'
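
    # For illustration (hypothetical, not part of this module): a
    # customDirNameFunction maps an image's relative path to the folder name
    # used to group images, e.g. grouping by the folder two levels above each
    # image's immediate directory:
    #
    # def sample_custom_dir_name(relative_path):
    #     tokens = relative_path.replace('\\', '/').split('/')
    #     return '/'.join(tokens[:-3])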
            
    if options.filterFileToLoad is not None and len(options.filterFileToLoad) > 0:
    
        print('Bypassing detection-finding, loading from {}'.format(options.filterFileToLoad))

        # Load the filtering file
        detectionIndexFileName = options.filterFileToLoad
        with open(detectionIndexFileName, 'r') as f:
            sIn = f.read()
        detectionInfo = jsonpickle.decode(sIn)
        filteringBaseDir = os.path.dirname(options.filterFileToLoad)
        suspiciousDetections = detectionInfo['suspiciousDetections']
        
        # Load the same options we used when finding repeat detections
        options = detectionInfo['options']
        
        # ...except for the fields that tell this function to skip
        # detection-finding on this pass.
        options.filterFileToLoad = detectionIndexFileName
        options.bWriteFilteringFolder = False
        options.bRenderHtml = False        
        
    # ...if we're loading from an existing filtering file
    
    toReturn = RepeatDetectionResults()

    
    # Create the output folder up front, so that problems with it surface
    # before we spend time finding repeat detections
    
    if options.bWriteFilteringFolder or options.bRenderHtml:
        assert options.outputBase is not None and len(options.outputBase) > 0
        os.makedirs(options.outputBase,exist_ok=True)


    # Load the input detection file

    detectionResults, otherFields = load_api_results(
        inputFilename, normalize_paths=True,
        filename_replacements=options.filenameReplacements)
    toReturn.detectionResults = detectionResults
    toReturn.otherFields = otherFields
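
    # detectionResults is a pandas DataFrame with one row per image; the
    # 'file' column holds each image's path relative to options.imageBase.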

    # detectionResults[detectionResults['failure'].notna()]
        
    # Before doing any real work, make sure we can *probably* access images.
    # This is just a cursory check on the first image, but it heads off most
    # problems related to incorrect mount points, etc.  Better to catch those
    # before spending 20 minutes finding repeat detections.
    
    if options.bWriteFilteringFolder or options.bRenderHtml:
        
        if not is_sas_url(options.imageBase):
            
            row = detectionResults.iloc[0]
            relativePath = row['file']
            for s, replacement in options.filenameReplacements.items():
                relativePath = relativePath.replace(s, replacement)
            absolutePath = os.path.join(options.imageBase,relativePath)
            assert os.path.isfile(absolutePath), 'Could not find file {}'.format(absolutePath)


    ##%% Separate files into directories

    # This will map each directory name to a list of rows, converted to
    # per-directory DataFrames below
    rowsByDirectory = {}

    # This is a mapping back into the rows of the original table
    filenameToRow = {}

    # TODO: in the case where we're loading an existing set of FPs after manual filtering,
    # we should load these data frames too, rather than re-building them from the input.

    print('Separating files into directories...')

    nCustomDirReplacements = 0
    
    # iRow = 0; row = detectionResults.iloc[0]
    for iRow, row in detectionResults.iterrows():
        
        relativePath = row['file']
        
        if options.customDirNameFunction is not None:
            basicDirName = os.path.dirname(relativePath.replace('\\','/'))
            dirName = options.customDirNameFunction(relativePath)
            if basicDirName != dirName:
                nCustomDirReplacements += 1
        else:
            dirName = os.path.dirname(relativePath)
        
        if len(dirName) == 0:
            assert options.nDirLevelsFromLeaf == 0, \
                "Can't use the nDirLevelsFromLeaf option with flat filenames"
        else:
            if options.nDirLevelsFromLeaf > 0:
                # Walk up the requested number of directory levels
                for _ in range(options.nDirLevelsFromLeaf):
                    dirName = os.path.dirname(dirName)
            assert len(dirName) > 0

        if dirName not in rowsByDirectory:
            rowsByDirectory[dirName] = []

        rowsByDirectory[dirName].append(row)

        assert relativePath not in filenameToRow, 'Duplicate filename {}'.format(relativePath)
        filenameToRow[relativePath] = iRow

    # ...for each image
    
    if options.customDirNameFunction is not None:
        print('Custom dir name function made {} replacements (of {} images)'.format(
            nCustomDirReplacements,len(detectionResults)))
        
    # Convert lists of rows to proper DataFrames
    dirs = list(rowsByDirectory.keys())
    for d in dirs:
        rowsByDirectory[d] = pd.DataFrame(rowsByDirectory[d])

    toReturn.rowsByDirectory = rowsByDirectory
    toReturn.filenameToRow = filenameToRow

    print('Finished separating {} files into {} directories'.format(len(detectionResults),
                                                                    len(rowsByDirectory)))
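
    # At this point we have, illustratively:
    #
    # rowsByDirectory = {'cam01': DataFrame(...), 'cam02': DataFrame(...), ...}
    # filenameToRow = {'cam01/IMG_0001.JPG': 0, 'cam01/IMG_0002.JPG': 1, ...}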


    ##%% Look for matches (or load them from file)

    dirsToSearch = list(rowsByDirectory.keys())
    if options.debugMaxDir > 0:
        dirsToSearch = dirsToSearch[0:options.debugMaxDir]

    # Are we actually looking for matches, or just loading from a file?
    if options.filterFileToLoad is None or len(options.filterFileToLoad) == 0:

        # length-nDirs list of lists of DetectionLocation objects
        suspiciousDetections = [None] * len(dirsToSearch)

        # We're actually looking for matches...
        print('Finding similar detections...')

        allCandidateDetections = [None] * len(dirsToSearch)
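
        # allCandidateDetections[iDir] will be a list of DetectionLocation
        # objects found in dirsToSearch[iDir]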

        if not options.bParallelizeComparisons:

            options.pbar = None
            # iDir = 0; dirName = dirsToSearch[iDir]
            # for iDir, dirName in enumerate(tqdm(dirsToSearch)):
            for iDir, dirName in enumerate(dirsToSearch):
                print('Processing dir {} of {}: {}'.format(iDir + 1, len(dirsToSearch), dirName))
                allCandidateDetections[iDir] = find_matches_in_directory(dirName, options, rowsByDirectory)

        else:

            options.pbar = tqdm(total=len(dirsToSearch))
            allCandidateDetections = Parallel(n_jobs=options.nWorkers, prefer='threads')(
                delayed(find_matches_in_directory)(dirName, options, rowsByDirectory)
                for dirName in tqdm(dirsToSearch))

        print('\nFinished looking for similar bounding boxes')

        ##%% Find suspicious locations based on match results

        print('Filtering out repeat detections...')

        nImagesWithSuspiciousDetections = 0
        nSuspiciousDetections = 0

        # For each directory
        #
        # iDir = 51
        for iDir in range(len(dirsToSearch)):

            # A list of DetectionLocation objects
            suspiciousDetectionsThisDir = []

            # A list of DetectionLocation objects
            candidateDetectionsThisDir = allCandidateDetections[iDir]

            for iLocation, candidateLocation in enumerate(candidateDetectionsThisDir):

                # candidateLocation.instances is a list of file/detection pairs
                # in which this location occurred
                nOccurrences = len(candidateLocation.instances)

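                # A location is only suspicious if the same box occurred in at
                # least occurrenceThreshold images within this directory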
                if nOccurrences < options.occurrenceThreshold:
                    continue

                nImagesWithSuspiciousDetections += nOccurrences
                nSuspiciousDetections += 1

                suspiciousDetectionsThisDir.append(candidateLocation)
                # (the images corresponding to this bounding box are rendered
                # later, if bRenderHtml or bWriteFilteringFolder is set)

            suspiciousDetections[iDir] = suspiciousDetectionsThisDir

        print(
            'Finished searching for repeat detections\nFound {} unique detections on {} images that are suspicious'.format(
                nSuspiciousDetections, nImagesWithSuspiciousDetections))

    else:

        assert len(suspiciousDetections) == len(dirsToSearch), \
            'Mismatch between the loaded filter file and the current input'

        nDetectionsRemoved = 0
        nDetectionsLoaded = 0

        # We're skipping detection-finding.  To determine which suspicious
        # detections the reviewer confirmed as false positives, we either look
        # for the rendered images in the filtering folder, or load an explicit
        # list of filenames from a text file (one filename per line).
        fileList = None
        if options.filteredFileListToLoad is not None:
            with open(options.filteredFileListToLoad) as f:
                fileList = f.readlines()
                fileList = [x.strip() for x in fileList]
            nSuspiciousDetections = sum([len(x) for x in suspiciousDetections])
            print('Loaded false positive list from file, will remove {} of {} suspicious detections'.format(
                len(fileList), nSuspiciousDetections))

        # For each directory
        # iDir = 0; detections = suspiciousDetections[0]
        #
        # suspiciousDetections is a list of lists of DetectionLocation objects,
        # one list per directory.
        for iDir, detections in enumerate(suspiciousDetections):

            bValidDetection = [True] * len(detections)
            nDetectionsLoaded += len(detections)

            # For each detection that was present before filtering
            # iDetection = 0; detection = detections[iDetection]
            for iDetection, detection in enumerate(detections):

                # Are we checking the filtering folder to see whether detections
                # were actually false positives, or reading from a list?
                if fileList is None:

                    # Is the sample image still there?  In the review workflow,
                    # deleting a rendered image marks its detection as
                    # legitimate, i.e. no longer suspicious.
                    imageFullPath = os.path.join(filteringBaseDir, detection.sampleImageRelativeFileName)

                    # If the image is gone, remove this location from the list
                    # of suspicious detections
                    if not os.path.isfile(imageFullPath):
                        nDetectionsRemoved += 1
                        bValidDetection[iDetection] = False

                else:
                    
                    if detection.sampleImageRelativeFileName not in fileList:
                        nDetectionsRemoved += 1
                        bValidDetection[iDetection] = False

            # ...for each detection

            nRemovedThisDir = len(bValidDetection) - sum(bValidDetection)
            if nRemovedThisDir > 0:
                print('Removed {} of {} detections from directory {}'.format(nRemovedThisDir,
                                                                             len(detections), iDir))

            detectionsFiltered = list(compress(detections, bValidDetection))
            suspiciousDetections[iDir] = detectionsFiltered

        # ...for each directory

        print('Removed {} of {} total detections via manual filtering'.format(nDetectionsRemoved, nDetectionsLoaded))

    # ...if we are/aren't finding detections (vs. loading from file)

    toReturn.suspiciousDetections = suspiciousDetections

    if options.bRenderHtml:

        # Render problematic locations to html

        print('Rendering html')

        nDirs = len(dirsToSearch)
        directoryHtmlFiles = [None] * nDirs

        if options.bParallelizeRendering:

            # options.pbar = tqdm(total=nDirs)
            options.pbar = None

            directoryHtmlFiles = Parallel(n_jobs=options.nWorkers, prefer='threads')(
                delayed(render_images_for_directory)(iDir, directoryHtmlFiles, suspiciousDetections, options)
                for iDir in tqdm(range(nDirs)))

        else:

            options.pbar = None

            # For each directory
            # iDir = 51
            for iDir in range(nDirs):
                # Add this directory to the master list of html files
                directoryHtmlFiles[iDir] = render_images_for_directory(iDir, directoryHtmlFiles, suspiciousDetections,
                                                                       options)

            # ...for each directory

        # Write master html file

        masterHtmlFile = os.path.join(options.outputBase, 'index.html')
        os.makedirs(options.outputBase, exist_ok=True)
        toReturn.masterHtmlFile = masterHtmlFile

        with open(masterHtmlFile, 'w') as fHtml:

            fHtml.write('<html><body>\n')
            fHtml.write('<h2><b>Repeat detections by directory</b></h2><br/>\n')

            for iDir, dirHtmlFile in enumerate(directoryHtmlFiles):

                if dirHtmlFile is None:
                    continue

                relPath = os.path.relpath(dirHtmlFile, options.outputBase)
                dirName = dirsToSearch[iDir]

                # Remove non-ASCII characters before writing to the html file
                relPath = relPath.encode('ascii', 'ignore').decode('ascii')
                dirName = dirName.encode('ascii', 'ignore').decode('ascii')

                fHtml.write('<a href="{}">{}</a><br/>\n'.format(relPath, dirName))

            fHtml.write('</body></html>\n')

    # ...if we're rendering html

    toReturn.allRowsFiltered = update_detection_table(toReturn, options, outputFilename)
    
    # Create filtering directory
    if options.bWriteFilteringFolder:

        print('Creating filtering folder...')

        dateString = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
        filteringDir = os.path.join(options.outputBase, 'filtering_' + dateString)
        os.makedirs(filteringDir, exist_ok=True)

        # iDir = 0; suspiciousDetectionsThisDir = suspiciousDetections[iDir]
        for iDir, suspiciousDetectionsThisDir in enumerate(tqdm(suspiciousDetections)):

            # suspiciousDetectionsThisDir is a list of DetectionLocation objects
            # iDetection = 0; detection = suspiciousDetectionsThisDir[0]
            for iDetection, detection in enumerate(suspiciousDetectionsThisDir):
                
                instance = detection.instances[0]
                relativePath = instance.filename
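                # The first instance serves as this location's sample image;
                # the output filename encodes directory index, detection index,
                # and occurrence count, e.g. dir0012_det0003_n0027.jpg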
                outputRelativePath = 'dir{:0>4d}_det{:0>4d}_n{:0>4d}.jpg'.format(
                    iDir, iDetection, len(detection.instances))
                outputFullPath = os.path.join(filteringDir, outputRelativePath)
                
                if is_sas_url(options.imageBase):
                    inputFullPath = relative_sas_url(options.imageBase, relativePath)
                else:
                    inputFullPath = os.path.join(options.imageBase, relativePath)
                    assert os.path.isfile(inputFullPath), 'Not a file: {}'.format(inputFullPath)
                    
                try:
                    render_bounding_box(detection, inputFullPath, outputFullPath,
                                        lineWidth=options.lineThickness, expansion=options.boxExpansion)
                except Exception as e:
                    print('Warning: error rendering bounding box from {} to {}: {}'.format(
                        inputFullPath,outputFullPath,e))                    
                    if options.bFailOnRenderError:
                        raise
                detection.sampleImageRelativeFileName = outputRelativePath

        # Write out the detection index
        detectionIndexFileName = os.path.join(filteringDir, DETECTION_INDEX_FILE_NAME)
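        # jsonpickle serializes the DetectionLocation objects along with the
        # options used on this pass, so a later run can reload exactly this
        # state via options.filterFileToLoad (see the top of this function)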
        jsonpickle.set_encoder_options('json', sort_keys=True, indent=4)
        detectionInfo = {}
        detectionInfo['suspiciousDetections'] = suspiciousDetections
        options.pbar = None
        detectionInfo['options'] = options
        s = jsonpickle.encode(detectionInfo,make_refs=False)
        with open(detectionIndexFileName, 'w') as f:
            f.write(s)
        toReturn.filterFile = detectionIndexFileName

        print('Done')

    # ...if we're writing filtering info

    return toReturn
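
# Example usage (a minimal sketch; the paths and option values here are
# hypothetical, but the attributes match those referenced above):
#
# options = RepeatDetectionOptions()
# options.imageBase = '/data/camera_traps'
# options.outputBase = '/data/rde_output'
# options.occurrenceThreshold = 15
# options.bWriteFilteringFolder = True
#
# results = find_repeat_detections('detections.json',
#                                  'detections_filtered.json',
#                                  options=options)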