// source/depth_estimation/Derp.cpp
/**
* Copyright 2004-present Facebook. All Rights Reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "source/depth_estimation/Derp.h"
#include <random>
#include <boost/algorithm/string/predicate.hpp>
#include <glog/logging.h>
#include <folly/Format.h>
#include "source/depth_estimation/TemporalBilateralFilter.h"
#include "source/util/ImageUtil.h"
#include "source/util/ThreadPool.h"
using namespace fb360_dep::cv_util;
using namespace fb360_dep::image_util;
namespace fb360_dep {
namespace depth_estimation {
void plotMatches(
const PyramidLevel<PixelType>& pyramidLevel,
const std::string& caller,
const filesystem::path& debugDir) {
if (debugDir.empty() || kDebugPlotMatchDst.empty() ||
kDebugPlotMatchLevel != pyramidLevel.level) {
return;
}
LOG(INFO) << folly::sformat("Plotting matches for {}...", kDebugPlotMatchDst);
const int dstIdx = pyramidLevel.findDstIdx(kDebugPlotMatchDst);
const Camera& camDst = pyramidLevel.rigDst[dstIdx];
const cv::Mat_<float>& disparity = pyramidLevel.dstDisparity(dstIdx);
const int xSize = disparity.cols;
const int ySize = disparity.rows;
if (0 <= kDebugPlotMatchX && 0 <= kDebugPlotMatchY) {
CHECK(kDebugPlotMatchX < xSize && kDebugPlotMatchY < ySize) << folly::sformat(
"debug coords ({}, {}) out of bounds: ({}, {})",
kDebugPlotMatchX,
kDebugPlotMatchY,
xSize,
ySize);
}
for (int srcIdx = 0; srcIdx < int(pyramidLevel.rigSrc.size()); ++srcIdx) {
const Camera& camSrc = pyramidLevel.rigSrc[srcIdx];
const cv::Mat_<PixelType>& srcColor = pyramidLevel.srcColor(srcIdx);
const cv::Mat_<PixelType>& dstColor = pyramidLevel.dstColor(dstIdx);
plotDstPointInSrc(
camDst,
kDebugPlotMatchX,
kDebugPlotMatchY,
disparity(kDebugPlotMatchY, kDebugPlotMatchX),
camSrc,
srcColor,
dstColor,
debugDir,
caller);
}
}
void getPyramidLevelSizes(std::map<int, cv::Size>& sizes, const filesystem::path& imageDir) {
if (!boost::filesystem::exists(imageDir)) {
return;
}
// Use the first image we find at each level
const bool includeHidden = false;
for (const auto& entry : filesystem::directory_iterator(imageDir)) {
const bool isDir = filesystem::is_directory(entry);
const filesystem::path p = entry.path();
const bool isHidden = filesystem::isHidden(p);
if (!isDir || isHidden) {
continue;
}
const std::string levelDelim = "level_";
const std::string filename = p.filename().string();
if (!boost::starts_with(filename, levelDelim)) {
continue;
}
const std::string levelStr = filename.substr(levelDelim.size());
const filesystem::path imageFn = filesystem::getFirstFile(p, includeHidden, false, "", ".tar");
if (imageFn.empty()) {
continue;
}
const cv::Mat_<float> image = cv_util::loadImage<float>(imageFn);
sizes[std::stoi(levelStr)] = image.size();
}
}
// equivalent to TYPE NAME[SIZE] but sized at runtime
#define STACK_ARRAY(TYPE, NAME, SIZE) TYPE* const NAME = (TYPE*)alloca(sizeof(TYPE) * (SIZE))
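// NOTE: the alloca storage is only reclaimed when the enclosing function
// returns, so this macro should not be used inside long-running loops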
std::tuple<float, float> computeCost(
const PyramidLevel<PixelType>& pyramidLevel,
const int dstIdx,
const float disparity,
const int x,
const int y) {
// For a given (x, y, depth) in dst, we find the corresponding (x, y) in src,
// and then its reprojection into dst where src and dst are aligned up to
// translation. There we can extract a square patch from src and projected dst
// to compute the cost.
// For a given dst, we do this for every src
// Note that src and dst align up to translation when projected to infinity.
// Also note that when looking out to the world from the center of dst the
// color does not change with depth, so dst does not need to be transformed
//
// (1) pDst -> (2) pWorld -> (3) pSrc -> (4) pInf -> (5) pDstSrc
//
// (4)
// ... ...
// | |
// | |
// | |
// | |
// | |
// | |
// | (2) |
// | / |_ |
// ______|______/_ |_ ___|___________
// | | / | ||__ | |
// | | (1) | | || |
// | (5) | | (3) |
// | | | |
// | | | |
// |______________| |______________|
// dst src
// (1) pDst = (x, y)
// (2) get pWorld
const cv::Mat_<PixelType>& dstColor = pyramidLevel.dstProjColor(dstIdx);
const Camera& camDst = pyramidLevel.rigDst[dstIdx];
const Camera::Vector3 pWorld =
dstToWorldPoint(camDst, x, y, disparity, dstColor.cols, dstColor.rows);
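// pWorld is the 3D point seen by dst at (x, y); throughout this file
// disparity = 1 / depth in meters (see computeBruteForceDisparity)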
// Compute SSD between dst and projected src for each src
using SSDPair = std::pair<float, float>;
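// Presumed layout: first = biased SSD (used for ranking), second = unbiased
// SSD (accumulated into the final cost); see the comment above nth_element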
STACK_ARRAY(SSDPair, SSDs, pyramidLevel.rigSrc.size());
int ssdCount = 0;
const cv::Mat_<PixelType>& dstColorBias = pyramidLevel.dstProjColorBias(dstIdx);
for (ssize_t srcIdx = 0; srcIdx < ssize(pyramidLevel.rigSrc); ++srcIdx) {
// No SSD if src = dst
if (srcIdx == pyramidLevel.dst2srcIdxs[dstIdx]) {
continue;
}
// (3) get pSrc
const Camera& camSrc = pyramidLevel.rigSrc[srcIdx];
const cv::Size& srcSize = pyramidLevel.srcColor(srcIdx).size();
Camera::Vector2 pSrc;
if (!worldToSrcPoint(pSrc, pWorld, camSrc, srcSize.width, srcSize.height)) {
continue;
}
// Exclude a half-texel band to simulate proper clamp-to-border semantics
const bool kExcludeHalfTexel = false;
if (kExcludeHalfTexel) {
if (pSrc.x() < 0.5 || srcSize.width - 0.5 < pSrc.x() || pSrc.y() < 0.5 ||
srcSize.height - 0.5 < pSrc.y()) {
continue;
}
}
// (3) -> (4) -> (5) mapping from pre-computed projection warp
const cv::Mat_<cv::Vec2f>& dstProjWarp = pyramidLevel.dstProjWarp(dstIdx, srcIdx);
const cv::Vec2f pDstSrc = cv_util::getPixelBilinear(dstProjWarp, pSrc.x(), pSrc.y());
// Check if pDstSrc is within bounds
const float xDstSrc = pDstSrc[0] + 0.5; // pDstSrc uses opencv coordinate convention
const float yDstSrc = pDstSrc[1] + 0.5;
if (std::isnan(xDstSrc) || std::isnan(yDstSrc)) {
continue;
}
// NOTE: bias of src projected into dst (srcBias) is the average of its
// patch values, which are bilinearly interpolated from floating point
// projected coordinates (pDstSrc). This is mathematically different from
// bilinearly interpolating the pre-computed projected bias around pDstSrc,
// because we are grabbing biases from neighboring footprints, but it seems
// to produce very similar results
const cv::Mat_<PixelType>& dstSrcColorBias = pyramidLevel.dstProjColorBias(dstIdx, srcIdx);
const PixelType dstSrcBias = cv_util::getPixelBilinear(dstSrcColorBias, xDstSrc, yDstSrc);
const PixelType& dstBias = dstColorBias(y, x);
const cv::Mat_<PixelType>& dstSrcColor = pyramidLevel.dstProjColor(dstIdx, srcIdx);
std::pair<float, float> ssd = computeSSD(
dstColor, x, y, dstBias, dstSrcColor, xDstSrc, yDstSrc, dstSrcBias, kSearchWindowRadius);
SSDs[ssdCount] = ssd;
++ssdCount;
}
int keep = kMinOverlappingCams - 1;
if (ssdCount < keep) {
return std::make_tuple(FLT_MAX, 0.0f); // not enough cameras see this disparity, skip
}
// Add up unbiased SSDs for all but the two patches with the worst biased SSDs
keep = std::max<int>(keep, ssdCount - 2);
std::nth_element(SSDs, SSDs + keep, SSDs + ssdCount);
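// std::pair orders lexicographically, so nth_element ranks the patches
// primarily by the first (biased) SSD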
float cost = 0;
for (int i = 0; i < keep; ++i) {
cost += SSDs[i].second;
}
cost /= keep;
// Trust costs when more cameras are involved
// This also means that we penalize closeup proposals (closer to camera rig
// means fewer cameras see that point)
const float trustCoef = 1.0f / keep;
const float dstVariance = pyramidLevel.dstVariance(dstIdx)(y, x);
const float confidence = std::max(dstVariance, kMinVar);
const float costFinal = cost * trustCoef / confidence;
return std::make_tuple(costFinal, confidence);
}
// Creates a cost map where each (x, y) has a cost calculated from all the
// source cameras
void computeBruteForceCosts(
PyramidLevel<PixelType>& pyramidLevel,
const int dstIdx,
const float disparity,
cv::Mat_<float>& costMap,
cv::Mat_<float>& confidenceMap) {
CHECK_EQ(costMap.size(), pyramidLevel.sizeLevel);
CHECK_EQ(costMap.size(), confidenceMap.size());
// When using background disparity, foreground pixels must be closer than background
const cv::Mat_<bool> closerMask = pyramidLevel.hasForegroundMasks
? (pyramidLevel.dstBackgroundDisparity(dstIdx) < disparity)
: cv::Mat_<bool>(pyramidLevel.dstForegroundMask(dstIdx).size(), true); // all-pass
// Skip the margins: the full dst patch around (x, y) must fit in the image
const int radius = kSearchWindowRadius;
for (int y = radius; y < costMap.rows - radius; ++y) {
for (int x = radius; x < costMap.cols - radius; ++x) {
// Ignore if outside FOV or background pixel or foreground is farther than background
const bool ignore = !pyramidLevel.dstFovMask(dstIdx)(y, x) ||
!pyramidLevel.dstForegroundMask(dstIdx)(y, x) || !closerMask(y, x);
if (ignore) {
costMap(y, x) = NAN;
confidenceMap(y, x) = NAN;
} else {
std::tie(costMap(y, x), confidenceMap(y, x)) =
computeCost(pyramidLevel, dstIdx, disparity, x, y);
}
}
}
}
// Brute force: find disparity with lowest cost at each location, typically at
// coarsest level of the pyramid
void computeBruteForceDisparity(
PyramidLevel<PixelType>& pyramidLevel,
const int dstIdx,
const float minDepthMeters,
const float maxDepthMeters,
const bool partialCoverage,
const bool useForegroundMasks,
const int numThreads) {
cv::Mat_<float>& dstDisparity = pyramidLevel.dstDisparity(dstIdx);
cv::Mat_<float>& dstCosts = pyramidLevel.dstCost(dstIdx);
cv::Mat_<float>& dstConfidences = pyramidLevel.dstConfidence(dstIdx);
LOG(INFO) << "Computing initial costs at " << pyramidLevel.sizeLevel
<< folly::sformat(" ({})", pyramidLevel.rigDst[dstIdx].id);
std::vector<float> disparities(kNumDepths);
const float minDisparity = 1.0f / maxDepthMeters;
const float maxDisparity = 1.0f / minDepthMeters;
for (int i = 0; i < kNumDepths; ++i) {
const float d = probeDisparity(i, kNumDepths, minDisparity, maxDisparity);
disparities[i] = d;
}
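// NOTE: candidates are generated in disparity space (between 1/maxDepth and
// 1/minDepth), which concentrates depth resolution near the rig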
// Create a cost map for each possible disparity
ThreadPool threadPool(numThreads);
std::vector<cv::Mat_<float>> costs(disparities.size());
std::vector<cv::Mat_<float>> confidences(disparities.size());
for (int iDisparity = 0; iDisparity < int(disparities.size()); ++iDisparity) {
costs[iDisparity].create(pyramidLevel.sizeLevel);
costs[iDisparity].setTo(NAN);
confidences[iDisparity].create(pyramidLevel.sizeLevel);
confidences[iDisparity].setTo(NAN);
threadPool.spawn(
&computeBruteForceCosts,
std::ref(pyramidLevel),
dstIdx,
disparities[iDisparity],
std::ref(costs[iDisparity]),
std::ref(confidences[iDisparity]));
}
threadPool.join();
// Get the best cost at each location
// We have one cost per disparity at each location
// Skip the margins: the full dst patch around (x, y) must fit in the image
const int margin = kSearchWindowRadius;
for (int y = margin; y < dstDisparity.rows - margin; ++y) {
for (int x = margin; x < dstDisparity.cols - margin; ++x) {
if (!pyramidLevel.dstFovMask(dstIdx)(y, x)) { // outside FOV
dstDisparity(y, x) = NAN;
continue;
}
// Use background value if we're outside the foreground mask
if (!pyramidLevel.dstForegroundMask(dstIdx)(y, x)) {
dstDisparity(y, x) = pyramidLevel.dstBackgroundDisparity(dstIdx)(y, x);
continue;
}
float minCost = FLT_MAX;
float minCostConfidence = 0;
int bestDisparityIdx = -1;
for (int i = 0; i < int(costs.size()); ++i) {
const float cost = costs[i](y, x);
if (cost < minCost) {
minCost = cost;
minCostConfidence = confidences[i](y, x);
bestDisparityIdx = i;
}
}
if (bestDisparityIdx == -1) {
// This can only happen if we're outside the overlapping area when we
// have partial coverage or due to noise in foreground masks
std::string warning = folly::sformat(
"Insufficient coverage at {} ({}, {}) ", pyramidLevel.rigDst[dstIdx].id, x, y);
CHECK(partialCoverage || useForegroundMasks) << warning;
const std::string partialCoverageFailure = "due to partial coverage";
const std::string foregroundMaskFailure = "due to noisy foreground masks";
warning += partialCoverage ? partialCoverageFailure : "";
warning += (partialCoverage && useForegroundMasks) ? " or " : "";
warning += useForegroundMasks ? foregroundMaskFailure : "";
LOG(WARNING) << warning;
dstDisparity(y, x) = minDisparity;
} else {
dstDisparity(y, x) = disparities[bestDisparityIdx];
}
dstCosts(y, x) = minCost;
dstConfidences(y, x) = minCostConfidence;
}
}
// Extend disparities to margin
if (margin > 0) {
for (int y = 0; y < dstDisparity.rows; ++y) {
for (int x = 0; x < dstDisparity.cols; ++x) {
if (x < margin || x >= dstDisparity.cols - margin || y < margin ||
y >= dstDisparity.rows - margin) {
// Use background value if we're outside the foreground mask
if (!pyramidLevel.dstForegroundMask(dstIdx)(y, x)) {
dstDisparity(y, x) = pyramidLevel.dstBackgroundDisparity(dstIdx)(y, x);
continue;
}
dstDisparity(y, x) = dstDisparity(
math_util::clamp(y, margin, dstDisparity.rows - margin - 1),
math_util::clamp(x, margin, dstDisparity.cols - margin - 1));
dstCosts(y, x) = dstCosts(
math_util::clamp(y, margin, dstCosts.rows - margin - 1),
math_util::clamp(x, margin, dstCosts.cols - margin - 1));
dstConfidences(y, x) = dstConfidences(
math_util::clamp(y, margin, dstConfidences.rows - margin - 1),
math_util::clamp(x, margin, dstConfidences.cols - margin - 1));
}
}
}
}
}
void computeBruteForceDisparities(
PyramidLevel<PixelType>& pyramidLevel,
const float minDepthMeters,
const float maxDepthMeters,
const bool partialCoverage,
const bool useForegroundMasks,
const int numThreads) {
for (int dstIdx = 0; dstIdx < int(pyramidLevel.rigDst.size()); ++dstIdx) {
computeBruteForceDisparity(
pyramidLevel,
dstIdx,
minDepthMeters,
maxDepthMeters,
partialCoverage,
useForegroundMasks,
numThreads);
}
}
void pingPongRectangle(
cv::Mat_<float>& dispRes,
cv::Mat_<float>& costsRes,
cv::Mat_<float>& confidencesRes,
const cv::Mat_<bool>& changed,
const cv::Mat_<cv::Vec3b>& labImage,
const PyramidLevel<PixelType>& pyramidLevel,
const int dstIdx,
const int xBegin,
const int yBegin,
const int xEnd,
const int yEnd) {
const cv::Mat_<float>& disp = pyramidLevel.dstDisparity(dstIdx);
const cv::Mat_<float>& confidences = pyramidLevel.dstConfidence(dstIdx);
const cv::Mat_<bool>& maskFov = pyramidLevel.dstFovMask(dstIdx);
const cv::Mat_<float>& dispBackground = pyramidLevel.dstBackgroundDisparity(dstIdx);
const cv::Mat_<float>& variance = pyramidLevel.dstVariance(dstIdx);
for (int y = yBegin; y < yEnd; ++y) {
for (int x = xBegin; x < xEnd; ++x) {
if (!maskFov(y, x)) {
// Keep value from previous frame
continue;
}
// Use background value if we're outside the foreground mask
if (!pyramidLevel.dstForegroundMask(dstIdx)(y, x)) {
dispRes(y, x) = dispBackground(y, x);
continue;
}
// Ignore locations with low variance
if (variance(y, x) < pyramidLevel.varNoiseFloor) {
continue;
}
float bestCost = INFINITY;
float bestDisparity = disp(y, x);
float bestConfidence = confidences(y, x);
std::vector<std::array<int, 2>> candidateNeighborOffsets;
if (kDoColorPruning) {
const std::array<int, 2>& startPoint = {{x, y}};
candidateNeighborOffsets = prunePingPongCandidates(
candidateTemplateOriginal, labImage, startPoint, kColorPruningNumNeighbors);
} else {
candidateNeighborOffsets = candidateTemplateOriginal;
}
const float backgroundDisparity =
pyramidLevel.hasForegroundMasks ? pyramidLevel.dstBackgroundDisparity(dstIdx)(y, x) : 0;
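// PatchMatch-style propagation: adopt a neighbor's disparity whenever it
// explains this pixel better than the current estimate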
for (const auto& candidateNeighborOffset : candidateNeighborOffsets) {
const int xx = math_util::clamp(x + candidateNeighborOffset[0], 0, disp.cols - 1);
const int yy = math_util::clamp(y + candidateNeighborOffset[1], 0, disp.rows - 1);
if (maskFov(yy, xx)) { // inside FOV
const float d = disp(yy, xx);
// When using background disparity, foreground pixels must be closer than background
if (d >= backgroundDisparity && changed(yy, xx)) {
const auto costValues = computeCost(pyramidLevel, dstIdx, d, x, y);
const float cost = std::get<0>(costValues);
if (cost < bestCost) {
bestCost = cost;
bestDisparity = d;
bestConfidence = std::get<1>(costValues);
}
}
}
}
dispRes(y, x) = bestDisparity;
costsRes(y, x) = bestCost;
confidencesRes(y, x) = bestConfidence;
}
}
}
void pingPong(PyramidLevel<PixelType>& pyramidLevel, const int iterations, const int numThreads) {
for (int dstIdx = 0; dstIdx < int(pyramidLevel.rigDst.size()); ++dstIdx) {
cv::Mat_<float>& disp = pyramidLevel.dstDisparity(dstIdx);
cv::Mat_<float>& costs = pyramidLevel.dstCost(dstIdx);
cv::Mat_<float> dispRes = disp.clone();
cv::Mat_<float> costsRes(disp.size(), INFINITY);
cv::Mat_<float> confidencesRes(disp.size(), 0);
cv::Mat_<bool> changed(disp.size(), true);
cv::Mat_<cv::Vec3b> labImage;
if (kDoColorPruning) {
const cv::Mat_<PixelType>& color = pyramidLevel.dstColor(dstIdx);
cv::Mat_<cv::Vec4b> imageScaled;
cv::Mat_<cv::Vec3b> bgrImage;
color.convertTo(imageScaled, CV_8UC4, 255);
cv::cvtColor(imageScaled, bgrImage, cv::COLOR_BGRA2BGR);
cv::cvtColor(bgrImage, labImage, cv::COLOR_BGR2Lab);
}
for (int it = 1; it <= iterations; ++it) {
LOG(INFO) << folly::sformat(
"-- ping pong: iter {}/{}, {}", it, iterations, pyramidLevel.rigDst[dstIdx].id);
const int radius = kSearchWindowRadius;
ThreadPool threadPool(numThreads);
const int edgeX = dispRes.cols;
const int edgeY = 1;
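// With edgeX spanning the full width and edgeY = 1, each task covers one row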
for (int y = radius; y < dispRes.rows - radius; y += edgeY) {
for (int x = radius; x < dispRes.cols - radius; x += edgeX) {
threadPool.spawn(
&pingPongRectangle,
std::ref(dispRes),
std::ref(costsRes),
std::ref(confidencesRes),
std::cref(changed),
std::cref(labImage),
std::cref(pyramidLevel),
dstIdx,
x,
y,
std::min(x + edgeX, dispRes.cols - radius),
std::min(y + edgeY, dispRes.rows - radius));
}
}
threadPool.join();
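// Pixels whose disparity changed this iteration seed propagation next round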
changed = disp != dispRes;
dispRes.copyTo(disp);
costsRes.copyTo(costs);
const cv::Mat_<bool>& fovMask = pyramidLevel.dstFovMask(dstIdx);
const int countFov = cv::countNonZero(fovMask);
const int count = cv::countNonZero(changed);
const float changedPct = 100.0f * count / countFov;
LOG(INFO) << folly::sformat("changed: {:.2f}%", changedPct);
}
}
}
void pingPongPropagation(
PyramidLevel<PixelType>& pyramidLevel,
const int iterations,
const int numThreads,
const filesystem::path& debugDir) {
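// The coarsest level keeps its brute-force result; only finer levels propagate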
if (pyramidLevel.level == pyramidLevel.numLevels - 1) {
return;
}
pingPong(pyramidLevel, iterations, numThreads);
plotMatches(pyramidLevel, "ping_pong", debugDir);
}
void getSrcMismatches(
std::vector<float>& dispMatches,
std::vector<float>& dispMismatches,
const PyramidLevel<PixelType>& pyramidLevel,
const int dstIdx,
const int x,
const int y) {
CHECK_EQ(pyramidLevel.rigDst.size(), pyramidLevel.rigSrc.size());
// Do not mark as mismatch if (x, y) is outside foreground mask
if (!pyramidLevel.dstForegroundMask(dstIdx)(y, x)) {
return;
}
// Compute world point
const cv::Mat_<float>& dstDisp = pyramidLevel.dstDisparity(dstIdx);
const Camera& camDst = pyramidLevel.rigDst[dstIdx];
const auto ptWorld = dstToWorldPoint(camDst, x, y, dstDisp(y, x), dstDisp.cols, dstDisp.rows);
for (int srcIdx = 0; srcIdx < int(pyramidLevel.rigSrc.size()); ++srcIdx) {
if (srcIdx == pyramidLevel.dst2srcIdxs[dstIdx]) { // ignore itself
continue;
}
const Camera& camSrc = pyramidLevel.rigSrc[srcIdx];
const cv::Size sizeSrc = pyramidLevel.srcVariance(srcIdx).size();
const int srcW = sizeSrc.width;
const int srcH = sizeSrc.height;
Camera::Vector2 ptSrc;
if (!worldToSrcPoint(ptSrc, ptWorld, camSrc, srcW, srcH)) {
continue;
}
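// dst and src rigs coincide here (sizes checked above), so srcIdx also
// indexes that camera's own disparity map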
const float dSrc =
cv_util::getPixelBilinear(pyramidLevel.dstDisparity(srcIdx), ptSrc.x(), ptSrc.y());
// Check if disparity in src is within 10% of dst
// NOTE: Technically we should be using distances from the rig origin, not from each camera
// origin, but the mismatch unlock is an approximation, and this approach is faster and less
// complex. This works well because the distance between the cameras is at least an order of
// magnitude smaller than any mismatches
static const float kFractionChange = 0.1f;
const float dDstMin = (1.0f - kFractionChange) * dstDisp(y, x);
const float dDstMax = (1.0f + kFractionChange) * dstDisp(y, x);
if (dDstMin <= dSrc && dSrc <= dDstMax) {
dispMatches.push_back(dSrc);
} else {
dispMismatches.push_back(dSrc);
}
}
}
void updateDstDisparityAndMismatchMask(
bool& dstDispMask,
float& dstDispNew,
const float dispCurr,
const std::vector<float>& dispMatches,
std::vector<float>& dispMismatches,
const float dstVar,
const float varThreshLow,
const float varThreshHigh) {
if (dispMatches.size() + dispMismatches.size() == 0) {
// We could reach this point if we are outside the foreground mask, or if the
// current camera (x, y) falls outside the FOV of the rest of the cameras.
// We want both the new disparity and the masked disparity to have the same
// value
dstDispMask = false;
dstDispNew = dispCurr;
return;
}
// Number of src cameras that have to agree with dst camera for the current
// disparity to be considered good and not change it
static const int kNumMinSrcCams = kMinOverlappingCams - 1;
// Do not modify locations where
// 1) We have a good disparity (i.e. other src cameras see the same)
// 2) Variance is high (i.e. on top of an edge; we don't want to touch those)
// 3) Variance is too low (i.e. noise)
// Also keep the current value if there are no mismatches to propose from
if (dispMismatches.empty() || int(dispMatches.size()) >= kNumMinSrcCams ||
varThreshHigh < dstVar || dstVar < varThreshLow) {
dstDispMask = false;
dstDispNew = dispCurr;
} else {
dstDispMask = true;
// Pick median of the farther disparities
std::sort(dispMismatches.begin(), dispMismatches.end()); // idx 0 = farthest
int closer;
for (closer = 0; closer < int(dispMismatches.size()); ++closer) {
if (dispMismatches[closer] >= dispCurr) { // stop when closer
break;
}
}
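// closer now counts the mismatches farther away than the current disparity;
// take their median so a single outlier cannot drag the proposal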
const int median = closer / 2;
// Don't pick median if current disparity is farther
dstDispNew = std::min(dispCurr, dispMismatches[median]);
}
}
// A mismatch happens when (x0, y0, d0) in dst maps to srci (xi, yi), but
// di != d0 for at least a certain number of src cameras
//
// Visually:
//
// + good disparity
// - bad disparities seen by dst
//
// (0) disparity seen by dst at (x, y) = (x, y, d)
// (1) disparity seen by src1 in the direction of (x, y, d)
// (2) disparity seen by src2 in the direction of (x, y, d)
//
// +++++++++++++++ (2)
// |
// | +++++++++ (1)
// | /
// --- (0)
// /||
// / | |
// / | |
// src1 dst src2
//
// Note that (0), (1) and (2) are in separate disparity maps, so (1) and (2) can
// see past (0) in the marked direction.
// Picking (2) unlocks the set of bad disparities so that we have a better
// proposal in the next round
//
// i.e. use overlapping src cameras to look past (0) for proposals
//
// Note that the closer we are to the cameras, the farther apart (1) and (2)
// can potentially be; it depends on how far the objects behind (0) are
void handleDisparityMismatch(
std::unordered_map<std::string, cv::Mat_<float>>& mapDstDisp,
PyramidLevel<PixelType>& pyramidLevel,
const int dstIdx) {
CHECK_EQ(pyramidLevel.rigDst.size(), pyramidLevel.rigSrc.size())
<< "Mismatches only valid when considering all cameras";
const std::string& dstId = pyramidLevel.rigDst[dstIdx].id;
const cv::Mat_<float>& dstDisp = pyramidLevel.dstDisparity(dstIdx);
cv::Mat_<bool>& dstMask = pyramidLevel.dstMismatchedDisparityMask(dstIdx);
cv::Mat_<float> dstDispNew(dstDisp.size(), NAN);
// For every (x, y, d) in current dst, find (x', y', d') in all src cameras
// and check if d' is similar to d
const cv::Mat_<float>& dstVar = pyramidLevel.dstVariance(dstIdx);
for (int y = 0; y < dstDisp.rows; ++y) {
for (int x = 0; x < dstDisp.cols; ++x) {
if (!pyramidLevel.dstFovMask(dstIdx)(y, x)) { // outside FOV
continue;
}
std::vector<float> dispMatches;
std::vector<float> dispMismatches;
getSrcMismatches(dispMatches, dispMismatches, pyramidLevel, dstIdx, x, y);
updateDstDisparityAndMismatchMask(
dstMask(y, x),
dstDispNew(y, x),
dstDisp(y, x),
dispMatches,
dispMismatches,
dstVar(y, x),
pyramidLevel.varNoiseFloor,
pyramidLevel.varHighThresh);
}
}
mapDstDisp[dstId] = dstDispNew;
}
void handleDisparityMismatches(
PyramidLevel<PixelType>& pyramidLevel,
const int startLevel,
const int numThreads) {
if (pyramidLevel.level > startLevel || pyramidLevel.level == pyramidLevel.numLevels - 1) {
return;
}
LOG(INFO) << "Handling source mismatches...";
CHECK_EQ(pyramidLevel.rigDst.size(), pyramidLevel.rigSrc.size())
<< "Mismatches only valid when considering all cameras";
ThreadPool threadPool(numThreads);
std::unordered_map<std::string, cv::Mat_<float>> mapDstDisp;
for (const Camera& camDst : pyramidLevel.rigDst) {
// Preallocate entries so worker threads never insert into the map concurrently
mapDstDisp[camDst.id] = cv::Mat_<float>();
}
for (int dstIdx = 0; dstIdx < int(pyramidLevel.rigDst.size()); ++dstIdx) {
threadPool.spawn(
&handleDisparityMismatch, std::ref(mapDstDisp), std::ref(pyramidLevel), dstIdx);
}
threadPool.join();
for (int dstIdx = 0; dstIdx < int(pyramidLevel.rigDst.size()); ++dstIdx) {
mapDstDisp.at(pyramidLevel.rigDst[dstIdx].id).copyTo(pyramidLevel.dstDisparity(dstIdx));
}
}
void randomProposal(
PyramidLevel<PixelType>& pyramidLevel,
const int dstIdx,
const int y,
const int numProposals,
const float minDepthMeters,
const float maxDepthMeters) {
std::default_random_engine engine;
// Deterministic per-row seed; mixing in numLevels keeps rows distinct at
// level 0, where y * level would collapse every row to the same seed
engine.seed(y * pyramidLevel.numLevels + pyramidLevel.level);
cv::Mat_<float>& dstDisparity = pyramidLevel.dstDisparity(dstIdx);
cv::Mat_<float>& dstCosts = pyramidLevel.dstCost(dstIdx);
cv::Mat_<float>& dstConfidence = pyramidLevel.dstConfidence(dstIdx);
const cv::Mat_<float>& variance = pyramidLevel.dstVariance(dstIdx);
for (int x = kSearchWindowRadius; x < dstDisparity.cols - kSearchWindowRadius; ++x) {
if (!pyramidLevel.dstFovMask(dstIdx)(y, x)) { // outside FOV
// Keep value from previous frame
continue;
}
float currDisp = dstDisparity(y, x); // upscaled disparity
// Use background value if we're outside the foreground mask
if (!pyramidLevel.dstForegroundMask(dstIdx)(y, x)) {
dstDisparity(y, x) = pyramidLevel.dstBackgroundDisparity(dstIdx)(y, x);
continue;
}
// Ignore locations with low variance
// Threshold is a little lower than our high variance threshold
// High variance locations include:
// - textured objects: easier to match
// - new objects: not present at coarser levels
// Locations with lower (but still nontrivial) variance include:
// - weaker edges: as they appear or disappear
// - pixels around edges: can see what's behind
// We can go pretty low, as long as we ignore smooth and noisy areas
const float varHighDev = kRandomPropHighVarDeviation * pyramidLevel.varHighThresh;
const float varHighThresh = std::max(varHighDev, pyramidLevel.varNoiseFloor);
if (variance(y, x) < varHighThresh) {
continue;
}
float currCost;
float currConfidence;
std::tie(currCost, currConfidence) = computeCost(pyramidLevel, dstIdx, currDisp, x, y);
// We will refine only if we're getting much better cost
const float costThresh = std::fmin(0.5f * currCost, kRandomPropMaxCost);
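// i.e. accept a proposal only if it at least halves the current cost and
// stays below the absolute cap kRandomPropMaxCost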
// When using background, foreground pixels must be closer than background
const float minDisp = pyramidLevel.hasForegroundMasks
? pyramidLevel.dstBackgroundDisparity(dstIdx)(y, x)
: (1.0f / maxDepthMeters);
const float maxDisp = 1.0f / minDepthMeters;
float amplitude = (maxDisp - minDisp) / 2.0f;
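// The search window starts at half the valid range on each side and halves
// whenever a proposal is accepted, concentrating later samples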
for (int i = 0; i < numProposals; ++i) {
float propDisp = std::uniform_real_distribution<float>(
std::max(float(minDisp), currDisp - amplitude),
std::min(float(maxDisp), currDisp + amplitude))(engine);
float propCost;
float propConfidence;
std::tie(propCost, propConfidence) = computeCost(pyramidLevel, dstIdx, propDisp, x, y);
if (propCost < currCost && propCost < costThresh) {
currCost = propCost;
currDisp = propDisp;
currConfidence = propConfidence;
amplitude /= 2.0f;
}
}
dstDisparity(y, x) = currDisp;
dstCosts(y, x) = currCost;
dstConfidence(y, x) = currConfidence;
}
}
void preprocessLevel(
PyramidLevel<PixelType>& pyramidLevel,
const float minDepthMeters,
const float maxDepthMeters,
const bool partialCoverage,
const bool useForegroundMasks,
const int numThreads) {
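// Only the coarsest pyramid level needs brute-force initialization; finer
// levels start from the upscaled disparities of the previous level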
if (pyramidLevel.level == pyramidLevel.numLevels - 1) {
computeBruteForceDisparities(
pyramidLevel,
minDepthMeters,
maxDepthMeters,
partialCoverage,
useForegroundMasks,
numThreads);
}
}
void randomProposals(
PyramidLevel<PixelType>& pyramidLevel,
const int numProposals,
const float minDepthMeters,
const float maxDepthMeters,
const int numThreads,
const filesystem::path& debugDir) {
if (numProposals <= 0 || pyramidLevel.level == pyramidLevel.numLevels - 1) {
return;
}
for (int dstIdx = 0; dstIdx < int(pyramidLevel.rigDst.size()); ++dstIdx) {
LOG(INFO) << folly::sformat("-- random proposals: {}", pyramidLevel.rigDst[dstIdx].id);
ThreadPool threadPool(numThreads);
const cv::Size size = pyramidLevel.dstDisparity(dstIdx).size();
for (int y = kSearchWindowRadius; y < size.height - kSearchWindowRadius; ++y) {
threadPool.spawn(
&randomProposal,
std::ref(pyramidLevel),
dstIdx,
y,
numProposals,
minDepthMeters,
maxDepthMeters);
}
threadPool.join();
}
plotMatches(pyramidLevel, "random_prop", debugDir);
}
void bilateralFilter(PyramidLevel<PixelType>& pyramidLevel, const int numThreads) {
const float scale = std::pow(kLevelScale, pyramidLevel.level);
const int spaceRadius =
std::max(std::ceil(kBilateralSpaceRadiusMax * scale), float(kBilateralSpaceRadiusMin));
for (int dstIdx = 0; dstIdx < int(pyramidLevel.rigDst.size()); ++dstIdx) {
cv::Mat_<float>& disparity = pyramidLevel.dstDisparity(dstIdx);
const cv::Mat_<PixelType>& color = pyramidLevel.dstColor(dstIdx);
const cv::Mat_<bool>& maskFov = pyramidLevel.dstFovMask(dstIdx);
const cv::Mat_<bool>& maskFg = pyramidLevel.dstForegroundMask(dstIdx);
const cv::Mat_<bool> mask = maskFov & maskFg;
const cv::Mat_<float> disparityFiltered =
depth_estimation::generalizedJointBilateralFilter<float, PixelType>(
disparity,
color,
color,
mask,
spaceRadius,
kBilateralSigma,
kBilateralWeightB,
kBilateralWeightG,
kBilateralWeightR,
numThreads);
// Only use filtered version on foreground pixels
disparityFiltered.copyTo(disparity, pyramidLevel.dstForegroundMask(dstIdx));
}
}
void medianFilter(PyramidLevel<PixelType>& pyramidLevel, const int numThreads) {
ThreadPool threadPool(numThreads);
for (int dstIdx = 0; dstIdx < int(pyramidLevel.rigDst.size()); ++dstIdx) {
threadPool.spawn([&, dstIdx] {
cv::Mat_<float>& disparity = pyramidLevel.dstDisparity(dstIdx);
const cv::Mat_<float>& bgDisparity = pyramidLevel.dstBackgroundDisparity(dstIdx);
const cv::Mat_<bool>& maskFov = pyramidLevel.dstFovMask(dstIdx);
const cv::Mat_<bool>& maskFg = pyramidLevel.dstForegroundMask(dstIdx);
const cv::Mat_<bool> mask = maskFov & maskFg;
cv::Mat_<float> disparityFiltered =
cv_util::maskedMedianBlur(disparity, bgDisparity, mask, kMedianFilterRadius);
disparityFiltered.copyTo(disparity);
});
}
threadPool.join();
}
void saveResults(
PyramidLevel<PixelType>& pyramidLevel,
const bool saveDebugImages,
const std::string& outputFormatsIn) {
if (saveDebugImages) {
LOG(INFO) << folly::sformat("Saving debug images for pyramid level {}...", pyramidLevel.level);
pyramidLevel.saveDebugImages();
}
// Outputs are always saved at the finest level; force PFM if no format was chosen
std::string outputFormats = outputFormatsIn;
if (outputFormats.empty()) {
LOG(WARNING) << "No explicit output formats specified. Forcing PFM...";
outputFormats = "pfm";
}
pyramidLevel.saveResults(outputFormats);
}
void maskFov(PyramidLevel<PixelType>& pyramidLevel, const int numThreads) {
ThreadPool threadPool(numThreads);
for (int dstIdx = 0; dstIdx < int(pyramidLevel.rigDst.size()); ++dstIdx) {
threadPool.spawn([&, dstIdx] {
cv::Mat_<float>& disparity = pyramidLevel.dstDisparity(dstIdx);
const cv::Mat_<bool>& maskFov = pyramidLevel.dstFovMask(dstIdx);
const cv::Mat_<float> nanMat(disparity.size(), NAN);
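// Write NaN into every pixel that falls outside the FOV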
nanMat.copyTo(disparity, 1 - maskFov);
});
}
threadPool.join();
}
// Reproject each src camera into each dst camera assuming a depth of infinity
// At infinity src and dst will align up to translation
void precomputeProjections(PyramidLevel<PixelType>& pyramidLevel, const int threads) {
LOG(INFO) << "Pre-computing projections...";
ThreadPool threadPool(threads);
for (int dstIdx = 0; dstIdx < int(pyramidLevel.rigDst.size()); ++dstIdx) {
// Project to current level dst size
const cv::Size& dstSize = pyramidLevel.dstColor(dstIdx).size();
Camera camDst = pyramidLevel.rigDst[dstIdx].rescale({dstSize.width, dstSize.height});
// Project every src to current dst
for (int srcIdx = 0; srcIdx < int(pyramidLevel.rigSrc.size()); ++srcIdx) {
threadPool.spawn([&, srcIdx] {
// Project from current level src size
const cv::Size& srcSize = pyramidLevel.srcs[srcIdx].color.size();
Camera camSrc = pyramidLevel.rigSrc[srcIdx].rescale({srcSize.width, srcSize.height});
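// dstProjWarp maps src pixel coords into dst at infinity; dstProjWarpInv
// maps dst coords into src (see computeCost and reprojectColors)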
pyramidLevel.dstProjWarp(dstIdx, srcIdx) = computeWarpDstToSrc(camSrc, camDst);
pyramidLevel.dstProjWarpInv(dstIdx, srcIdx) = computeWarpDstToSrc(camDst, camSrc);
});
}
threadPool.join();
}
}
void reprojectColors(PyramidLevel<PixelType>& pyramidLevel, const int numThreads) {
LOG(INFO) << "Reprojecting colors...";
ThreadPool threadPool(numThreads);
for (int dstIdx = 0; dstIdx < int(pyramidLevel.rigDst.size()); ++dstIdx) {
// Project every src to current dst
for (int srcIdx = 0; srcIdx < int(pyramidLevel.rigSrc.size()); ++srcIdx) {
threadPool.spawn([&, srcIdx] {
// Reproject src color into the dst frame at the current level size
const cv::Mat_<PixelType>& srcColor = pyramidLevel.srcColor(srcIdx);
cv::Mat_<PixelType>& srcProjColor = pyramidLevel.dstProjColor(dstIdx, srcIdx);
if (srcIdx == pyramidLevel.dst2srcIdxs[dstIdx]) {
// No projection needed if src = dst
srcProjColor = srcColor;
} else {
const cv::Mat_<cv::Vec2f>& warpDstToSrc = pyramidLevel.dstProjWarpInv(dstIdx, srcIdx);
srcProjColor = project(srcColor, warpDstToSrc);
}
// Color bias is just the average over a given area around each pixel
pyramidLevel.dstProjColorBias(dstIdx, srcIdx) =
colorBias(srcProjColor, kSearchWindowRadius);
});
}
threadPool.join();
}
}
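// Per-level pipeline: reproject colors, initialize by brute force (coarsest
// level only), refine with random proposals and ping-pong propagation, unlock
// mismatched disparities, optionally filter, mask the FOV, and save results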
void processLevel(
PyramidLevel<PixelType>& pyramidLevel,
const std::string& outputFormats,
const bool useForegroundMasks,
const std::string& outputRoot,
const int numRandomProposals,
const bool partialCoverage,
const float minDepthM,
const float maxDepthM,
const bool doMedianFilter,
const bool saveDebugImages,
const int pingPongIterations,
const int mismatchesStartLevel,
const bool doBilateralFilter,
const int threads) {
LOG(INFO) << folly::sformat("Processing {} level {}", pyramidLevel.frameName, pyramidLevel.level);
reprojectColors(pyramidLevel, threads);
preprocessLevel(pyramidLevel, minDepthM, maxDepthM, partialCoverage, useForegroundMasks, threads);
randomProposals(pyramidLevel, numRandomProposals, minDepthM, maxDepthM, threads, outputRoot);
pingPongPropagation(pyramidLevel, pingPongIterations, threads, outputRoot);
handleDisparityMismatches(pyramidLevel, mismatchesStartLevel, threads);
if (doBilateralFilter) {
bilateralFilter(pyramidLevel, threads);
}
if (doMedianFilter) {
medianFilter(pyramidLevel, threads);
}
maskFov(pyramidLevel, threads);
saveResults(pyramidLevel, saveDebugImages, outputFormats);
}
} // namespace depth_estimation
} // namespace fb360_dep