std::tuple<float, float> computeCost()

in source/depth_estimation/Derp.cpp [103:225]


// Evaluates the photometric matching cost of hypothesizing `disparity` at
// pixel (x, y) of the destination camera `dstIdx`, by comparing a patch in
// dst against patches from every overlapping source camera reprojected into
// the dst frame.
//
// Returns (cost, confidence):
//   - cost: FLT_MAX when fewer than kMinOverlappingCams - 1 source cameras
//     see the hypothesized 3D point; otherwise the variance-normalized mean
//     of the best unbiased SSDs (lower is a better match).
//   - confidence: the dst patch variance clamped below by kMinVar.
std::tuple<float, float> computeCost(
    const PyramidLevel<PixelType>& pyramidLevel,
    const int dstIdx,
    const float disparity,
    const int x,
    const int y) {
  // For a given (x, y, depth) in dst, we find the corresponding (x, y) in src,
  // and then its reprojection into dst where src and dst are aligned up to
  // translation. There we can extract a square patch from src and projected dst
  // to compute the cost.
  // For a given dst, we do this for every src
  // Note that src and dst align up to translation when projected to infinity.
  // Also note that when looking out to the world from the center of dst the
  // color does not change with depth, so dst does not need to be transformed
  //
  // (1) pDst -> (2) pWorld -> (3) pSrc -> (4) pInf -> (5) pDstSrc
  //
  //             (4)
  //        ...       ...
  //         |          |
  //         |           |
  //         |            |
  //         |             |
  //         |              |
  //         |               |
  //         |       (2)      |
  //         |       /  |_     |
  //   ______|______/_    |_ ___|___________
  //  |      |     / |      ||__ |          |
  //  |      |   (1) |       |  ||          |
  //  |     (5)      |       |  (3)         |
  //  |              |       |              |
  //  |              |       |              |
  //  |______________|       |______________|
  //        dst                    src

  // (1) pDst = (x, y)
  // (2) get pWorld: back-project (x, y) at the hypothesized disparity
  const cv::Mat_<PixelType>& dstColor = pyramidLevel.dstProjColor(dstIdx);
  const Camera& camDst = pyramidLevel.rigDst[dstIdx];
  const Camera::Vector3 pWorld =
      dstToWorldPoint(camDst, x, y, disparity, dstColor.cols, dstColor.rows);

  // Compute SSD between dst and projected src for each src.
  // Each entry is (biased SSD, unbiased SSD): the biased value is used for
  // outlier rejection below, the unbiased value is what gets accumulated.
  using SSDPair = std::pair<float, float>;
  STACK_ARRAY(SSDPair, SSDs, pyramidLevel.rigSrc.size());
  int ssdCount = 0;
  const cv::Mat_<PixelType>& dstColorBias = pyramidLevel.dstProjColorBias(dstIdx);
  for (ssize_t srcIdx = 0; srcIdx < ssize(pyramidLevel.rigSrc); ++srcIdx) {
    // No SSD if src = dst
    if (srcIdx == pyramidLevel.dst2srcIdxs[dstIdx]) {
      continue;
    }

    // (3) get pSrc; skip this src if the world point projects outside its view
    const Camera& camSrc = pyramidLevel.rigSrc[srcIdx];
    const cv::Size& srcSize = pyramidLevel.srcColor(srcIdx).size();
    Camera::Vector2 pSrc;
    if (!worldToSrcPoint(pSrc, pWorld, camSrc, srcSize.width, srcSize.height)) {
      continue;
    }

    // Exclude a half-texel band to simulate proper clamp-to-border semantics
    // (disabled; kept as a compile-time toggle for experimentation)
    constexpr bool kExcludeHalfTexel = false;
    if (kExcludeHalfTexel) {
      if (pSrc.x() < 0.5 || srcSize.width - 0.5 < pSrc.x() || pSrc.y() < 0.5 ||
          srcSize.height - 0.5 < pSrc.y()) {
        continue;
      }
    }

    // (3) -> (4) -> (5) mapping from pre-computed projection warp
    const cv::Mat_<cv::Vec2f>& dstProjWarp = pyramidLevel.dstProjWarp(dstIdx, srcIdx);
    const cv::Vec2f pDstSrc = cv_util::getPixelBilinear(dstProjWarp, pSrc.x(), pSrc.y());

    // Check if pDstSrc is within bounds (warp stores NaN for unmapped texels)
    const float xDstSrc = pDstSrc[0] + 0.5; // pDstSrc uses opencv coordinate convention
    const float yDstSrc = pDstSrc[1] + 0.5;
    if (std::isnan(xDstSrc) || std::isnan(yDstSrc)) {
      continue;
    }

    // NOTE: bias of src projected into dst (srcBias) is the average of its
    // patch values, which are bilinearly interpolated from floating point
    // projected coordinates (pDstSrc). This is mathematically different than
    // bilinearly interpolating the pre-computed projected bias around pDstSrc,
    // because we are grabbing biases from neighboring footprints, but it seems
    // to produce very similar results
    const cv::Mat_<PixelType>& dstSrcColorBias = pyramidLevel.dstProjColorBias(dstIdx, srcIdx);
    const PixelType dstSrcBias = cv_util::getPixelBilinear(dstSrcColorBias, xDstSrc, yDstSrc);
    const PixelType& dstBias = dstColorBias(y, x);

    const cv::Mat_<PixelType>& dstSrcColor = pyramidLevel.dstProjColor(dstIdx, srcIdx);
    const SSDPair ssd = computeSSD(
        dstColor, x, y, dstBias, dstSrcColor, xDstSrc, yDstSrc, dstSrcBias, kSearchWindowRadius);
    SSDs[ssdCount] = ssd;
    ++ssdCount;
  }

  // Require at least kMinOverlappingCams cameras (dst itself counts as one)
  int keep = kMinOverlappingCams - 1;
  if (ssdCount < keep) {
    return std::make_tuple(FLT_MAX, 0.0f); // not enough cameras see this disparity, skip
  }

  // Add up unbiased SSDs for all but the two patches with the worst biased
  // SSDs. std::pair's default lexicographic ordering means nth_element ranks
  // primarily by .first (the biased SSD), as intended.
  keep = std::max<int>(keep, ssdCount - 2);
  std::nth_element(SSDs, SSDs + keep, SSDs + ssdCount);
  float cost = 0;
  for (int i = 0; i < keep; ++i) {
    cost += SSDs[i].second;
  }
  // NOTE(review): assumes kMinOverlappingCams >= 2 so keep > 0 here — confirm
  cost /= keep;

  // Trust costs when more cameras are involved
  // This also means that we penalize closeup proposals (closer to camera rig
  // means fewer cameras see that point)
  const float trustCoef = 1.0f / keep;

  // Normalize by the local dst texture variance (clamped by kMinVar) so that
  // flat regions do not dominate the cost
  const float dstVariance = pyramidLevel.dstVariance(dstIdx)(y, x);
  const float confidence = std::max(dstVariance, kMinVar);
  const float costFinal = cost * trustCoef / confidence;
  return std::make_tuple(costFinal, confidence);
}