loss/consistency_loss.py [184:237]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    def __call__(
        self,
        depths,
        metadata,
    ):
        """Compute total loss.

        The network predicts a set of depth results. The number of samples, N, is
        not the batch size; it is determined by the losses in use.
        For instance, geometry_consistency_loss requires pairs as samples, so
            N = 2.
        With additional losses, say a triplet from temporal_consistency_loss,
            N = 2 + 3.

        Args:
            depths (B, N, H, W):   predicted_depths
            metadata: dictionary of metadata needed to compute the loss. It is
                assumed to contain at least the entries below; individual losses
                may require more.
                {
                    'extrinsics': torch.tensor (B, N, 3, 4), # extrinsics of each frame.
                                    Each (3, 4) = [R, t]
                    'intrinsics': torch.tensor (B, N, 4),
                                  # (fx, fy, cx, cy) for each frame in pixels
                }

        Returns:
            loss: Python scalar. Also sets self.total_loss.
        """

        def squeeze(x):
            return x.reshape((-1,) + x.shape[2:])

        def unsqueeze(x, N):
            return x.reshape((-1, N) + x.shape[1:])

        depths = depths.unsqueeze(-3)  # (B, N, 1, H, W)
        intrinsics = metadata["intrinsics"]

        # Pixel coordinates
        B, N, C, H, W = depths.shape
        pixels = pixel_grid(B * N, (H, W))  # (B*N, 2, H, W)

        if self.opt.recon != "colmap":
            # Warp map from spatial transformation
            warp_map = metadata["warp"].view(B * N, 2, H, W)  # (B*N, 2, H, W)
            warp_map[:, 0, :, :] = warp_map[:, 0, :, :] * (W / 2)
            warp_map[:, 1, :, :] = warp_map[:, 1, :, :] * (H / 2)

            # Apply spatial transformation
            pixels = pixels + warp_map

        points_cam = pixels_to_points(squeeze(intrinsics), squeeze(depths), pixels)
        pixels = unsqueeze(pixels, N)
        points_cam = unsqueeze(points_cam, N)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
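
For reference, here is a minimal sketch of what the two geometry helpers used above might do. pixel_grid and pixels_to_points live elsewhere in the repo; the _sketch names below are hypothetical stand-ins whose shapes merely mirror the call sites (a standard pinhole back-projection is assumed), not the actual implementations.

import torch

def pixel_grid_sketch(batch_size, shape):
    """Return a (batch_size, 2, H, W) grid of (x, y) pixel coordinates."""
    H, W = shape
    ys, xs = torch.meshgrid(
        torch.arange(H, dtype=torch.float32),
        torch.arange(W, dtype=torch.float32),
        indexing="ij",
    )
    grid = torch.stack((xs, ys), dim=0)  # (2, H, W); channel 0 = x, channel 1 = y
    return grid.unsqueeze(0).expand(batch_size, -1, -1, -1)

def pixels_to_points_sketch(intrinsics, depths, pixels):
    """Back-project pixel coordinates to camera-space points.

    intrinsics: (B*N, 4) as (fx, fy, cx, cy) in pixels
    depths:     (B*N, 1, H, W)
    pixels:     (B*N, 2, H, W)
    Returns:    (B*N, 3, H, W) points in the camera frame
    """
    fx, fy, cx, cy = (intrinsics[:, i].reshape(-1, 1, 1) for i in range(4))
    z = depths[:, 0]                   # (B*N, H, W)
    x = (pixels[:, 0] - cx) / fx * z   # X = (u - cx) * Z / fx
    y = (pixels[:, 1] - cy) / fy * z   # Y = (v - cy) * Z / fy
    return torch.stack((x, y, z), dim=1)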



loss/scene_flow_loss.py [358:411]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    def __call__(
        self,
        depths,
        metadata,
    ):
        """Compute total loss.

        The network predicts a set of depth results. The number of samples, N, is
        not the batch size; it is determined by the losses in use.
        The static scene flow loss requires pairs as samples, so N = 2.
        Adding the temporal smoothness loss requires two triplets, (R-1, R, R+1)
        and (T-1, T, T+1), so N = 6.

        Args:
            depths (B, N, H, W):   predicted_depths
            metadata: dictionary of metadata needed to compute the loss. It is
                assumed to contain at least the entries below; individual losses
                may require more.
                {
                    'extrinsics': torch.tensor (B, N, 3, 4), # extrinsics of each frame.
                                    Each (3, 4) = [R, t]
                    'intrinsics': torch.tensor (B, N, 4),
                                  # (fx, fy, cx, cy) for each frame in pixels
                }

        Returns:
            loss: Python scalar. Also sets self.total_loss.
        """

        def squeeze(x):
            return x.reshape((-1,) + x.shape[2:])

        def unsqueeze(x, N):
            return x.reshape((-1, N) + x.shape[1:])

        depths = depths.unsqueeze(-3)  # (B, N, 1, H, W)
        intrinsics = metadata["intrinsics"]

        B, N, C, H, W = depths.shape

        # Pixel coordinates
        pixels = pixel_grid(B * N, (H, W))  # (B*N, 2, H, W)

        if self.opt.recon != "colmap":
            # Warp map from spatial transformation
            warp_map = metadata["warp"].view(B * N, 2, H, W)  # (B*N, 2, H, W)
            warp_map[:, 0, :, :] = warp_map[:, 0, :, :] * (W / 2)
            warp_map[:, 1, :, :] = warp_map[:, 1, :, :] * (H / 2)

            # Apply spatial transformation
            pixels = pixels + warp_map

        points_cam = pixels_to_points(squeeze(intrinsics), squeeze(depths), pixels)
        pixels = unsqueeze(pixels, N)
        points_cam = unsqueeze(points_cam, N)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
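
To make the shape handling shared by both snippets concrete, here is a standalone walk-through with dummy tensors. It reuses the hypothetical *_sketch helpers from the sketch above; the values are arbitrary and only the shapes follow the docstrings: flatten (B, N) into the batch dimension, scale the normalized warp offsets to pixels, offset the pixel grid, back-project, then restore the (B, N) split.

import torch

B, N, H, W = 2, 2, 32, 48                      # e.g. a pair-based loss, so N = 2
depths = torch.rand(B, N, H, W) + 0.5          # (B, N, H, W) predicted depths
intrinsics = torch.tensor([100.0, 100.0, W / 2, H / 2]).repeat(B * N, 1)  # (B*N, 4)
warp = torch.zeros(B, N, 2, H, W)              # warp offsets in normalized units

depths = depths.unsqueeze(-3)                  # (B, N, 1, H, W)
pixels = pixel_grid_sketch(B * N, (H, W))      # (B*N, 2, H, W)

warp_map = warp.view(B * N, 2, H, W).clone()
warp_map[:, 0] *= W / 2                        # x offsets: normalized -> pixels
warp_map[:, 1] *= H / 2                        # y offsets: normalized -> pixels
pixels = pixels + warp_map                     # spatially transformed pixel grid

points_cam = pixels_to_points_sketch(
    intrinsics, depths.reshape(-1, 1, H, W), pixels
)                                              # (B*N, 3, H, W)
points_cam = points_cam.reshape(B, N, 3, H, W) # restore the per-sample grouping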



