void VulkanReplay::GetTextureData()

in renderdoc/driver/vulkan/vk_replay.cpp [3686:4630]


// Reads back one subresource of the image `tex` from the GPU into `data`.
//
//   tex    - ID of the image to read. It must be known to m_CreationInfo.m_Image and have memory
//            bound, otherwise the function returns early and `data` is left untouched.
//   sub    - requested mip / slice / sample. Each index is clamped to the image's valid range.
//   params - readback options. The fields consumed here are:
//              remap          - if not NoRemap, the texture is first rendered into a temporary
//                               RGBA8 / RGBA16F / RGBA32F 2D (array) image and that is read back.
//              resolve        - for multisampled colour images, resolve all samples instead of
//                               fetching a single sample. Ignored for multisampled depth/stencil.
//              typeCast, blackPoint, whitePoint - forwarded to the remap render.
//              standardLayout - re-pack 4444 / 5551 packed formats so alpha is in the high bits.
//   data   - output buffer, resized to the number of bytes read back and filled.
//
// Depending on the image and params one of four paths is taken:
//   1. remap:            render into a temporary colour image, then copy image -> buffer.
//   2. MSAA + resolve:   vkCmdResolveImage into a temporary image, then copy image -> buffer.
//   3. MSAA, no resolve: CopyTex2DMSToBuffer writes the readback buffer directly.
//   4. otherwise:        copy image -> buffer straight from the live image.
// Combined depth-stencil images read through path 4 are copied as two separate planes and then
// interleaved per-pixel on the CPU.
//
// NOTE(review): the early returns taken on failure (allocation failure, no command buffer, map
// failure) do not release temporaries created up to that point or end the open command buffer.
void VulkanReplay::GetTextureData(ResourceId tex, const Subresource &sub,
                                  const GetTextureDataParams &params, bytebuf &data)
{
  // wasms: the source image is multisampled
  // resolve: whether a multisampled image should be resolved rather than read per-sample
  // copyToBuffer: whether the final vkCmdCopyImageToBuffer step is still required
  bool wasms = false;
  bool resolve = params.resolve;
  bool copyToBuffer = true;

  if(m_pDriver->m_CreationInfo.m_Image.find(tex) == m_pDriver->m_CreationInfo.m_Image.end())
  {
    RDCERR("Trying to get texture data for unknown ID %s!", ToStr(tex).c_str());
    return;
  }

  const VulkanCreationInfo::Image &imInfo = m_pDriver->m_CreationInfo.m_Image[tex];

  // nothing can be read from an image that has no tracked state or no memory bound
  LockedConstImageStateRef lockedImage = m_pDriver->FindConstImageState(tex);
  if(!lockedImage || !lockedImage->isMemoryBound)
    return;
  // srcImageState follows whichever image we end up copying from: the live image to begin with,
  // or tmpImageState once a temporary image has been rendered/resolved into
  const ImageState *srcImageState = &*lockedImage;
  ImageState tmpImageState;

  VkMarkerRegion region(StringFormat::Fmt("GetTextureData(%u, %u, %u, remap=%d)", sub.mip,
                                          sub.slice, sub.sample, params.remap));

  // work on a clamped copy of the requested subresource so out-of-range indices are safe
  Subresource s = sub;

  s.slice = RDCMIN(uint32_t(imInfo.arrayLayers - 1), s.slice);
  s.sample = RDCMIN(uint32_t(imInfo.samples - 1), s.sample);
  s.mip = RDCMIN(uint32_t(imInfo.mipLevels - 1), s.mip);

  // creation info for a potential temporary image. It starts out mirroring the source image and
  // is adjusted by the paths below; its format/extent are also used to size the readback
  VkImageCreateInfo imCreateInfo = {
      VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      NULL,
      0,
      imInfo.type,
      imInfo.format,
      imInfo.extent,
      imInfo.mipLevels,
      imInfo.arrayLayers,
      imInfo.samples,
      VK_IMAGE_TILING_OPTIMAL,
      VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT,
      VK_SHARING_MODE_EXCLUSIVE,
      0,
      NULL,
      VK_IMAGE_LAYOUT_UNDEFINED,
  };

  VkImageAspectFlags imageAspects = FormatImageAspects(imInfo.format);
  bool isDepth = (imageAspects & VK_IMAGE_ASPECT_DEPTH_BIT) != 0;
  bool isStencil = (imageAspects & VK_IMAGE_ASPECT_STENCIL_BIT) != 0;
  bool isPlanar = (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) != 0;
  uint32_t planeCount = GetYUVPlaneCount(imInfo.format);

  VkImage liveWrappedImage = GetResourceManager()->GetCurrentHandle<VkImage>(tex);

  // srcImage is the unwrapped image the final copy reads from; tmpImage stays VK_NULL_HANDLE
  // unless the remap or resolve path creates a temporary
  VkImage srcImage = Unwrap(liveWrappedImage);
  VkImage tmpImage = VK_NULL_HANDLE;
  VkImage wrappedTmpImage = VK_NULL_HANDLE;
  VkDeviceMemory tmpMemory = VK_NULL_HANDLE;

  // temporaries only used by the remap path
  VkFramebuffer *tmpFB = NULL;
  VkImageView *tmpView = NULL;
  uint32_t numFBs = 0;
  VkRenderPass tmpRP = VK_NULL_HANDLE;
  VkRenderPass tmpRPStencil = VK_NULL_HANDLE;

  VkDevice dev = m_pDriver->GetDev();
  VkCommandBuffer cmd = m_pDriver->GetNextCmd();
  const VkDevDispatchTable *vt = ObjDisp(dev);

  if(cmd == VK_NULL_HANDLE)
    return;

  VkCommandBufferBeginInfo beginInfo = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, NULL,
                                        VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT};

  VkResult vkr = vt->BeginCommandBuffer(Unwrap(cmd), &beginInfo);
  CheckVkResult(vkr);

  // size in bytes of the data returned to the caller, and the host-readable buffer it lands in
  uint32_t dataSize = 0;
  VkBuffer readbackBuf = VK_NULL_HANDLE;
  VkDeviceMemory readbackMem = VK_NULL_HANDLE;

  if(imInfo.samples > 1)
  {
    // make image n-array instead of n-samples
    imCreateInfo.arrayLayers *= imCreateInfo.samples;
    imCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;

    wasms = true;
  }

  // multisampled depth/stencil is never resolved, a single sample is always fetched instead
  if(wasms && (isDepth || isStencil))
    resolve = false;

  if(params.remap != RemapTexture::NoRemap)
  {
    // path 1: render the texture into a temporary colour image of the remap format
    int renderFlags = 0;

    // force readback texture to RGBA8 unorm
    if(params.remap == RemapTexture::RGBA8)
    {
      if(IsSRGBFormat(imCreateInfo.format))
      {
        imCreateInfo.format = VK_FORMAT_R8G8B8A8_SRGB;
        renderFlags |= eTexDisplay_RemapSRGB;
      }
      else
      {
        imCreateInfo.format = VK_FORMAT_R8G8B8A8_UNORM;
      }
    }
    else if(params.remap == RemapTexture::RGBA16)
    {
      imCreateInfo.format = VK_FORMAT_R16G16B16A16_SFLOAT;
      renderFlags = eTexDisplay_16Render;
    }
    else if(params.remap == RemapTexture::RGBA32)
    {
      imCreateInfo.format = VK_FORMAT_R32G32B32A32_SFLOAT;
      renderFlags = eTexDisplay_32Render;
    }
    else
    {
      RDCERR("Unsupported remap format: %u", params.remap);
    }

    // apply the base remap type to the chosen format, then pick the matching remap flag
    imCreateInfo.format = GetViewCastedFormat(imCreateInfo.format, BaseRemapType(params));

    if(IsUIntFormat(imCreateInfo.format))
      renderFlags |= eTexDisplay_RemapUInt;
    else if(IsSIntFormat(imCreateInfo.format))
      renderFlags |= eTexDisplay_RemapSInt;
    else
      renderFlags |= eTexDisplay_RemapFloat;

    // force to 1 array slice, 1 mip
    imCreateInfo.arrayLayers = 1;
    imCreateInfo.mipLevels = 1;
    // force to 2D
    imCreateInfo.imageType = VK_IMAGE_TYPE_2D;
    imCreateInfo.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;

    // we'll need to cast to remap the stencil part
    if(IsStencilFormat(imInfo.format))
      imCreateInfo.flags |= VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT;

    // the temporary image only holds the requested mip, so bake the mip into its extent
    imCreateInfo.extent.width = RDCMAX(1U, imCreateInfo.extent.width >> s.mip);
    imCreateInfo.extent.height = RDCMAX(1U, imCreateInfo.extent.height >> s.mip);
    imCreateInfo.extent.depth = RDCMAX(1U, imCreateInfo.extent.depth >> s.mip);

    // convert a 3D texture into a 2D array, so we can render to the slices without needing
    // KHR_maintenance1
    if(imCreateInfo.extent.depth > 1)
    {
      imCreateInfo.arrayLayers = imCreateInfo.extent.depth;
      imCreateInfo.extent.depth = 1;
    }

    // create render texture similar to readback texture
    vt->CreateImage(Unwrap(dev), &imCreateInfo, NULL, &tmpImage);
    // tmpImage stays the raw handle used for direct dispatch calls; the wrapped copy exists so
    // the image can be given an ImageState and a debug name
    wrappedTmpImage = tmpImage;
    GetResourceManager()->WrapResource(Unwrap(dev), wrappedTmpImage);
    tmpImageState = ImageState(wrappedTmpImage, ImageInfo(imCreateInfo), eFrameRef_None);

    NameVulkanObject(wrappedTmpImage, "GetTextureData tmpImage");

    VkMemoryRequirements mrq = {0};
    vt->GetImageMemoryRequirements(Unwrap(dev), tmpImage, &mrq);

    VkMemoryAllocateInfo allocInfo = {
        VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        NULL,
        mrq.size,
        m_pDriver->GetGPULocalMemoryIndex(mrq.memoryTypeBits),
    };

    vkr = vt->AllocateMemory(Unwrap(dev), &allocInfo, NULL, &tmpMemory);
    CheckVkResult(vkr);

    if(vkr != VK_SUCCESS)
      return;

    vkr = vt->BindImageMemory(Unwrap(dev), tmpImage, tmpMemory, 0);
    CheckVkResult(vkr);

    tmpImageState.InlineTransition(
        cmd, m_pDriver->m_QueueFamilyIdx, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, 0,
        VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, m_pDriver->GetImageTransitionInfo());

    // end this command buffer, the rendertexture below will use its own and we want to ensure
    // ordering
    vt->EndCommandBuffer(Unwrap(cmd));

    if(Vulkan_Debug_SingleSubmitFlushing())
      m_pDriver->SubmitCmds();

    // create framebuffer/render pass to render to
    VkAttachmentDescription attDesc = {0,
                                       imCreateInfo.format,
                                       VK_SAMPLE_COUNT_1_BIT,
                                       VK_ATTACHMENT_LOAD_OP_LOAD,
                                       VK_ATTACHMENT_STORE_OP_STORE,
                                       VK_ATTACHMENT_LOAD_OP_DONT_CARE,
                                       VK_ATTACHMENT_STORE_OP_DONT_CARE,
                                       VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
                                       VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL};

    VkAttachmentReference attRef = {0, VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL};

    VkSubpassDescription subpass = {
        0,    VK_PIPELINE_BIND_POINT_GRAPHICS,
        0,    NULL,       // inputs
        1,    &attRef,    // color
        NULL,             // resolve
        NULL,             // depth-stencil
        0,    NULL,       // preserve
    };

    VkRenderPassCreateInfo rpinfo = {
        VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO,
        NULL,
        0,
        1,
        &attDesc,
        1,
        &subpass,
        0,
        NULL,    // dependencies
    };
    vt->CreateRenderPass(Unwrap(dev), &rpinfo, NULL, &tmpRP);

    // one framebuffer/view per layer of the temporary image (more than one only for 3D sources)
    numFBs = imCreateInfo.arrayLayers;

    // we'll need twice as many temp views/FBs for stencil views
    if(IsStencilFormat(imInfo.format))
    {
      tmpFB = new VkFramebuffer[numFBs * 2];
      tmpView = new VkImageView[numFBs * 2];
    }
    else
    {
      tmpFB = new VkFramebuffer[numFBs];
      tmpView = new VkImageView[numFBs];
    }

    // temporarily override the debug output dimensions with the temporary image's size;
    // restored after the loop
    int oldW = m_DebugWidth, oldH = m_DebugHeight;

    m_DebugWidth = imCreateInfo.extent.width;
    m_DebugHeight = imCreateInfo.extent.height;

    int renderCount = 0;

    // if 3d texture, render each slice separately, otherwise render once
    for(uint32_t i = 0; i < numFBs; i++)
    {
      // when rendering many slices, submit and flush every time the render count reaches a
      // multiple of the texture-render UBO ring size
      if(numFBs > 1 && (renderCount % m_TexRender.UBO.GetRingCount()) == 0)
      {
        m_pDriver->SubmitCmds();
        m_pDriver->FlushQ();
      }

      TextureDisplay texDisplay;

      texDisplay.red = texDisplay.green = texDisplay.blue = texDisplay.alpha = true;
      texDisplay.hdrMultiplier = -1.0f;
      texDisplay.linearDisplayAsGamma = false;
      texDisplay.overlay = DebugOverlay::NoOverlay;
      texDisplay.flipY = false;
      texDisplay.subresource.mip = s.mip;
      // for 3D sources the loop index selects the depth slice, otherwise use the requested slice
      texDisplay.subresource.slice = imInfo.type == VK_IMAGE_TYPE_3D ? i : s.slice;
      // a sample index of ~0U is passed when a resolve of all samples was requested
      texDisplay.subresource.sample =
          imInfo.type == VK_IMAGE_TYPE_3D ? 0 : (resolve ? ~0U : s.sample);
      texDisplay.customShaderId = ResourceId();
      texDisplay.rangeMin = params.blackPoint;
      texDisplay.rangeMax = params.whitePoint;
      texDisplay.scale = 1.0f;
      texDisplay.resourceId = tex;
      texDisplay.typeCast = params.typeCast;
      texDisplay.rawOutput = false;
      texDisplay.xOffset = 0;
      texDisplay.yOffset = 0;

      // view of layer i of the temporary image, used as the colour attachment
      VkImageViewCreateInfo viewInfo = {
          VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
          NULL,
          0,
          tmpImage,
          VK_IMAGE_VIEW_TYPE_2D,
          imCreateInfo.format,
          {VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY,
           VK_COMPONENT_SWIZZLE_IDENTITY, VK_COMPONENT_SWIZZLE_IDENTITY},
          {
              VK_IMAGE_ASPECT_COLOR_BIT,
              0,
              VK_REMAINING_MIP_LEVELS,
              i,
              1,
          },
      };

      vkr = vt->CreateImageView(Unwrap(dev), &viewInfo, NULL, &tmpView[i]);
      CheckVkResult(vkr);

      NameUnwrappedVulkanObject(tmpView[i], "GetTextureData tmpView[i]");

      VkFramebufferCreateInfo fbinfo = {
          VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
          NULL,
          0,
          tmpRP,
          1,
          &tmpView[i],
          (uint32_t)imCreateInfo.extent.width,
          (uint32_t)imCreateInfo.extent.height,
          1,
      };

      vkr = vt->CreateFramebuffer(Unwrap(dev), &fbinfo, NULL, &tmpFB[i]);
      CheckVkResult(vkr);

      VkClearValue clearval = {};
      VkRenderPassBeginInfo rpbegin = {
          VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
          NULL,
          tmpRP,
          tmpFB[i],
          {{
               0,
               0,
           },
           {imCreateInfo.extent.width, imCreateInfo.extent.height}},
          1,
          &clearval,
      };

      RenderTextureInternal(texDisplay, *srcImageState, rpbegin, renderFlags);
      renderCount++;

      // for textures with stencil, do another draw to copy the stencil
      if(isStencil)
      {
        // the stencil draw targets the same image layer through a UInt-cast view
        viewInfo.format = GetViewCastedFormat(viewInfo.format, CompType::UInt);

        // the stencil render pass depends only on the UInt-cast format, which is the same for
        // every slice, so create it once and reuse it. Re-creating it per iteration would
        // overwrite the handle and leak all but the last render pass.
        if(tmpRPStencil == VK_NULL_HANDLE)
        {
          attDesc.format = viewInfo.format;
          vkr = vt->CreateRenderPass(Unwrap(dev), &rpinfo, NULL, &tmpRPStencil);
          CheckVkResult(vkr);
        }
        fbinfo.renderPass = tmpRPStencil;
        rpbegin.renderPass = tmpRPStencil;

        // stencil views/framebuffers live in the second half of the arrays
        vkr = vt->CreateImageView(Unwrap(dev), &viewInfo, NULL, &tmpView[i + numFBs]);
        CheckVkResult(vkr);
        NameUnwrappedVulkanObject(tmpView[i + numFBs], "GetTextureData tmpView[i]");
        fbinfo.pAttachments = &tmpView[i + numFBs];
        vkr = vt->CreateFramebuffer(Unwrap(dev), &fbinfo, NULL, &tmpFB[i + numFBs]);
        CheckVkResult(vkr);
        rpbegin.framebuffer = tmpFB[i + numFBs];

        // stencil is always rendered as UInt, into the green channel only by default
        int stencilFlags = renderFlags;
        stencilFlags &= ~eTexDisplay_RemapFloat;
        stencilFlags &= ~eTexDisplay_RemapSRGB;
        stencilFlags |= eTexDisplay_RemapUInt | eTexDisplay_GreenOnly;

        texDisplay.red = texDisplay.blue = texDisplay.alpha = false;

        // S8 renders into red
        if(IsStencilOnlyFormat(imInfo.format))
        {
          texDisplay.red = true;
          texDisplay.green = false;
          stencilFlags &= ~eTexDisplay_GreenOnly;
        }

        RenderTextureInternal(texDisplay, *srcImageState, rpbegin, stencilFlags);
        renderCount++;
      }
    }

    m_DebugWidth = oldW;
    m_DebugHeight = oldH;

    // from here on the temporary image is the copy source
    srcImage = tmpImage;
    srcImageState = &tmpImageState;

    // fetch a new command buffer for copy & readback
    cmd = m_pDriver->GetNextCmd();

    if(cmd == VK_NULL_HANDLE)
      return;

    vkr = vt->BeginCommandBuffer(Unwrap(cmd), &beginInfo);
    CheckVkResult(vkr);

    tmpImageState.InlineTransition(cmd, m_pDriver->m_QueueFamilyIdx,
                                   VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                                   VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
                                   VK_ACCESS_TRANSFER_READ_BIT, m_pDriver->GetImageTransitionInfo());

    // these have already been selected, don't need to fetch that subresource
    // when copying back to readback buffer
    s.slice = 0;
    s.mip = 0;

    // no longer depth, if it was
    isDepth = false;
    isStencil = false;
  }
  else if(wasms && resolve)
  {
    // path 2: resolve the multisampled colour image into a single-sampled temporary image

    // force to 1 array slice, 1 mip
    imCreateInfo.arrayLayers = 1;
    imCreateInfo.mipLevels = 1;

    imCreateInfo.extent.width = RDCMAX(1U, imCreateInfo.extent.width >> s.mip);
    imCreateInfo.extent.height = RDCMAX(1U, imCreateInfo.extent.height >> s.mip);

    // create resolve texture
    vt->CreateImage(Unwrap(dev), &imCreateInfo, NULL, &tmpImage);
    wrappedTmpImage = tmpImage;
    GetResourceManager()->WrapResource(Unwrap(dev), wrappedTmpImage);
    tmpImageState = ImageState(wrappedTmpImage, ImageInfo(imCreateInfo), eFrameRef_None);

    NameVulkanObject(wrappedTmpImage, "GetTextureData tmpImage");

    VkMemoryRequirements mrq = {0};
    vt->GetImageMemoryRequirements(Unwrap(dev), tmpImage, &mrq);

    VkMemoryAllocateInfo allocInfo = {
        VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        NULL,
        mrq.size,
        m_pDriver->GetGPULocalMemoryIndex(mrq.memoryTypeBits),
    };

    vkr = vt->AllocateMemory(Unwrap(dev), &allocInfo, NULL, &tmpMemory);
    CheckVkResult(vkr);

    if(vkr != VK_SUCCESS)
      return;

    vkr = vt->BindImageMemory(Unwrap(dev), tmpImage, tmpMemory, 0);
    CheckVkResult(vkr);

    // resolve was forced off above for multisampled depth/stencil, so only colour reaches here
    RDCASSERT(!isDepth && !isStencil);

    VkImageResolve resolveRegion = {
        {VK_IMAGE_ASPECT_COLOR_BIT, s.mip, s.slice, 1},
        {0, 0, 0},
        {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1},
        {0, 0, 0},
        imCreateInfo.extent,
    };

    tmpImageState.InlineTransition(
        cmd, m_pDriver->m_QueueFamilyIdx, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 0,
        VK_ACCESS_TRANSFER_WRITE_BIT, m_pDriver->GetImageTransitionInfo());
    // temporarily move the live image into a transfer-source state; the cleanup barriers put it
    // back once the resolve has been recorded
    ImageBarrierSequence setupBarriers, cleanupBarriers;
    srcImageState->TempTransition(m_pDriver->m_QueueFamilyIdx, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                                  VK_ACCESS_TRANSFER_READ_BIT, setupBarriers, cleanupBarriers,
                                  m_pDriver->GetImageTransitionInfo());
    m_pDriver->InlineSetupImageBarriers(cmd, setupBarriers);
    m_pDriver->SubmitAndFlushImageStateBarriers(setupBarriers);

    // resolve from live texture to resolve texture
    vt->CmdResolveImage(Unwrap(cmd), srcImage, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, tmpImage,
                        VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &resolveRegion);

    tmpImageState.InlineTransition(cmd, m_pDriver->m_QueueFamilyIdx,
                                   VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_ACCESS_TRANSFER_WRITE_BIT,
                                   VK_ACCESS_TRANSFER_READ_BIT, m_pDriver->GetImageTransitionInfo());

    m_pDriver->InlineCleanupImageBarriers(cmd, cleanupBarriers);

    if(!cleanupBarriers.empty())
    {
      // ensure this resolve happens before handing back the source image to the original queue
      vkr = vt->EndCommandBuffer(Unwrap(cmd));
      CheckVkResult(vkr);

      m_pDriver->SubmitCmds();
      m_pDriver->FlushQ();

      m_pDriver->SubmitAndFlushImageStateBarriers(cleanupBarriers);

      // fetch a new command buffer for remaining work
      cmd = m_pDriver->GetNextCmd();

      if(cmd == VK_NULL_HANDLE)
        return;

      vkr = vt->BeginCommandBuffer(Unwrap(cmd), &beginInfo);
      CheckVkResult(vkr);
    }
    srcImageState = &tmpImageState;

    srcImage = tmpImage;

    // these have already been selected, don't need to fetch that subresource
    // when copying back to readback buffer
    s.slice = 0;
    s.mip = 0;
  }
  else if(wasms)
  {
    // path 3: fetch a single sample of a multisampled image straight into the readback buffer
    dataSize = GetByteSize(imInfo.extent.width, imInfo.extent.height, imInfo.extent.depth,
                           imCreateInfo.format, s.mip);

    // buffer size needs to be align to the int for shader writing
    VkBufferCreateInfo bufInfo = {
        VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        NULL,
        0,
        AlignUp(dataSize, 4U),
        VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
            VK_BUFFER_USAGE_TRANSFER_DST_BIT,
    };

    vkr = vt->CreateBuffer(Unwrap(dev), &bufInfo, NULL, &readbackBuf);
    CheckVkResult(vkr);

    VkMemoryRequirements mrq = {0};

    vt->GetBufferMemoryRequirements(Unwrap(dev), readbackBuf, &mrq);

    VkMemoryAllocateInfo allocInfo = {
        VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        NULL,
        mrq.size,
        m_pDriver->GetReadbackMemoryIndex(mrq.memoryTypeBits),
    };
    vkr = vt->AllocateMemory(Unwrap(dev), &allocInfo, NULL, &readbackMem);
    CheckVkResult(vkr);

    if(vkr != VK_SUCCESS)
      return;

    vkr = vt->BindBufferMemory(Unwrap(dev), readbackBuf, readbackMem, 0);
    CheckVkResult(vkr);

    // copy/expand multisampled live texture to readback buffer
    ImageBarrierSequence setupBarriers, cleanupBarriers;
    srcImageState->TempTransition(m_pDriver->m_QueueFamilyIdx,
                                  VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                                  VK_ACCESS_SHADER_READ_BIT, setupBarriers, cleanupBarriers,
                                  m_pDriver->GetImageTransitionInfo());
    m_pDriver->InlineSetupImageBarriers(cmd, setupBarriers);
    m_pDriver->SubmitAndFlushImageStateBarriers(setupBarriers);

    GetDebugManager()->CopyTex2DMSToBuffer(cmd, readbackBuf, srcImage, imCreateInfo.extent, s.slice,
                                           1, s.sample, 1, imCreateInfo.format);

    m_pDriver->InlineCleanupImageBarriers(cmd, cleanupBarriers);

    if(!cleanupBarriers.empty())
    {
      // ensure this resolve happens before handing back the source image to the original queue
      vkr = vt->EndCommandBuffer(Unwrap(cmd));
      CheckVkResult(vkr);

      m_pDriver->SubmitCmds();
      m_pDriver->FlushQ();

      m_pDriver->SubmitAndFlushImageStateBarriers(cleanupBarriers);

      // fetch a new command buffer for remaining work
      cmd = m_pDriver->GetNextCmd();

      if(cmd == VK_NULL_HANDLE)
        return;

      vkr = vt->BeginCommandBuffer(Unwrap(cmd), &beginInfo);
      CheckVkResult(vkr);
    }

    // readback buffer has already been populated, no need to call CmdCopyImageToBuffer
    copyToBuffer = false;
  }

  // byte offset within the readback buffer at which the stencil plane is written when depth and
  // stencil are copied as separate regions. Stays 0 when there is no depth plane in front of it
  VkDeviceSize stencilOffset = 0;
  // if we have no tmpImage, we're copying directly from the real image
  if(copyToBuffer)
  {
    ImageBarrierSequence cleanupBarriers;
    if(tmpImage == VK_NULL_HANDLE)
    {
      // path 4: the live image is the copy source, temporarily transition it for transfer reads
      ImageBarrierSequence setupBarriers;
      srcImageState->TempTransition(m_pDriver->m_QueueFamilyIdx, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                                    VK_ACCESS_TRANSFER_READ_BIT, setupBarriers, cleanupBarriers,
                                    m_pDriver->GetImageTransitionInfo());
      m_pDriver->InlineSetupImageBarriers(cmd, setupBarriers);
      m_pDriver->SubmitAndFlushImageStateBarriers(setupBarriers);
    }

    rdcarray<VkBufferImageCopy> copyregions;

    // template for every copy region: the selected mip/slice at full subresource extent. The
    // aspect mask and buffer offset are filled in per-region below
    VkBufferImageCopy copyRegionTemplate = {
        0,
        0,
        0,
        {VK_IMAGE_ASPECT_NONE, s.mip, s.slice, 1},
        {
            0,
            0,
            0,
        },
        {RDCMAX(1U, imCreateInfo.extent.width >> s.mip),
         RDCMAX(1U, imCreateInfo.extent.height >> s.mip),
         RDCMAX(1U, imCreateInfo.extent.depth >> s.mip)},
    };

    if(isDepth || isStencil)
    {
      // depth and stencil are copied as separate regions: depth first at offset 0, then stencil
      if(isDepth)
      {
        copyRegionTemplate.imageSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
        copyregions.push_back(copyRegionTemplate);

        // Stencil offset (if present). The stencil region is placed after the depth data at a
        // 4-byte aligned offset, and stencilOffset must record that *aligned* value: the CPU
        // interleave further down reads stencil from stencilOffset, which has to be the exact
        // location the GPU copy wrote to. Recording the unaligned size instead reads the wrong
        // bytes whenever the depth plane's byte size is not a multiple of 4.
        // NOTE(review): this assumes dataSize below is large enough to contain the aligned
        // stencil region; that depends on what GetByteSize returns for the combined format,
        // which is only visibly special-cased for D24S8 - confirm for D16S8.
        copyRegionTemplate.bufferOffset =
            GetByteSize(imInfo.extent.width, imInfo.extent.height, imInfo.extent.depth,
                        GetDepthOnlyFormat(imCreateInfo.format), s.mip);
        copyRegionTemplate.bufferOffset = AlignUp(copyRegionTemplate.bufferOffset, (VkDeviceSize)4);
        stencilOffset = copyRegionTemplate.bufferOffset;
      }

      if(isStencil)
      {
        copyRegionTemplate.imageSubresource.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
        copyregions.push_back(copyRegionTemplate);
      }
    }
    else if(isPlanar)
    {
      // multi-planar formats: one region per plane, packed one after another in the buffer
      for(uint32_t i = 0; i < planeCount; i++)
      {
        copyRegionTemplate.imageSubresource.aspectMask = VK_IMAGE_ASPECT_PLANE_0_BIT << i;

        VkExtent2D planeExtent =
            GetPlaneShape(RDCMAX(1U, imCreateInfo.extent.width >> s.mip),
                          RDCMAX(1U, imCreateInfo.extent.height >> s.mip), imCreateInfo.format, i);
        copyRegionTemplate.imageExtent.width = planeExtent.width;
        copyRegionTemplate.imageExtent.height = planeExtent.height;

        copyregions.push_back(copyRegionTemplate);

        copyRegionTemplate.bufferOffset +=
            GetPlaneByteSize(imCreateInfo.extent.width, imCreateInfo.extent.height,
                             imCreateInfo.extent.depth, imCreateInfo.format, s.mip, i);
      }
    }
    else
    {
      copyRegionTemplate.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
      copyregions.push_back(copyRegionTemplate);
    }

    dataSize = GetByteSize(imInfo.extent.width, imInfo.extent.height, imInfo.extent.depth,
                           imCreateInfo.format, s.mip);

    if(imCreateInfo.format == VK_FORMAT_D24_UNORM_S8_UINT)
    {
      // for most combined depth-stencil images this will be large enough for both to be copied
      // separately, but for D24S8 we need to add extra space since they won't be copied packed
      dataSize = AlignUp(dataSize, 4U);
      dataSize += GetByteSize(imInfo.extent.width, imInfo.extent.height, imInfo.extent.depth,
                              VK_FORMAT_S8_UINT, s.mip);
    }

    VkBufferCreateInfo bufInfo = {
        VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        NULL,
        0,
        dataSize,
        VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
    };

    vkr = vt->CreateBuffer(Unwrap(dev), &bufInfo, NULL, &readbackBuf);
    CheckVkResult(vkr);

    VkMemoryRequirements mrq = {0};

    vt->GetBufferMemoryRequirements(Unwrap(dev), readbackBuf, &mrq);

    VkMemoryAllocateInfo allocInfo = {
        VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        NULL,
        mrq.size,
        m_pDriver->GetReadbackMemoryIndex(mrq.memoryTypeBits),
    };
    vkr = vt->AllocateMemory(Unwrap(dev), &allocInfo, NULL, &readbackMem);
    CheckVkResult(vkr);

    if(vkr != VK_SUCCESS)
      return;

    vkr = vt->BindBufferMemory(Unwrap(dev), readbackBuf, readbackMem, 0);
    CheckVkResult(vkr);

    if(imInfo.type == VK_IMAGE_TYPE_3D && params.remap != RemapTexture::NoRemap)
    {
      // copy in each slice from the 2D array we created to render out the 3D texture
      for(uint32_t i = 0; i < imCreateInfo.arrayLayers; i++)
      {
        copyregions[0].imageSubresource.baseArrayLayer = i;
        copyregions[0].bufferOffset =
            i * GetByteSize(imCreateInfo.extent.width, imCreateInfo.extent.height, 1,
                            imCreateInfo.format, s.mip);
        vt->CmdCopyImageToBuffer(Unwrap(cmd), srcImage, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                                 readbackBuf, (uint32_t)copyregions.size(), copyregions.data());
      }
    }
    else
    {
      // 3D images have a single array layer; depth is covered by the region's extent instead
      if(imInfo.type == VK_IMAGE_TYPE_3D)
        copyregions[0].imageSubresource.baseArrayLayer = 0;

      // copy from desired subresource in srcImage to buffer
      vt->CmdCopyImageToBuffer(Unwrap(cmd), srcImage, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                               readbackBuf, (uint32_t)copyregions.size(), copyregions.data());
    }

    // if we have no tmpImage, we're copying directly from the real image
    if(tmpImage == VK_NULL_HANDLE)
    {
      m_pDriver->InlineCleanupImageBarriers(cmd, cleanupBarriers);

      if(!cleanupBarriers.empty())
      {
        // ensure this resolve happens before handing back the source image to the original queue
        vkr = vt->EndCommandBuffer(Unwrap(cmd));
        CheckVkResult(vkr);

        m_pDriver->SubmitCmds();
        m_pDriver->FlushQ();

        m_pDriver->SubmitAndFlushImageStateBarriers(cleanupBarriers);

        // fetch a new command buffer for remaining work
        cmd = m_pDriver->GetNextCmd();

        if(cmd == VK_NULL_HANDLE)
          return;

        vkr = vt->BeginCommandBuffer(Unwrap(cmd), &beginInfo);
        CheckVkResult(vkr);
      }
    }
  }

  VkBufferMemoryBarrier bufBarrier = {
      VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
      NULL,
      VK_ACCESS_TRANSFER_WRITE_BIT,
      VK_ACCESS_HOST_READ_BIT,
      VK_QUEUE_FAMILY_IGNORED,
      VK_QUEUE_FAMILY_IGNORED,
      readbackBuf,
      0,
      dataSize,
  };

  // wait for copy to finish before reading back to host
  DoPipelineBarrier(cmd, 1, &bufBarrier);

  vt->EndCommandBuffer(Unwrap(cmd));

  // submit and wait for the GPU so the readback memory can be read on the CPU
  m_pDriver->SubmitCmds();
  m_pDriver->FlushQ();

  // map the buffer and copy to return buffer
  byte *pData = NULL;
  vkr = vt->MapMemory(Unwrap(dev), readbackMem, 0, VK_WHOLE_SIZE, 0, (void **)&pData);
  CheckVkResult(vkr);
  if(vkr != VK_SUCCESS)
    return;
  if(!pData)
  {
    RDCERR("Manually reporting failed memory map");
    CheckVkResult(VK_ERROR_MEMORY_MAP_FAILED);
    return;
  }

  VkMappedMemoryRange range = {
      VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, NULL, readbackMem, 0, VK_WHOLE_SIZE,
  };

  // invalidate the mapped range before reading so GPU writes are visible to the host
  vkr = vt->InvalidateMappedMemoryRanges(Unwrap(dev), 1, &range);
  CheckVkResult(vkr);

  RDCASSERT(pData != NULL);

  data.resize(dataSize);

  if(params.remap == RemapTexture::RGBA32 && IsDepthAndStencilFormat(imInfo.format))
  {
    memcpy(data.data(), pData, dataSize);

    // the remap path rendered stencil as a UInt into the green channel of the RGBA32 image, so
    // reinterpret that channel as an integer and rescale it from [0, 255] to a float in [0, 1]
    Vec4f *output = (Vec4f *)data.data();
    Vec4u *input = (Vec4u *)pData;
    for(size_t i = 0; i < dataSize / sizeof(Vec4u); i++)
      output[i].y = float(input[i].y) / 255.0f;
  }
  else if(isDepth && isStencil && copyToBuffer)
  {
    // We only need to manually interleave if we use CmdCopyImageToBuffer.
    // CopyDepthTex2DMS2Buffer will produce interleaved results.
    size_t pixelCount = std::max(1U, imCreateInfo.extent.width >> s.mip) *
                        std::max(1U, imCreateInfo.extent.height >> s.mip) *
                        std::max(1U, imCreateInfo.extent.depth >> s.mip);

    // for some reason reading direct from mapped memory here is *super* slow on android (1.5s to
    // iterate over the image), so we memcpy to a temporary buffer.
    // The temporary covers the depth plane (including any alignment padding) up to stencilOffset,
    // followed by one stencil byte per pixel.
    rdcarray<byte> tmp;
    tmp.resize((size_t)stencilOffset + pixelCount * sizeof(uint8_t));
    memcpy(tmp.data(), pData, tmp.size());

    if(imCreateInfo.format == VK_FORMAT_D16_UNORM_S8_UINT)
    {
      // output per pixel: 16-bit depth followed by stencil widened to 16 bits
      uint16_t *dSrc = (uint16_t *)tmp.data();
      uint8_t *sSrc = (uint8_t *)(tmp.data() + stencilOffset);

      uint16_t *dDst = (uint16_t *)data.data();
      uint16_t *sDst = dDst + 1;    // interleaved, next pixel

      for(size_t i = 0; i < pixelCount; i++)
      {
        *dDst = *dSrc;
        *sDst = *sSrc;

        // increment source pointers by 1 since they're separate, and dest pointers by 2 since
        // they're interleaved
        dDst += 2;
        sDst += 2;

        sSrc++;
        dSrc++;
      }
    }
    else if(imCreateInfo.format == VK_FORMAT_D24_UNORM_S8_UINT)
    {
      // we can copy the depth from D24 as a 32-bit integer, since the remaining bits are garbage
      // and we overwrite them with stencil
      uint32_t *dSrc = (uint32_t *)tmp.data();
      uint8_t *sSrc = (uint8_t *)(tmp.data() + stencilOffset);

      uint32_t *dst = (uint32_t *)data.data();

      for(size_t i = 0; i < pixelCount; i++)
      {
        // pack the data together again, stencil in top bits
        *dst = (*dSrc & 0x00ffffff) | (uint32_t(*sSrc) << 24);

        dst++;
        sSrc++;
        dSrc++;
      }
    }
    else
    {
      // remaining combined formats: output per pixel is 32-bit depth followed by stencil widened
      // to 32 bits
      uint32_t *dSrc = (uint32_t *)tmp.data();
      uint8_t *sSrc = (uint8_t *)(tmp.data() + stencilOffset);

      uint32_t *dDst = (uint32_t *)data.data();
      uint32_t *sDst = dDst + 1;    // interleaved, next pixel

      for(size_t i = 0; i < pixelCount; i++)
      {
        *dDst = *dSrc;
        *sDst = *sSrc;

        // increment source pointers by 1 since they're separate, and dest pointers by 2 since
        // they're interleaved
        dDst += 2;
        sDst += 2;

        sSrc++;
        dSrc++;
      }
    }
    // need to manually copy to interleave pixels
  }
  else
  {
    memcpy(data.data(), pData, dataSize);

    // vulkan's bitpacking of some layouts puts alpha in the low bits, which is not our 'standard'
    // layout and is not representable in our resource formats
    if(params.standardLayout)
    {
      if(imCreateInfo.format == VK_FORMAT_R4G4B4A4_UNORM_PACK16 ||
         imCreateInfo.format == VK_FORMAT_B4G4R4A4_UNORM_PACK16)
      {
        uint16_t *ptr = (uint16_t *)data.data();

        // rotate each 16-bit texel right by 4 so the low 4 bits (alpha) end up in the top bits
        for(uint32_t i = 0; i < dataSize; i += sizeof(uint16_t))
        {
          const uint16_t val = *ptr;
          *ptr = (val >> 4) | ((val & 0xf) << 12);
          ptr++;
        }
      }
      else if(imCreateInfo.format == VK_FORMAT_R5G5B5A1_UNORM_PACK16 ||
              imCreateInfo.format == VK_FORMAT_B5G5R5A1_UNORM_PACK16)
      {
        uint16_t *ptr = (uint16_t *)data.data();

        // rotate each 16-bit texel right by 1 so the low bit (alpha) ends up in the top bit
        for(uint32_t i = 0; i < dataSize; i += sizeof(uint16_t))
        {
          const uint16_t val = *ptr;
          *ptr = (val >> 1) | ((val & 0x1) << 15);
          ptr++;
        }
      }
    }
  }

  vt->UnmapMemory(Unwrap(dev), readbackMem);

  // clean up temporary objects
  vt->DestroyBuffer(Unwrap(dev), readbackBuf, NULL);
  vt->FreeMemory(Unwrap(dev), readbackMem, NULL);

  if(tmpImage != VK_NULL_HANDLE)
  {
    GetResourceManager()->ReleaseWrappedResource(wrappedTmpImage, true);
    vt->DestroyImage(Unwrap(dev), tmpImage, NULL);
    vt->FreeMemory(Unwrap(dev), tmpMemory, NULL);
  }

  if(tmpFB != NULL)
  {
    // the arrays were allocated at double size for stencil formats, see the remap path above
    if(IsStencilFormat(imInfo.format))
      numFBs *= 2;

    for(uint32_t i = 0; i < numFBs; i++)
    {
      vt->DestroyFramebuffer(Unwrap(dev), tmpFB[i], NULL);
      vt->DestroyImageView(Unwrap(dev), tmpView[i], NULL);
    }
    delete[] tmpFB;
    delete[] tmpView;
    vt->DestroyRenderPass(Unwrap(dev), tmpRP, NULL);
    vt->DestroyRenderPass(Unwrap(dev), tmpRPStencil, NULL);
  }
}