bool VulkanReplay::FetchShaderFeedback()

in renderdoc/driver/vulkan/vk_shader_feedback.cpp [1588:2379]


// Runs the action at eventId with patched shaders that record which elements of arrayed
// descriptor bindings are dynamically accessed and that capture any shader printf output. The
// results are stored in m_BindlessFeedback.Usage[eventId].
//
// An entry for the event is always created once we get past the initial checks, so the work is
// never repeated if the same event is selected again.
//
// Returns true only if the patched action was actually executed and its results read back.
// Returns false if feedback was already fetched for this event, feedback is disabled, the event is
// not a draw/dispatch, no pipeline is bound, there is nothing to feed back on, or the device can't
// support the feedback patching.
bool VulkanReplay::FetchShaderFeedback(uint32_t eventId)
{
  if(m_BindlessFeedback.Usage.find(eventId) != m_BindlessFeedback.Usage.end())
    return false;

  if(!Vulkan_BindlessFeedback())
    return false;

  // create it here so we won't re-run any code if the event is re-selected. We'll mark it as valid
  // if it actually has any data in it later.
  VKDynamicShaderFeedback &result = m_BindlessFeedback.Usage[eventId];

  bool useBufferAddress = (m_pDriver->GetExtensions(NULL).ext_KHR_buffer_device_address ||
                           m_pDriver->GetExtensions(NULL).ext_EXT_buffer_device_address);

  if(Vulkan_Debug_DisableBufferDeviceAddress() ||
     m_pDriver->GetDriverInfo().BufferDeviceAddressBrokenDriver())
    useBufferAddress = false;

  bool useBufferAddressKHR = m_pDriver->GetExtensions(NULL).ext_KHR_buffer_device_address;

  const VulkanRenderState &state = m_pDriver->m_RenderState;
  VulkanCreationInfo &creationInfo = m_pDriver->m_CreationInfo;

  const ActionDescription *action = m_pDriver->GetAction(eventId);

  if(action == NULL ||
     !(action->flags & (ActionFlags::Dispatch | ActionFlags::MeshDispatch | ActionFlags::Drawcall)))
  {
    // deliberately show no bindings as used for non-draws
    result.valid = true;
    return false;
  }

  result.compute = bool(action->flags & ActionFlags::Dispatch);

  const VulkanStatePipeline &pipe = result.compute ? state.compute : state.graphics;

  if(pipe.pipeline == ResourceId())
  {
    result.valid = true;
    return false;
  }

  const VulkanCreationInfo::Pipeline &pipeInfo = creationInfo.m_Pipeline[pipe.pipeline];

  bool usesPrintf = false;

  VkGraphicsPipelineCreateInfo graphicsInfo = {};
  VkComputePipelineCreateInfo computeInfo = {};

  // get pipeline create info
  if(result.compute)
  {
    m_pDriver->GetShaderCache()->MakeComputePipelineInfo(computeInfo, state.compute.pipeline);
  }
  else
  {
    m_pDriver->GetShaderCache()->MakeGraphicsPipelineInfo(graphicsInfo, state.graphics.pipeline);

    if(graphicsInfo.renderPass != VK_NULL_HANDLE)
      graphicsInfo.renderPass =
          creationInfo.m_RenderPass[GetResID(graphicsInfo.renderPass)].loadRPs[graphicsInfo.subpass];
    graphicsInfo.subpass = 0;
  }

  // determine whether any stage in the pipeline uses printf. Index 5 is the compute stage slot.
  if(result.compute)
  {
    usesPrintf = pipeInfo.shaders[5].patchData->usesPrintf;
  }
  else
  {
    for(uint32_t i = 0; i < graphicsInfo.stageCount; i++)
    {
      VkPipelineShaderStageCreateInfo &stage =
          (VkPipelineShaderStageCreateInfo &)graphicsInfo.pStages[i];

      int idx = StageIndex(stage.stage);

      usesPrintf |= pipeInfo.shaders[idx].patchData->usesPrintf;
    }
  }

  BindlessFeedbackData feedbackData;

  if(usesPrintf)
  {
    // reserve some space at the start for an atomic offset counter then the buffer size, and an
    // overflow section for any clamped messages
    feedbackData.feedbackStorageSize += 16 + Vulkan_Debug_PrintfBufferSize() + 1024;
  }

  ShaderReflection *stageRefls[NumShaderStages] = {};

  {
    const rdcarray<VulkanStatePipeline::DescriptorAndOffsets> &descSets =
        (result.compute ? state.compute.descSets : state.graphics.descSets);

    rdcarray<const DescSetLayout *> descLayouts;
    for(size_t set = 0; set < pipeInfo.descSetLayouts.size(); set++)
      descLayouts.push_back(&creationInfo.m_DescSetLayout[pipeInfo.descSetLayouts[set]]);

    // allocates a range of feedback slots (one uint32 per array element) for an arrayed binding
    // and records where it lives in feedbackData.offsetMap
    auto processBinding = [this, &descLayouts, &descSets, &feedbackData](
                              ShaderStage stage, DescriptorType type, uint16_t index,
                              uint32_t bindset, uint32_t bind, uint32_t arraySize) {
      // only process array bindings
      if(arraySize <= 1)
        return;

      BindKey key;
      key.stage = stage;
      key.arraySize = arraySize;
      key.index.category = CategoryForDescriptorType(type);
      key.index.index = index;
      key.index.arrayElement = 0;

      // the set must exist both in the pipeline's layouts and in the currently bound sets.
      // bindset == descSets.size() is already out of bounds so both checks must be >=
      if(bindset >= descLayouts.size() || !descLayouts[bindset] || bindset >= descSets.size() ||
         descSets[bindset].descSet == ResourceId())
      {
        RDCERR("Invalid set %u referenced by %s shader", bindset, ToStr(key.stage).c_str());
        return;
      }

      ResourceId descSet = descSets[bindset].descSet;

      if(bind >= descLayouts[bindset]->bindings.size())
      {
        RDCERR("Invalid binding %u in set %u referenced by %s shader", bind, bindset,
               ToStr(key.stage).c_str());
        return;
      }

      // VkShaderStageFlagBits and ShaderStageMask are identical bit-for-bit.
      if((descLayouts[bindset]->bindings[bind].stageFlags &
          (VkShaderStageFlags)MaskForStage(key.stage)) == 0)
      {
        // this might be deliberate if the binding is never actually used dynamically, only
        // statically used bindings must be declared
        return;
      }

      if(descLayouts[bindset]->bindings[bind].variableSize)
      {
        auto it = m_pDriver->m_DescriptorSetState.find(descSet);
        if(it != m_pDriver->m_DescriptorSetState.end())
          arraySize = it->second.data.variableDescriptorCount;
      }
      else if(arraySize == ~0U)
      {
        // if the array was unbounded, clamp it to the size of the descriptor set
        arraySize = descLayouts[bindset]->bindings[bind].descriptorCount;
      }

      DescriptorAccess access;
      access.stage = key.stage;
      access.type = type;
      access.index = index;
      access.descriptorStore = m_pDriver->GetResourceManager()->GetOriginalID(descSet);
      access.byteOffset =
          descLayouts[bindset]->bindings[bind].elemOffset + descLayouts[bindset]->inlineByteSize;
      access.byteSize = 1;

      feedbackData.offsetMap[key] = {feedbackData.feedbackStorageSize, arraySize, access};

      feedbackData.feedbackStorageSize += arraySize * sizeof(uint32_t);
    };

    for(const VulkanCreationInfo::Pipeline::Shader &sh : pipeInfo.shaders)
    {
      if(!sh.refl)
        continue;

      stageRefls[(uint32_t)sh.refl->stage] = sh.refl;

      for(uint32_t i = 0; i < sh.refl->constantBlocks.size(); i++)
        processBinding(sh.refl->stage, DescriptorType::ConstantBuffer, i & 0xffff,
                       sh.refl->constantBlocks[i].fixedBindSetOrSpace,
                       sh.refl->constantBlocks[i].fixedBindNumber,
                       sh.refl->constantBlocks[i].bindArraySize);

      for(uint32_t i = 0; i < sh.refl->samplers.size(); i++)
        processBinding(sh.refl->stage, DescriptorType::Sampler, i & 0xffff,
                       sh.refl->samplers[i].fixedBindSetOrSpace,
                       sh.refl->samplers[i].fixedBindNumber, sh.refl->samplers[i].bindArraySize);

      for(uint32_t i = 0; i < sh.refl->readOnlyResources.size(); i++)
        processBinding(sh.refl->stage, sh.refl->readOnlyResources[i].descriptorType, i & 0xffff,
                       sh.refl->readOnlyResources[i].fixedBindSetOrSpace,
                       sh.refl->readOnlyResources[i].fixedBindNumber,
                       sh.refl->readOnlyResources[i].bindArraySize);

      for(uint32_t i = 0; i < sh.refl->readWriteResources.size(); i++)
        processBinding(sh.refl->stage, sh.refl->readWriteResources[i].descriptorType, i & 0xffff,
                       sh.refl->readWriteResources[i].fixedBindSetOrSpace,
                       sh.refl->readWriteResources[i].fixedBindNumber,
                       sh.refl->readWriteResources[i].bindArraySize);
    }
  }

  // the number of uint32 slots allocated so far, passed to the shader annotation below
  uint32_t maxSlot = uint32_t(feedbackData.feedbackStorageSize / sizeof(uint32_t));

  // add some extra padding just in case of out-of-bounds writes
  feedbackData.feedbackStorageSize += 128;

  // if we don't have any array descriptors or printf's to feedback then just return now
  if(feedbackData.offsetMap.empty() && !usesPrintf)
  {
    return false;
  }

  if(!m_pDriver->GetDeviceEnabledFeatures().shaderInt64 &&
     feedbackData.feedbackStorageSize > 0xffff0000U)
  {
    RDCLOG(
        "Feedback buffer is too large for 32-bit addressed maths, and device doesn't support "
        "int64");
    return false;
  }

  if(!result.compute)
  {
    // if we don't have any stores supported at all, we can't do feedback on the graphics pipeline
    if(!m_pDriver->GetDeviceEnabledFeatures().vertexPipelineStoresAndAtomics &&
       !m_pDriver->GetDeviceEnabledFeatures().fragmentStoresAndAtomics)
    {
      return false;
    }
  }

  // we go through the driver for all these creations since they need to be properly
  // registered in order to be put in the partial replay state. Our patched shader is valid so we
  // don't need to replay after doing the feedback execute
  VkResult vkr = VK_SUCCESS;
  VkDevice dev = m_Device;

  // the feedback buffer is kept between calls and only re-created when it needs to grow
  if(feedbackData.feedbackStorageSize > m_BindlessFeedback.FeedbackBuffer.sz)
  {
    uint32_t flags = GPUBuffer::eGPUBufferGPULocal | GPUBuffer::eGPUBufferSSBO;

    if(useBufferAddress)
      flags |= GPUBuffer::eGPUBufferAddressable;

    m_BindlessFeedback.FeedbackBuffer.Destroy();
    m_BindlessFeedback.FeedbackBuffer.Create(m_pDriver, dev, feedbackData.feedbackStorageSize, 1,
                                             flags);
  }

  VkDeviceAddress bufferAddress = 0;

  VkDescriptorPool descpool = VK_NULL_HANDLE;
  rdcarray<VkDescriptorSetLayout> setLayouts;
  rdcarray<VkDescriptorSet> descSets;

  VkPipelineLayout pipeLayout = VK_NULL_HANDLE;

  if(useBufferAddress)
  {
    RDCCOMPILE_ASSERT(VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO ==
                          VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT,
                      "KHR and EXT buffer_device_address should be interchangeable here.");
    VkBufferDeviceAddressInfo getAddressInfo = {VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO};
    getAddressInfo.buffer = m_BindlessFeedback.FeedbackBuffer.buf;

    if(useBufferAddressKHR)
      bufferAddress = m_pDriver->vkGetBufferDeviceAddress(dev, &getAddressInfo);
    else
      bufferAddress = m_pDriver->vkGetBufferDeviceAddressEXT(dev, &getAddressInfo);
  }
  else
  {
    VkDescriptorSetLayoutBinding newBindings[] = {
        // output buffer
        {
            0,
            VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
            1,
            VkShaderStageFlags(result.compute ? VK_SHADER_STAGE_COMPUTE_BIT
                                              : VK_SHADER_STAGE_ALL_GRAPHICS),
            NULL,
        },
    };
    RDCCOMPILE_ASSERT(ARRAY_COUNT(newBindings) == 1,
                      "Should only be one new descriptor for bindless feedback");

    // create a duplicate set of descriptor sets, all visible to compute, with bindings shifted to
    // account for new ones we need. This also copies the existing bindings into the new sets
    PatchReservedDescriptors(pipe, descpool, setLayouts, descSets, VkShaderStageFlagBits(),
                             newBindings, ARRAY_COUNT(newBindings));

    // if the pool failed due to limits, it will be NULL so bail now
    if(descpool == VK_NULL_HANDLE)
      return false;

    // create pipeline layout with new descriptor set layouts
    {
      const rdcarray<VkPushConstantRange> &push =
          creationInfo.m_PipelineLayout[result.compute ? pipeInfo.compLayout : pipeInfo.vertLayout]
              .pushRanges;

      VkPipelineLayoutCreateInfo pipeLayoutInfo = {
          VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
          NULL,
          0,
          (uint32_t)setLayouts.size(),
          setLayouts.data(),
          (uint32_t)push.size(),
          push.data(),
      };

      vkr = m_pDriver->vkCreatePipelineLayout(dev, &pipeLayoutInfo, NULL, &pipeLayout);
      CheckVkResult(vkr);

      // we'll only use one, set both structs to keep things simple
      computeInfo.layout = pipeLayout;
      graphicsInfo.layout = pipeLayout;
    }

    // vkUpdateDescriptorSet desc set to point to buffer
    VkDescriptorBufferInfo desc = {0};

    m_BindlessFeedback.FeedbackBuffer.FillDescriptor(desc);

    VkWriteDescriptorSet write = {
        VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
        NULL,
        Unwrap(descSets[0]),
        0,
        0,
        1,
        VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        NULL,
        &desc,
        NULL,
    };

    ObjDisp(dev)->UpdateDescriptorSets(Unwrap(dev), 1, &write, 0, NULL);
  }

  // create shader modules with patched code for each stage in the pipeline
  VkShaderModuleCreateInfo moduleCreateInfo = {VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO};

  VkShaderModule modules[NumShaderStages] = {};

  // filenames used when dumping the before/after SPIR-V for debugging, indexed by stage
  const rdcstr filename[NumShaderStages] = {
      "bindless_vertex.spv", "bindless_hull.spv",    "bindless_domain.spv", "bindless_geometry.spv",
      "bindless_pixel.spv",  "bindless_compute.spv", "bindless_task.spv",   "bindless_mesh.spv",
  };

  // per-stage printf format data filled out by the annotation, keyed by printf ID
  std::map<uint32_t, PrintfData> printfData[NumShaderStages];

  if(result.compute)
  {
    VkPipelineShaderStageCreateInfo &stage = computeInfo.stage;

    const VulkanCreationInfo::ShaderModule &moduleInfo =
        creationInfo.m_ShaderModule[pipeInfo.shaders[5].module];

    rdcarray<uint32_t> modSpirv = moduleInfo.spirv.GetSPIRV();

    if(!Vulkan_Debug_FeedbackDumpDirPath().empty())
      FileIO::WriteAll(Vulkan_Debug_FeedbackDumpDirPath() + "/before_" + filename[5], modSpirv);

    if(m_pDriver->GetDeviceEnabledFeatures().shaderInt64)
    {
      AnnotateShader<uint64_t>(*pipeInfo.shaders[5].refl, *pipeInfo.shaders[5].patchData,
                               ShaderStage(StageIndex(stage.stage)), stage.pName,
                               feedbackData.offsetMap, maxSlot, false, bufferAddress,
                               useBufferAddressKHR, false, modSpirv, printfData[5]);
    }
    else
    {
      AnnotateShader<uint32_t>(*pipeInfo.shaders[5].refl, *pipeInfo.shaders[5].patchData,
                               ShaderStage(StageIndex(stage.stage)), stage.pName,
                               feedbackData.offsetMap, maxSlot, false, bufferAddress,
                               useBufferAddressKHR, false, modSpirv, printfData[5]);
    }

    if(!Vulkan_Debug_FeedbackDumpDirPath().empty())
      FileIO::WriteAll(Vulkan_Debug_FeedbackDumpDirPath() + "/after_" + filename[5], modSpirv);

    moduleCreateInfo.pCode = modSpirv.data();
    moduleCreateInfo.codeSize = modSpirv.size() * sizeof(uint32_t);

    vkr = m_pDriver->vkCreateShaderModule(dev, &moduleCreateInfo, NULL, &modules[0]);
    CheckVkResult(vkr);

    stage.module = modules[0];
  }
  else
  {
    bool hasGeomOrMesh = false;

    for(uint32_t i = 0; i < graphicsInfo.stageCount; i++)
    {
      VkPipelineShaderStageCreateInfo &stage =
          (VkPipelineShaderStageCreateInfo &)graphicsInfo.pStages[i];

      if((stage.stage & (VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_MESH_BIT_EXT)) != 0)
      {
        hasGeomOrMesh = true;
        break;
      }
    }

    bool usePrimitiveID =
        !hasGeomOrMesh && m_pDriver->GetDeviceEnabledFeatures().geometryShader != VK_FALSE;

    bool usesMultiview = state.GetRenderPass() != ResourceId()
                             ? creationInfo.m_RenderPass[state.GetRenderPass()]
                                       .subpasses[state.subpass]
                                       .multiviews.size() > 1
                             : pipeInfo.viewMask != 0;

    for(uint32_t i = 0; i < graphicsInfo.stageCount; i++)
    {
      VkPipelineShaderStageCreateInfo &stage =
          (VkPipelineShaderStageCreateInfo &)graphicsInfo.pStages[i];

      bool storesUnsupported = false;

      if(stage.stage & VK_SHADER_STAGE_FRAGMENT_BIT)
      {
        if(!m_pDriver->GetDeviceEnabledFeatures().fragmentStoresAndAtomics)
          storesUnsupported = true;
      }
      else
      {
        if(!m_pDriver->GetDeviceEnabledFeatures().vertexPipelineStoresAndAtomics)
          storesUnsupported = true;
      }

      // if we are using buffer device address, we can just skip patching this shader.
      //
      // if we're not using BDA, we need to be sure all stages have the bindings patched in-kind.
      // Otherwise if e.g. vertex stores aren't supported the vertex bindings won't be patched and
      // will mismatch our patched descriptor sets
      if(storesUnsupported && bufferAddress != 0)
      {
        continue;
      }

      int idx = StageIndex(stage.stage);

      const VulkanCreationInfo::ShaderModule &moduleInfo =
          creationInfo.m_ShaderModule[pipeInfo.shaders[idx].module];

      rdcarray<uint32_t> modSpirv = moduleInfo.spirv.GetSPIRV();

      if(!Vulkan_Debug_FeedbackDumpDirPath().empty())
        FileIO::WriteAll(Vulkan_Debug_FeedbackDumpDirPath() + "/before_" + filename[idx], modSpirv);

      if(storesUnsupported)
      {
        OffsetBindingsToMatch(modSpirv);
      }
      else if(m_pDriver->GetDeviceEnabledFeatures().shaderInt64)
      {
        AnnotateShader<uint64_t>(*pipeInfo.shaders[idx].refl, *pipeInfo.shaders[idx].patchData,
                                 ShaderStage(StageIndex(stage.stage)), stage.pName,
                                 feedbackData.offsetMap, maxSlot, usePrimitiveID, bufferAddress,
                                 useBufferAddressKHR, usesMultiview, modSpirv, printfData[idx]);
      }
      else
      {
        AnnotateShader<uint32_t>(*pipeInfo.shaders[idx].refl, *pipeInfo.shaders[idx].patchData,
                                 ShaderStage(StageIndex(stage.stage)), stage.pName,
                                 feedbackData.offsetMap, maxSlot, usePrimitiveID, bufferAddress,
                                 useBufferAddressKHR, usesMultiview, modSpirv, printfData[idx]);
      }

      if(!Vulkan_Debug_FeedbackDumpDirPath().empty())
        FileIO::WriteAll(Vulkan_Debug_FeedbackDumpDirPath() + "/after_" + filename[idx], modSpirv);

      moduleCreateInfo.pCode = modSpirv.data();
      moduleCreateInfo.codeSize = modSpirv.size() * sizeof(uint32_t);

      vkr = m_pDriver->vkCreateShaderModule(dev, &moduleCreateInfo, NULL, &modules[i]);
      CheckVkResult(vkr);

      stage.module = modules[i];
    }
  }

  // initialised so that the later GetResID/destroy see a null handle if creation fails
  VkPipeline feedbackPipe = VK_NULL_HANDLE;

  if(result.compute)
  {
    vkr = m_pDriver->vkCreateComputePipelines(m_Device, VK_NULL_HANDLE, 1, &computeInfo, NULL,
                                              &feedbackPipe);
    CheckVkResult(vkr);
  }
  else
  {
    vkr = m_pDriver->vkCreateGraphicsPipelines(m_Device, VK_NULL_HANDLE, 1, &graphicsInfo, NULL,
                                               &feedbackPipe);
    CheckVkResult(vkr);
  }

  // destroys all the temporary objects created above. Shared between the normal exit and the early
  // exit if no command buffer is available, so that neither path leaks.
  auto destroyTemporaries = [this, dev, &descpool, &descSets, &setLayouts, &pipeLayout,
                             &feedbackPipe, &modules]() {
    if(descpool != VK_NULL_HANDLE)
    {
      // delete descriptors. Technically we don't have to free the descriptor sets, but our
      // tracking on replay doesn't handle destroying children of pooled objects so we do it
      // explicitly anyway.
      m_pDriver->vkFreeDescriptorSets(dev, descpool, (uint32_t)descSets.size(), descSets.data());

      // delete descriptor pool
      m_pDriver->vkDestroyDescriptorPool(dev, descpool, NULL);
    }

    // delete descriptor set layouts
    for(VkDescriptorSetLayout layout : setLayouts)
      m_pDriver->vkDestroyDescriptorSetLayout(dev, layout, NULL);

    // delete pipeline layout
    m_pDriver->vkDestroyPipelineLayout(dev, pipeLayout, NULL);

    // delete pipeline
    m_pDriver->vkDestroyPipeline(dev, feedbackPipe, NULL);

    // delete shader/shader module
    for(size_t i = 0; i < ARRAY_COUNT(modules); i++)
      if(modules[i] != VK_NULL_HANDLE)
        m_pDriver->vkDestroyShaderModule(dev, modules[i], NULL);
  };

  // make copy of state to draw from
  VulkanRenderState modifiedstate = state;
  VulkanStatePipeline &modifiedpipe = result.compute ? modifiedstate.compute : modifiedstate.graphics;

  // bind created pipeline to partial replay state
  modifiedpipe.pipeline = GetResID(feedbackPipe);

  if(!useBufferAddress)
  {
    // replace descriptor set IDs with our temporary sets. The offsets we keep the same. If the
    // original action had no sets, we ensure there's room (with no offsets needed)

    if(modifiedpipe.descSets.empty())
      modifiedpipe.descSets.resize(1);

    for(size_t i = 0; i < descSets.size(); i++)
    {
      modifiedpipe.descSets[i].pipeLayout = GetResID(pipeLayout);
      modifiedpipe.descSets[i].descSet = GetResID(descSets[i]);
    }
  }

  modifiedstate.subpassContents = VK_SUBPASS_CONTENTS_INLINE;
  modifiedstate.dynamicRendering.flags &= ~VK_RENDERING_CONTENTS_SECONDARY_COMMAND_BUFFERS_BIT;

  {
    VkCommandBuffer cmd = m_pDriver->GetNextCmd();

    if(cmd == VK_NULL_HANDLE)
    {
      // don't leak the temporary objects created above if we can't record the feedback work
      destroyTemporaries();
      return false;
    }

    VkCommandBufferBeginInfo beginInfo = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, NULL,
                                          VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT};

    vkr = ObjDisp(dev)->BeginCommandBuffer(Unwrap(cmd), &beginInfo);
    CheckVkResult(vkr);

    // fill destination buffer with 0s to ensure a baseline to then feedback against
    ObjDisp(dev)->CmdFillBuffer(Unwrap(cmd), Unwrap(m_BindlessFeedback.FeedbackBuffer.buf), 0,
                                feedbackData.feedbackStorageSize, 0);

    VkBufferMemoryBarrier feedbackbufBarrier = {
        VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
        NULL,
        VK_ACCESS_TRANSFER_WRITE_BIT,
        VK_ACCESS_SHADER_WRITE_BIT,
        VK_QUEUE_FAMILY_IGNORED,
        VK_QUEUE_FAMILY_IGNORED,
        Unwrap(m_BindlessFeedback.FeedbackBuffer.buf),
        0,
        feedbackData.feedbackStorageSize,
    };

    // wait for the above fill to finish.
    DoPipelineBarrier(cmd, 1, &feedbackbufBarrier);

    if(result.compute)
    {
      modifiedstate.BindPipeline(m_pDriver, cmd, VulkanRenderState::BindCompute, true);

      ObjDisp(cmd)->CmdDispatch(Unwrap(cmd), action->dispatchDimension[0],
                                action->dispatchDimension[1], action->dispatchDimension[2]);
    }
    else
    {
      modifiedstate.BeginRenderPassAndApplyState(m_pDriver, cmd, VulkanRenderState::BindGraphics,
                                                 false);

      m_pDriver->ReplayDraw(cmd, *action);

      modifiedstate.EndRenderPass(cmd);
    }

    vkr = ObjDisp(dev)->EndCommandBuffer(Unwrap(cmd));
    CheckVkResult(vkr);

    m_pDriver->SubmitCmds();
    m_pDriver->FlushQ();
  }

  bytebuf data;
  GetBufferData(GetResID(m_BindlessFeedback.FeedbackBuffer.buf), 0, 0, data);

  // any non-zero slot means that array element was accessed, so record an access for it
  for(auto it = feedbackData.offsetMap.begin(); it != feedbackData.offsetMap.end(); ++it)
  {
    uint32_t *readbackData = (uint32_t *)(data.data() + it->second.offset);

    DescriptorAccess access = it->second.access;

    for(uint32_t i = 0; i < it->second.numEntries; i++)
    {
      if(readbackData[i])
      {
        access.arrayElement = i;

        result.access.push_back(access);
      }

      access.byteOffset++;
    }
  }

  result.valid = true;

  uint32_t *printfBuf = (uint32_t *)data.data();
  uint32_t *printfBufEnd = (uint32_t *)(data.data() + Vulkan_Debug_PrintfBufferSize());
  if(usesPrintf && *printfBuf > 0)
  {
    uint32_t wordsNeeded = *printfBuf;

    if(wordsNeeded > Vulkan_Debug_PrintfBufferSize())
    {
      RDCLOG("printf buffer overflowed, needed %u bytes but printf buffer is only %u bytes",
             wordsNeeded * 4, Vulkan_Debug_PrintfBufferSize());
    }

    printfBuf++;

    // check the bound before dereferencing so we never read a header past the printf region
    while(printfBuf < printfBufEnd && *printfBuf)
    {
      // each message starts with a header word containing the stage and the printf ID
      ShaderStage stage = ShaderStage((*printfBuf) >> ShaderStageHeaderBitShift);
      uint32_t printfID = *printfBuf & 0xfffffffU;

      printfBuf++;

      if(stage < ShaderStage::Count)
      {
        auto it = printfData[(uint32_t)stage].find(printfID);
        if(it == printfData[(uint32_t)stage].end())
        {
          RDCERR("Error parsing DebugPrintf buffer, unexpected printf ID %x from header %x",
                 printfID, *printfBuf);
          break;
        }

        // the header is followed by the location words, then the printf payload
        uint32_t *location = printfBuf;

        printfBuf += 4;

        const PrintfData &fmt = it->second;

        ShaderPrintfArgs args(printfBuf, fmt);

        printfBuf += fmt.payloadWords;

        // this message overflowed, don't process it
        if(printfBuf >= printfBufEnd)
          break;

        ShaderMessage msg;

        msg.stage = stage;

        const VulkanCreationInfo::Pipeline::Shader &sh = pipeInfo.shaders[(uint32_t)stage];

        {
          VulkanCreationInfo::ShaderModule &mod = creationInfo.m_ShaderModule[sh.module];
          VulkanCreationInfo::ShaderModuleReflection &modrefl =
              mod.GetReflection(stage, sh.entryPoint, pipe.pipeline);
          modrefl.PopulateDisassembly(mod.spirv);

          // take a reference, there's no need to copy the whole map for every message
          const std::map<size_t, uint32_t> &instructionLines = modrefl.instructionLines;

          auto instit = instructionLines.find(printfID);
          if(instit != instructionLines.end())
            msg.disassemblyLine = (int32_t)instit->second;
          else
            msg.disassemblyLine = -1;
        }

        if(stage == ShaderStage::Compute)
        {
          for(int x = 0; x < 3; x++)
          {
            uint32_t threadDimX = sh.refl->dispatchThreadsDimension[x];
            msg.location.compute.workgroup[x] = location[x] / threadDimX;
            msg.location.compute.thread[x] = location[x] % threadDimX;
          }
        }
        else if(stage == ShaderStage::Task)
        {
          for(int x = 0; x < 3; x++)
          {
            uint32_t threadDimX = sh.refl->dispatchThreadsDimension[x];
            msg.location.mesh.taskGroup[x] = location[x] / threadDimX;
            msg.location.mesh.thread[x] = location[x] % threadDimX;
          }
        }
        else if(stage == ShaderStage::Vertex)
        {
          msg.location.vertex.vertexIndex = location[0];
          if(!(action->flags & ActionFlags::Indexed))
          {
            // for non-indexed draws get back to 0-based index
            msg.location.vertex.vertexIndex -= action->vertexOffset;
          }
          // go back to a 0-based instance index
          msg.location.vertex.instance = location[1] - action->instanceOffset;
          msg.location.vertex.view = location[2];
        }
        else if(stage == ShaderStage::Geometry)
        {
          msg.location.geometry.primitive = location[0];
          msg.location.geometry.view = location[1];
        }
        else if(stage == ShaderStage::Mesh)
        {
          for(int x = 0; x < 3; x++)
            msg.location.mesh.meshGroup[x] = location[x];

          // the flattened thread index is packed into the top 16 bits of the third word
          uint32_t meshThread = msg.location.mesh.meshGroup[2] >> 16U;
          msg.location.mesh.meshGroup[2] &= 0xffffu;

          msg.location.mesh.thread[0] = meshThread % sh.refl->dispatchThreadsDimension[0];
          msg.location.mesh.thread[1] = (meshThread / sh.refl->dispatchThreadsDimension[0]) %
                                        sh.refl->dispatchThreadsDimension[1];
          msg.location.mesh.thread[2] = meshThread / (sh.refl->dispatchThreadsDimension[0] *
                                                      sh.refl->dispatchThreadsDimension[1]);

          const VulkanCreationInfo::Pipeline::Shader &tasksh =
              pipeInfo.shaders[(uint32_t)ShaderStage::Task];

          if(tasksh.module == ResourceId())
          {
            msg.location.mesh.taskGroup = {ShaderMeshMessageLocation::NotUsed,
                                           ShaderMeshMessageLocation::NotUsed,
                                           ShaderMeshMessageLocation::NotUsed};
          }
          else
          {
            uint32_t taskGroup = location[3];

            msg.location.mesh.taskGroup[0] = taskGroup % tasksh.refl->dispatchThreadsDimension[0];
            msg.location.mesh.taskGroup[1] = (taskGroup / tasksh.refl->dispatchThreadsDimension[0]) %
                                             tasksh.refl->dispatchThreadsDimension[1];
            msg.location.mesh.taskGroup[2] = taskGroup / (tasksh.refl->dispatchThreadsDimension[0] *
                                                          tasksh.refl->dispatchThreadsDimension[1]);
          }
        }
        else
        {
          msg.location.pixel.x = location[0] >> 16U;
          msg.location.pixel.y = location[0] & 0xffff;
          msg.location.pixel.sample = location[1] >> 16U;
          msg.location.pixel.view = location[1] & 0xffff;
          msg.location.pixel.primitive = location[2];
          // an all-ones 16-bit sample is widened to the full 32-bit ~0U value
          if(msg.location.pixel.sample == (~0U >> 16U))
          {
            msg.location.pixel.sample = ~0U;
          }
        }

        msg.message = StringFormat::FmtArgs(fmt.effective_format.c_str(), args);

        if(!args.get_error().empty())
          msg.message = args.get_error() + " in \"" + fmt.user_format + "\"";

        result.messages.push_back(msg);
      }
      else
      {
        RDCERR("Error parsing DebugPrintf buffer, unexpected stage %x from header %x", stage,
               *printfBuf);
        break;
      }
    }
  }

  destroyTemporaries();

  return true;
}