void VulkanReplay::FetchVSOut()

in renderdoc/driver/vulkan/vk_postvs.cpp [4069:5477]
988 lines of code
176 McCabe index (conditional complexity)

void VulkanReplay::FetchVSOut(uint32_t eventId, VulkanRenderState &state)
{
  VulkanCreationInfo &creationInfo = m_pDriver->m_CreationInfo;

  const VulkanCreationInfo::Pipeline &pipeInfo = creationInfo.m_Pipeline[state.graphics.pipeline];

  const ActionDescription *action = m_pDriver->GetAction(eventId);

  const VulkanCreationInfo::ShaderModule &moduleInfo =
      creationInfo.m_ShaderModule[pipeInfo.shaders[0].module];

  ShaderReflection *refl = pipeInfo.shaders[0].refl;

  VulkanPostVSData &ret = m_PostVS.Data[eventId];

  // set defaults so that we don't try to fetch this output again if something goes wrong and the
  // same event is selected again
  {
    ret.vsout.buf = VK_NULL_HANDLE;
    ret.vsout.bufmem = VK_NULL_HANDLE;
    ret.vsout.instStride = 0;
    ret.vsout.vertStride = 0;
    ret.vsout.numViews = 1;
    ret.vsout.nearPlane = 0.0f;
    ret.vsout.farPlane = 0.0f;
    ret.vsout.useIndices = false;
    ret.vsout.hasPosOut = false;
    ret.vsout.flipY = false;
    ret.vsout.idxbuf = VK_NULL_HANDLE;
    ret.vsout.idxbufmem = VK_NULL_HANDLE;

    ret.vsout.topo = MakePrimitiveTopology(state.primitiveTopology, state.patchControlPoints);
  }

  // no outputs from this shader? unexpected but theoretically possible (dummy VS before
  // tessellation maybe). Just fill out an empty data set
  if(refl->outputSignature.empty())
    return;

  // we go through the driver for all these creations since they need to be properly
  // registered in order to be put in the partial replay state
  VkResult vkr = VK_SUCCESS;
  VkDevice dev = m_Device;

  VkDescriptorPool descpool = VK_NULL_HANDLE;
  rdcarray<VkDescriptorSetLayout> setLayouts;
  rdcarray<VkDescriptorSet> descSets;

  VkPipelineLayout pipeLayout = VK_NULL_HANDLE;

  StorageMode storageMode = Binding;

  if(m_pDriver->GetExtensions(NULL).ext_KHR_buffer_device_address)
  {
    storageMode = KHR_bda;
  }
  else if(m_pDriver->GetExtensions(NULL).ext_EXT_buffer_device_address)
  {
    storageMode = EXT_bda;

    if(!m_pDriver->GetDeviceEnabledFeatures().shaderInt64)
    {
      static bool warned = false;
      if(!warned)
      {
        warned = true;
        RDCLOG(
            "EXT_buffer_device_address is available but shaderInt64 isn't, falling back to binding "
            "storage mode");
      }
    }
  }

  if(Vulkan_Debug_DisableBufferDeviceAddress() ||
     m_pDriver->GetDriverInfo().BufferDeviceAddressBrokenDriver())
    storageMode = Binding;

  if(m_pDriver->GetDeviceProps().limits.maxPerStageDescriptorStorageBuffers - 2 <
     MeshOutputBufferArraySize)
  {
    RDCWARN("Default buffer descriptor array size %u is over device limit, clamping to %u",
            MeshOutputBufferArraySize,
            m_pDriver->GetDeviceProps().limits.maxPerStageDescriptorStorageBuffers - 2);

    MeshOutputBufferArraySize =
        m_pDriver->GetDeviceProps().limits.maxPerStageDescriptorStorageBuffers - 2;
  }

  for(size_t i = 0; i < refl->inputSignature.size(); i++)
  {
    if(refl->inputSignature[i].regIndex >= MeshOutputBufferArraySize)
    {
      ret.vsout.status = StringFormat::Fmt(
          "Input %s refers to attribute %u which is too large to be handled",
          refl->inputSignature[i].varName.c_str(), refl->inputSignature[i].regIndex);
      RDCERR("%s", ret.vsout.status.c_str());
      return;
    }
  }

  VkDescriptorSetLayoutBinding newBindings[] = {
      // output buffer
      {
          0,
          VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
          1,
          VK_SHADER_STAGE_COMPUTE_BIT,
          NULL,
      },
      // index buffer (if needed)
      {
          1,
          VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
          1,
          VK_SHADER_STAGE_COMPUTE_BIT,
          NULL,
      },
      // vertex buffers
      {
          2,
          VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
          MeshOutputBufferArraySize,
          VK_SHADER_STAGE_COMPUTE_BIT,
          NULL,
      },
  };
  RDCCOMPILE_ASSERT(ARRAY_COUNT(newBindings) == MeshOutputReservedBindings,
                    "MeshOutputReservedBindings is wrong");

  // the spec says only one push constant range may be used per stage, so at most one has
  // VERTEX_BIT. Find it, and make it COMPUTE_BIT
  VkPushConstantRange push;
  uint32_t numPush = 0;
  rdcarray<VkPushConstantRange> oldPush =
      creationInfo.m_PipelineLayout[pipeInfo.vertLayout].pushRanges;

  // ensure the push range is visible to the compute shader
  for(const VkPushConstantRange &range : oldPush)
  {
    if(range.stageFlags & VK_SHADER_STAGE_VERTEX_BIT)
    {
      push = range;
      push.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
      numPush = 1;
      break;
    }
  }

  if(storageMode == Binding)
  {
    // create a duplicate set of descriptor sets, all visible to compute, with bindings shifted to
    // account for new ones we need. This also copies the existing bindings into the new sets
    PatchReservedDescriptors(state.graphics, descpool, setLayouts, descSets,
                             VK_SHADER_STAGE_COMPUTE_BIT, newBindings, ARRAY_COUNT(newBindings));

    // if the pool failed due to limits, it will be NULL so bail now
    if(descpool == VK_NULL_HANDLE)
    {
      ret.vsout.status =
          "Couldn't allocate and patch compatible descriptors for vertex output fetch";
      return;
    }

    VkPipelineLayoutCreateInfo pipeLayoutInfo = {
        VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
        NULL,
        0,
        (uint32_t)setLayouts.size(),
        setLayouts.data(),
        numPush,
        &push,
    };

    vkr = m_pDriver->vkCreatePipelineLayout(dev, &pipeLayoutInfo, NULL, &pipeLayout);
    CheckVkResult(vkr);
  }
  else
  {
    // using BDA we don't need to add any new bindings but we *do* need to patch the descriptor set
    // layouts to be compute visible. However with update-after-bind descriptors in the mix we can't
    // always reliably do this, as making a copy of the descriptor sets can't be done (in general).
    //
    // To get around this we patch descriptor set layouts at create time so that COMPUTE_BIT is
    // present wherever VERTEX_BIT was, so we can use the application's descriptor sets and layouts

    const rdcarray<ResourceId> &sets =
        creationInfo.m_PipelineLayout[pipeInfo.vertLayout].descSetLayouts;

    setLayouts.reserve(sets.size());

    for(size_t i = 0; i < sets.size(); i++)
      setLayouts.push_back(GetResourceManager()->GetCurrentHandle<VkDescriptorSetLayout>(sets[i]));

    VkPipelineLayoutCreateInfo pipeLayoutInfo = {
        VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
        NULL,
        0,
        (uint32_t)setLayouts.size(),
        setLayouts.data(),
        numPush,
        &push,
    };

    vkr = m_pDriver->vkCreatePipelineLayout(dev, &pipeLayoutInfo, NULL, &pipeLayout);
    CheckVkResult(vkr);

    // clear the array because it's not needed after and we want to avoid releasing real resources
    setLayouts.clear();
  }

  VkBuffer meshBuffer = VK_NULL_HANDLE, readbackBuffer = VK_NULL_HANDLE;
  VkDeviceMemory meshMem = VK_NULL_HANDLE, readbackMem = VK_NULL_HANDLE;

  VkBuffer uniqIdxBuf = VK_NULL_HANDLE;
  VkDeviceMemory uniqIdxBufMem = VK_NULL_HANDLE;
  VkDescriptorBufferInfo uniqIdxBufDescriptor = {};

  VkBuffer rebasedIdxBuf = VK_NULL_HANDLE;
  VkDeviceMemory rebasedIdxBufMem = VK_NULL_HANDLE;

  uint32_t numVerts = action->numIndices;
  VkDeviceSize bufSize = 0;

  uint32_t numViews = 1;

  if(state.dynamicRendering.active)
  {
    numViews = RDCMAX(numViews, Log2Ceil(state.dynamicRendering.viewMask + 1));
  }
  else
  {
    const VulkanCreationInfo::RenderPass &rp = creationInfo.m_RenderPass[state.GetRenderPass()];

    if(state.subpass < rp.subpasses.size())
    {
      numViews = RDCMAX(numViews, (uint32_t)rp.subpasses[state.subpass].multiviews.size());
    }
    else
    {
      RDCERR("Subpass is out of bounds to renderpass creation info");
    }
  }

  uint32_t idxsize = state.ibuffer.bytewidth;
  if(idxsize == 0)
    idxsize = 4U;

  uint32_t maxIndex = RDCMAX(action->baseVertex, 0) + numVerts - 1;

  uint32_t maxInstance = action->instanceOffset + action->numInstances - 1;

  const VkMemoryAllocateFlagsInfo memFlags = {
      VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,
      NULL,
      VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT,
  };

  if(action->flags & ActionFlags::Indexed)
  {
    const bool restart = state.primRestartEnable != VK_FALSE;
    bytebuf idxdata;
    rdcarray<uint32_t> indices;
    uint8_t *idx8 = NULL;
    uint16_t *idx16 = NULL;
    uint32_t *idx32 = NULL;

    // fetch ibuffer
    if(state.ibuffer.buf != ResourceId())
      GetBufferData(state.ibuffer.buf, state.ibuffer.offs + action->indexOffset * idxsize,
                    uint64_t(action->numIndices) * idxsize, idxdata);

    // figure out what the maximum index could be, so we can clamp our index buffer to something
    // sane
    uint32_t maxIdx = 0;

    // if there are no active bindings assume the vertex shader is generating its own data
    // and don't clamp the indices
    if(state.vertexBindings.empty())
      maxIdx = ~0U;

    for(uint32_t vb = 0; vb < state.vertexBindings.size(); vb++)
    {
      // only vertex inputs (not instance inputs) count
      if(state.vertexBindings[vb].inputRate == VK_VERTEX_INPUT_RATE_VERTEX)
      {
        uint32_t b = state.vertexBindings[vb].binding;
        if(b >= state.vbuffers.size())
          continue;

        ResourceId buf = state.vbuffers[b].buf;
        VkDeviceSize offs = state.vbuffers[b].offs;

        VkDeviceSize bufsize = creationInfo.m_Buffer[buf].size;

        // the maximum valid index on this particular input is the one that reaches
        // the end of the buffer. The maximum valid index at all is the one that reads
        // off the end of ALL buffers (so we max it with any other maxindex value
        // calculated).
        if(state.vbuffers[b].stride > 0)
          maxIdx = RDCMAX(maxIdx, uint32_t((bufsize - offs) / state.vbuffers[b].stride));
      }
    }

    // in case the vertex buffers were set but had invalid stride (0), max with the number
    // of vertices too. This is fine since the max here is just a conservative limit
    maxIdx = RDCMAX(maxIdx, action->numIndices);

    // do ibuffer rebasing/remapping

    if(idxsize == 4)
      idx32 = (uint32_t *)&idxdata[0];
    else if(idxsize == 1)
      idx8 = (uint8_t *)&idxdata[0];
    else
      idx16 = (uint16_t *)&idxdata[0];

    // only read as many indices as were available in the buffer
    uint32_t numIndices = RDCMIN(uint32_t(idxdata.size() / idxsize), action->numIndices);

    uint32_t idxclamp = 0;
    if(action->baseVertex < 0)
      idxclamp = uint32_t(-action->baseVertex);

    // grab all unique vertex indices referenced
    for(uint32_t i = 0; i < numIndices; i++)
    {
      uint32_t i32 = 0;
      if(idx32)
        i32 = idx32[i];
      else if(idx16)
        i32 = uint32_t(idx16[i]);
      else if(idx8)
        i32 = uint32_t(idx8[i]);

      // apply baseVertex but clamp to 0 (don't allow index to become negative)
      if(i32 < idxclamp)
        i32 = 0;
      else if(action->baseVertex < 0)
        i32 -= idxclamp;
      else if(action->baseVertex > 0)
        i32 += action->baseVertex;

      // we clamp to maxIdx here, to avoid any invalid indices like 0xffffffff
      // from filtering through. Worst case we index to the end of the vertex
      // buffers which is generally much more reasonable
      i32 = RDCMIN(maxIdx, i32);

      // ignore primitive restart indices
      if(restart && i32 == (0xffffffff >> ((4 - idxsize) * 8)))
        continue;

      auto it = std::lower_bound(indices.begin(), indices.end(), i32);

      if(it != indices.end() && *it == i32)
        continue;

      indices.insert(it - indices.begin(), i32);
    }

    // if we read out of bounds, we'll also have a 0 index being referenced
    // (as 0 is read). Don't insert 0 if we already have 0 though
    if(numIndices < action->numIndices && (indices.empty() || indices[0] != 0))
      indices.insert(0, 0);

    maxIndex = indices.back();

    // set numVerts
    numVerts = (uint32_t)indices.size();

    // An index buffer could be something like: 500, 501, 502, 501, 503, 502
    // in which case we can't use the existing index buffer without filling 499 slots of vertex
    // data with padding. Instead we rebase the indices based on the smallest vertex so it becomes
    // 0, 1, 2, 1, 3, 2 and then that matches our stream-out'd buffer.
    //
    // Note that there could also be gaps, like: 500, 501, 502, 510, 511, 512
    // which would become 0, 1, 2, 3, 4, 5 and so the old index buffer would no longer be valid.
    // We just stream-out a tightly packed list of unique indices, and then remap the index buffer
    // so that what did point to 500 points to 0 (accounting for rebasing), and what did point
    // to 510 now points to 3 (accounting for the unique sort).

    // we use a map here since the indices may be sparse. Especially considering if an index
    // is 'invalid' like 0xcccccccc then we don't want an array of 3.4 billion entries.
    std::map<uint32_t, size_t> indexRemap;
    for(size_t i = 0; i < indices.size(); i++)
    {
      // by definition, this index will only appear once in indices[]
      indexRemap[indices[i]] = i;
    }

    // create buffer with unique 0-based indices
    VkBufferCreateInfo bufInfo = {
        VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        NULL,
        0,
        indices.size() * sizeof(uint32_t),
        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
    };

    // the flag is the same for KHR and EXT
    if(storageMode != Binding)
      bufInfo.usage |= VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT;

    vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &uniqIdxBuf);
    CheckVkResult(vkr);

    uniqIdxBufDescriptor.buffer = uniqIdxBuf;
    uniqIdxBufDescriptor.offset = 0;
    uniqIdxBufDescriptor.range = VK_WHOLE_SIZE;

    VkMemoryRequirements mrq = {0};
    m_pDriver->vkGetBufferMemoryRequirements(dev, uniqIdxBuf, &mrq);

    VkMemoryAllocateInfo allocInfo = {
        VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        NULL,
        mrq.size,
        m_pDriver->GetUploadMemoryIndex(mrq.memoryTypeBits),
    };

    if(storageMode == KHR_bda)
      allocInfo.pNext = &memFlags;

    vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &uniqIdxBufMem);

    if(vkr == VK_ERROR_OUT_OF_DEVICE_MEMORY || vkr == VK_ERROR_OUT_OF_HOST_MEMORY)
    {
      ret.vsout.status = StringFormat::Fmt("Failed to allocate %llu bytes", mrq.size);
      RDCERR("%s", ret.vsout.status.c_str());
      return;
    }

    CheckVkResult(vkr);

    vkr = m_pDriver->vkBindBufferMemory(dev, uniqIdxBuf, uniqIdxBufMem, 0);
    CheckVkResult(vkr);

    byte *idxData = NULL;
    vkr = m_pDriver->vkMapMemory(m_Device, uniqIdxBufMem, 0, VK_WHOLE_SIZE, 0, (void **)&idxData);
    CheckVkResult(vkr);
    if(vkr != VK_SUCCESS || !idxData)
    {
      if(!idxData)
      {
        RDCERR("Manually reporting failed memory map");
        CheckVkResult(VK_ERROR_MEMORY_MAP_FAILED);
      }
      ret.vsout.status = "Couldn't read back vertex output data from GPU";
      return;
    }

    memcpy(idxData, &indices[0], indices.size() * sizeof(uint32_t));

    VkMappedMemoryRange range = {
        VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, NULL, uniqIdxBufMem, 0, VK_WHOLE_SIZE,
    };

    vkr = m_pDriver->vkFlushMappedMemoryRanges(m_Device, 1, &range);
    CheckVkResult(vkr);

    m_pDriver->vkUnmapMemory(m_Device, uniqIdxBufMem);

    // rebase existing index buffer to point to the right elements in our stream-out'd
    // vertex buffer
    for(uint32_t i = 0; i < numIndices; i++)
    {
      uint32_t i32 = 0;
      if(idx32)
        i32 = idx32[i];
      else if(idx16)
        i32 = uint32_t(idx16[i]);
      else if(idx8)
        i32 = uint32_t(idx8[i]);

      // preserve primitive restart indices
      if(restart && i32 == (0xffffffff >> ((4 - idxsize) * 8)))
        continue;

      // apply baseVertex but clamp to 0 (don't allow index to become negative)
      if(i32 < idxclamp)
        i32 = 0;
      else if(action->baseVertex < 0)
        i32 -= idxclamp;
      else if(action->baseVertex > 0)
        i32 += action->baseVertex;

      if(idx32)
        idx32[i] = uint32_t(indexRemap[i32]);
      else if(idx16)
        idx16[i] = uint16_t(indexRemap[i32]);
      else if(idx8)
        idx8[i] = uint8_t(indexRemap[i32]);
    }

    bufInfo.size = RDCMAX((VkDeviceSize)64, (VkDeviceSize)idxdata.size());
    bufInfo.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;

    vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &rebasedIdxBuf);
    CheckVkResult(vkr);

    m_pDriver->vkGetBufferMemoryRequirements(dev, rebasedIdxBuf, &mrq);

    allocInfo.allocationSize = mrq.size;
    allocInfo.memoryTypeIndex = m_pDriver->GetUploadMemoryIndex(mrq.memoryTypeBits);

    vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &rebasedIdxBufMem);

    if(vkr == VK_ERROR_OUT_OF_DEVICE_MEMORY || vkr == VK_ERROR_OUT_OF_HOST_MEMORY)
    {
      RDCWARN("Failed to allocate %llu bytes for rebased index buffer", mrq.size);
      ret.vsout.status = StringFormat::Fmt("Failed to allocate %llu bytes", mrq.size);
      return;
    }

    CheckVkResult(vkr);

    vkr = m_pDriver->vkBindBufferMemory(dev, rebasedIdxBuf, rebasedIdxBufMem, 0);
    CheckVkResult(vkr);

    vkr = m_pDriver->vkMapMemory(m_Device, rebasedIdxBufMem, 0, VK_WHOLE_SIZE, 0, (void **)&idxData);
    CheckVkResult(vkr);
    if(vkr != VK_SUCCESS || !idxData)
    {
      if(!idxData)
      {
        RDCERR("Manually reporting failed memory map");
        CheckVkResult(VK_ERROR_MEMORY_MAP_FAILED);
      }
      ret.vsout.status = "Couldn't read back vertex output data from GPU";
      return;
    }

    memcpy(idxData, idxdata.data(), idxdata.size());

    VkMappedMemoryRange rebasedRange = {
        VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, NULL, rebasedIdxBufMem, 0, VK_WHOLE_SIZE,
    };

    vkr = m_pDriver->vkFlushMappedMemoryRanges(m_Device, 1, &rebasedRange);
    CheckVkResult(vkr);

    m_pDriver->vkUnmapMemory(m_Device, rebasedIdxBufMem);
  }

  uint32_t baseSpecConstant = 0;

  bytebuf specData;
  rdcarray<VkSpecializationMapEntry> specEntries;

  VkGraphicsPipelineCreateInfo pipeCreateInfo;

  // get pipeline create info
  m_pDriver->GetShaderCache()->MakeGraphicsPipelineInfo(pipeCreateInfo, state.graphics.pipeline);

  // copy over specialization info
  for(uint32_t s = 0; s < pipeCreateInfo.stageCount; s++)
  {
    if(pipeCreateInfo.pStages[s].stage == VK_SHADER_STAGE_VERTEX_BIT)
    {
      if(pipeCreateInfo.pStages[s].pSpecializationInfo)
      {
        specData.assign((const byte *)pipeCreateInfo.pStages[s].pSpecializationInfo->pData,
                        pipeCreateInfo.pStages[s].pSpecializationInfo->dataSize);
        specEntries.assign(pipeCreateInfo.pStages[s].pSpecializationInfo->pMapEntries,
                           pipeCreateInfo.pStages[s].pSpecializationInfo->mapEntryCount);
      }
      break;
    }
  }

  // don't overlap with existing pipeline constants
  for(const VkSpecializationMapEntry &specConst : specEntries)
    baseSpecConstant = RDCMAX(baseSpecConstant, specConst.constantID + 1);

  uint32_t bufStride = 0;
  rdcarray<uint32_t> modSpirv = moduleInfo.spirv.GetSPIRV();

  struct CompactedAttrBuffer
  {
    VkDeviceMemory mem;
    VkBuffer buf;
    VkDescriptorBufferInfo descriptor;
  };

  rdcarray<uint32_t> attrInstDivisor;
  rdcarray<CompactedAttrBuffer> vbuffers;
  vbuffers.resize(MeshOutputBufferArraySize);

  {
    rdcarray<VkWriteDescriptorSet> descWrites;
    descWrites.resize(MeshOutputBufferArraySize);
    uint32_t numWrites = 0;

    RDCASSERT(state.vertexAttributes.size() <= MeshOutputBufferArraySize);

    // we fetch the vertex buffer data up front here since there's a very high chance of either
    // overlap due to interleaved attributes, or no overlap and no wastage due to separate compact
    // attributes.
    rdcarray<bytebuf> origVBs;
    origVBs.reserve(16);

    for(uint32_t vb = 0; vb < state.vertexBindings.size(); vb++)
    {
      uint32_t binding = state.vertexBindings[vb].binding;
      if(binding >= state.vbuffers.size())
      {
        origVBs.push_back(bytebuf());
        continue;
      }

      VkDeviceSize offs = state.vbuffers[binding].offs;
      VkDeviceSize stride = state.vbuffers[binding].stride;
      uint64_t len = 0;

      if(state.vertexBindings[vb].inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
      {
        len = (uint64_t(maxInstance) + 1) * stride;

        offs += action->instanceOffset * stride;
      }
      else
      {
        len = (uint64_t(maxIndex) + 1) * stride;

        offs += action->vertexOffset * stride;
      }

      len = RDCMIN(len, state.vbuffers[binding].size);

      origVBs.push_back(bytebuf());
      if(state.vbuffers[binding].buf != ResourceId())
        GetBufferData(state.vbuffers[binding].buf, offs, len, origVBs.back());
    }

    for(uint32_t i = 0; i < state.vertexAttributes.size(); i++)
    {
      const VkVertexInputAttributeDescription2EXT &attrDesc = state.vertexAttributes[i];
      uint32_t attr = attrDesc.location;

      RDCASSERT(attr < 64);
      if(attr >= vbuffers.size())
      {
        RDCERR("Attribute index too high! Resize array.");
        continue;
      }

      uint32_t instDivisor = ~0U;
      size_t stride = 1;

      const byte *origVBBegin = NULL;
      const byte *origVBEnd = NULL;

      for(uint32_t vb = 0; vb < state.vertexBindings.size(); vb++)
      {
        const VkVertexInputBindingDescription2EXT &vbDesc = state.vertexBindings[vb];
        if(vbDesc.binding == attrDesc.binding)
        {
          origVBBegin = origVBs[vb].data() + attrDesc.offset;
          origVBEnd = origVBs[vb].data() + origVBs[vb].size();

          if(origVBs[vb].empty())
            origVBBegin = origVBEnd = NULL;

          stride = vbDesc.stride;
          if(vbDesc.inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
            instDivisor = vbDesc.divisor;
          else
            instDivisor = ~0U;
          break;
        }
      }

      if(attrDesc.binding < state.vbuffers.size())
        stride = (size_t)state.vbuffers[attrDesc.binding].stride;

      // in some limited cases, provided we added the UNIFORM_TEXEL_BUFFER usage bit, we could use
      // the original buffers here as-is and read out of them. However it is likely that the offset
      // is not a multiple of the minimum texel buffer offset for at least some of the buffers if
      // not all of them, so we simplify the code here by *always* reading back the vertex buffer
      // data and uploading a compacted version.

      // we also need to handle the case where the format is not natively supported as a texel
      // buffer.

      // we used to use expanded texel buffers (i.e. expand to uint4, float4, int4 etc from any
      // smaller format) but since we want to support buffer_device_address to avoid descriptor
      // patching entirely it's easier to have an SSBO-based path. For that reason we only upload
      // this data as 16-byte strided data and read it out of a uint4[] then bitcast to int4 or
      // float4. That way the uint4[] SSBO can be easily substituted for a buffer device address
      VkFormat origFormat = attrDesc.format;
      VkFormat expandedFormat = VK_FORMAT_R32G32B32A32_SFLOAT;

      if(Is64BitFormat(origFormat))
        expandedFormat = VK_FORMAT_R32G32B32A32_UINT;
      else if(IsUIntFormat(origFormat))
        expandedFormat = VK_FORMAT_R32G32B32A32_UINT;
      else if(IsSIntFormat(origFormat))
        expandedFormat = VK_FORMAT_R32G32B32A32_SINT;

      uint32_t origElemSize = GetByteSize(1, 1, 1, origFormat, 0);
      uint32_t elemSize = GetByteSize(1, 1, 1, expandedFormat, 0);

      // 64-bit values are packed as uvec2
      if(Is64BitFormat(origFormat))
        elemSize *= 2;

      // used for interpreting the original data, if we're upcasting
      ResourceFormat fmt = MakeResourceFormat(origFormat);

      {
        VkBufferCreateInfo bufInfo = {
            VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
            NULL,
            0,
            elemSize * (maxIndex + 1),
            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
        };

        if(instDivisor != ~0U)
          bufInfo.size = elemSize * (maxInstance + 1);

        // the flag is the same for KHR and EXT
        if(storageMode != Binding)
          bufInfo.usage |= VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT;

        vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &vbuffers[attr].buf);
        CheckVkResult(vkr);

        VkMemoryRequirements mrq = {0};
        m_pDriver->vkGetBufferMemoryRequirements(dev, vbuffers[attr].buf, &mrq);

        VkMemoryAllocateInfo allocInfo = {
            VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
            NULL,
            mrq.size,
            m_pDriver->GetUploadMemoryIndex(mrq.memoryTypeBits),
        };

        if(storageMode == KHR_bda)
          allocInfo.pNext = &memFlags;

        vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &vbuffers[attr].mem);

        if(vkr == VK_ERROR_OUT_OF_DEVICE_MEMORY || vkr == VK_ERROR_OUT_OF_HOST_MEMORY)
        {
          RDCWARN("Failed to allocate %llu bytes for patched vertex buffer", mrq.size);
          ret.vsout.status = StringFormat::Fmt("Failed to allocate %llu bytes", mrq.size);
          return;
        }

        CheckVkResult(vkr);

        vkr = m_pDriver->vkBindBufferMemory(dev, vbuffers[attr].buf, vbuffers[attr].mem, 0);
        CheckVkResult(vkr);

        byte *dst = NULL;
        vkr =
            m_pDriver->vkMapMemory(m_Device, vbuffers[attr].mem, 0, VK_WHOLE_SIZE, 0, (void **)&dst);
        CheckVkResult(vkr);
        if(vkr != VK_SUCCESS || !dst)
        {
          if(!dst)
          {
            RDCERR("Manually reporting failed memory map");
            CheckVkResult(VK_ERROR_MEMORY_MAP_FAILED);
          }
          ret.vsout.status = "Couldn't read back vertex output data from GPU";
          return;
        }

        const byte *dstBase = dst;
        (void)dstBase;

        const byte *dstEnd = dst + bufInfo.size;

        if(dst)
        {
          FloatVector defaultValue(0.0f, 0.0f, 0.0f, 1.0f);
          if(fmt.compType == CompType::UInt || fmt.compType == CompType::SInt || fmt.compCount == 4)
            defaultValue.w = 0.0f;

          const byte *src = origVBBegin;

          // fast memcpy compaction case for regular 32-bit types. Any type like R32G32B32 or so on
          // can be memcpy'd into place and read, since we discard any unused components and there's
          // no re-interpretation needed.
          if(fmt.type == ResourceFormatType::Regular && fmt.compByteWidth == 4)
          {
            size_t expandedComponentBytes = sizeof(FloatVector) - origElemSize;

            while(src < origVBEnd && dst < dstEnd)
            {
              if(expandedComponentBytes > 0)
                memcpy(dst + origElemSize, ((byte *)&defaultValue) + origElemSize,
                       expandedComponentBytes);
              memcpy(dst, src, origElemSize);

              // advance by the *destination* element size of 16 bytes
              dst += elemSize;
              src += stride;
            }

            // fill the rest with default values
            while(dst < dstEnd)
            {
              memcpy(dst, &defaultValue, sizeof(FloatVector));
              dst += elemSize;
            }
          }
          else
          {
            uint32_t zero = 0;

            // upcasting path
            if(Is64BitFormat(origFormat))
            {
              while(src < origVBEnd && dst < dstEnd)
              {
                // the 64-bit value (especially for doubles) is already in "packed uvec2" order,
                // with least significant 32-bits first, so we can copy directly
                memcpy(dst, src, sizeof(uint64_t) * fmt.compCount);
                dst += sizeof(uint64_t) * fmt.compCount;

                // fill up to *8* zeros not 4, since we're filling two for every component
                for(uint8_t c = fmt.compCount * 2; c < 8; c++)
                {
                  memcpy(dst, &zero, sizeof(uint32_t));
                  dst += sizeof(uint32_t);
                }

                src += stride;
              }
            }
            else if(IsUIntFormat(expandedFormat))
            {
              while(src < origVBEnd && dst < dstEnd)
              {
                uint32_t val = 0;

                const byte *s = src;

                uint8_t c = 0;
                for(; c < fmt.compCount; c++)
                {
                  if(fmt.compByteWidth == 1)
                    val = *s;
                  else if(fmt.compByteWidth == 2)
                    val = *(uint16_t *)s;
                  else if(fmt.compByteWidth == 4)
                    val = *(uint32_t *)s;

                  memcpy(dst, &val, sizeof(uint32_t));
                  dst += sizeof(uint32_t);
                  s += fmt.compByteWidth;
                }

                for(; c < 4; c++)
                {
                  memcpy(dst, &zero, sizeof(uint32_t));
                  dst += sizeof(uint32_t);
                }

                src += stride;
              }
            }
            else if(IsSIntFormat(expandedFormat))
            {
              while(src < origVBEnd && dst < dstEnd)
              {
                int32_t val = 0;

                const byte *s = src;

                uint8_t c = 0;
                for(; c < fmt.compCount; c++)
                {
                  if(fmt.compByteWidth == 1)
                    val = *(int8_t *)s;
                  else if(fmt.compByteWidth == 2)
                    val = *(int16_t *)s;
                  else if(fmt.compByteWidth == 4)
                    val = *(int32_t *)s;

                  memcpy(dst, &val, sizeof(int32_t));
                  dst += sizeof(int32_t);
                  s += fmt.compByteWidth;
                }

                for(; c < 4; c++)
                {
                  memcpy(dst, &zero, sizeof(uint32_t));
                  dst += sizeof(uint32_t);
                }

                src += stride;
              }
            }
            else
            {
              while(src < origVBEnd && dst < dstEnd)
              {
                bool valid = false;
                FloatVector vec = HighlightCache::InterpretVertex(src, 0, 0, fmt, origVBEnd, valid);

                memcpy(dst, &vec, sizeof(FloatVector));
                dst += sizeof(FloatVector);
                src += stride;
              }

              // fill the rest with default values
              while(dst < dstEnd)
              {
                memcpy(dst, &defaultValue, sizeof(FloatVector));
                dst += elemSize;
              }
            }
          }
        }

        VkMappedMemoryRange range = {
            VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, NULL, vbuffers[attr].mem, 0, VK_WHOLE_SIZE,
        };

        vkr = m_pDriver->vkFlushMappedMemoryRanges(m_Device, 1, &range);
        CheckVkResult(vkr);

        m_pDriver->vkUnmapMemory(m_Device, vbuffers[attr].mem);
      }

      attrInstDivisor.resize(RDCMAX(attrInstDivisor.size(), size_t(attr + 1)));
      attrInstDivisor[attr] = instDivisor;

      vbuffers[attr].descriptor.buffer = vbuffers[attr].buf;
      vbuffers[attr].descriptor.offset = 0;
      vbuffers[attr].descriptor.range = VK_WHOLE_SIZE;

      if(!descSets.empty())
      {
        descWrites[numWrites].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
        descWrites[numWrites].dstSet = descSets[0];
        descWrites[numWrites].dstBinding = 2;
        descWrites[numWrites].dstArrayElement = attr;
        descWrites[numWrites].descriptorCount = 1;
        descWrites[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
        descWrites[numWrites].pBufferInfo = &vbuffers[attr].descriptor;
        numWrites++;
      }
    }

    // add a write of the index buffer
    if(uniqIdxBuf != VK_NULL_HANDLE && !descSets.empty())
    {
      descWrites[numWrites].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
      descWrites[numWrites].dstSet = descSets[0];
      descWrites[numWrites].dstBinding = 1;
      descWrites[numWrites].dstArrayElement = 0;
      descWrites[numWrites].descriptorCount = 1;
      descWrites[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
      descWrites[numWrites].pBufferInfo = &uniqIdxBufDescriptor;
      numWrites++;
    }

    if(numWrites > 0)
      m_pDriver->vkUpdateDescriptorSets(dev, numWrites, descWrites.data(), 0, NULL);
  }

  if(!Vulkan_Debug_PostVSDumpDirPath().empty())
    FileIO::WriteAll(Vulkan_Debug_PostVSDumpDirPath() + "/debug_postvs_vert.spv", modSpirv);

  ConvertToMeshOutputCompute(*refl, *pipeInfo.shaders[0].patchData, pipeInfo.shaders[0].entryPoint,
                             storageMode, attrInstDivisor, action, numVerts, numViews,
                             baseSpecConstant, modSpirv, bufStride);

  if(!Vulkan_Debug_PostVSDumpDirPath().empty())
    FileIO::WriteAll(Vulkan_Debug_PostVSDumpDirPath() + "/debug_postvs_comp.spv", modSpirv);

  {
    // now that we know the stride, create buffer of sufficient size
    // this can't just be bufStride * num unique indices per instance, as we don't
    // have a compact 0-based index to index into the buffer. We must use
    // index-minIndex which is 0-based but potentially sparse, so this buffer may
    // be more or less wasteful
    VkBufferCreateInfo bufInfo = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};

    // set bufSize
    bufSize = bufInfo.size = uint64_t(numVerts) * uint64_t(action->numInstances) *
                             uint64_t(bufStride) * uint64_t(numViews);

    bufInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
    bufInfo.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
    bufInfo.usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
    bufInfo.usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;

    // the flag is the same for KHR and EXT
    if(storageMode != Binding)
      bufInfo.usage |= VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT;

    vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &meshBuffer);
    CheckVkResult(vkr);

    bufInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;

    vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &readbackBuffer);
    CheckVkResult(vkr);

    VkMemoryRequirements mrq = {0};
    m_pDriver->vkGetBufferMemoryRequirements(dev, meshBuffer, &mrq);

    if(mrq.size > m_pDriver->GetMaxMemoryAllocationSize())
    {
      ret.vsout.status = StringFormat::Fmt("OOM %llu bytes Max %llu bytes", mrq.size,
                                           m_pDriver->GetMaxMemoryAllocationSize());
      return;
    }

    VkMemoryAllocateInfo allocInfo = {
        VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        NULL,
        mrq.size,
        m_pDriver->GetGPULocalMemoryIndex(mrq.memoryTypeBits),
    };

    if(storageMode == KHR_bda)
      allocInfo.pNext = &memFlags;

    vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &meshMem);

    if(vkr == VK_ERROR_OUT_OF_DEVICE_MEMORY || vkr == VK_ERROR_OUT_OF_HOST_MEMORY)
    {
      RDCWARN("Failed to allocate %llu bytes for output vertex SSBO", mrq.size);
      ret.vsout.status = StringFormat::Fmt("Failed to allocate %llu bytes", mrq.size);
      return;
    }

    CheckVkResult(vkr);

    vkr = m_pDriver->vkBindBufferMemory(dev, meshBuffer, meshMem, 0);
    CheckVkResult(vkr);

    m_pDriver->vkGetBufferMemoryRequirements(dev, readbackBuffer, &mrq);

    allocInfo.pNext = NULL;
    allocInfo.memoryTypeIndex = m_pDriver->GetReadbackMemoryIndex(mrq.memoryTypeBits);

    vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &readbackMem);

    if(vkr == VK_ERROR_OUT_OF_DEVICE_MEMORY || vkr == VK_ERROR_OUT_OF_HOST_MEMORY)
    {
      RDCWARN("Failed to allocate %llu bytes for readback memory", mrq.size);
      ret.vsout.status = StringFormat::Fmt("Failed to allocate %llu bytes", mrq.size);
      return;
    }

    CheckVkResult(vkr);

    vkr = m_pDriver->vkBindBufferMemory(dev, readbackBuffer, readbackMem, 0);
    CheckVkResult(vkr);
  }

  VkComputePipelineCreateInfo compPipeInfo = {VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO};

  // repoint pipeline layout
  compPipeInfo.layout = pipeLayout;

  // create vertex shader with modified code
  VkShaderModuleCreateInfo moduleCreateInfo = {
      VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, NULL,         0,
      modSpirv.size() * sizeof(uint32_t),          &modSpirv[0],
  };

  VkShaderModule module;
  vkr = m_pDriver->vkCreateShaderModule(dev, &moduleCreateInfo, NULL, &module);
  CheckVkResult(vkr);

  compPipeInfo.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
  compPipeInfo.stage.module = module;
  compPipeInfo.stage.pName = PatchedMeshOutputEntryPoint;
  compPipeInfo.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;

  // append our own if we're using BDA
  if(storageMode != Binding)
  {
    // ensure we're 64-bit aligned first
    specData.resize(AlignUp(specData.size(), (size_t)8));

    uint32_t baseOffset = (uint32_t)specData.size();

    rdcarray<uint64_t> addresses;
    addresses.resize(MeshOutputBufferArraySize + 2);

    for(uint32_t i = 0; i <= MeshOutputBufferArraySize + 1; i++)
    {
      RDCCOMPILE_ASSERT(VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO ==
                            VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT,
                        "KHR and EXT buffer_device_address should be interchangeable here.");
      VkBufferDeviceAddressInfo getAddressInfo = {VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO};

      if(i < MeshOutputBufferArraySize)
        getAddressInfo.buffer = vbuffers[i].buf;
      else if(i == MeshOutputBufferArraySize)
        getAddressInfo.buffer = uniqIdxBuf;
      else if(i == MeshOutputBufferArraySize + 1)
        getAddressInfo.buffer = meshBuffer;

      // skip
      if(getAddressInfo.buffer == VK_NULL_HANDLE)
        continue;

      if(storageMode == KHR_bda)
        addresses[i] = m_pDriver->vkGetBufferDeviceAddress(dev, &getAddressInfo);
      else
        addresses[i] = m_pDriver->vkGetBufferDeviceAddressEXT(dev, &getAddressInfo);

      VkSpecializationMapEntry entry;
      entry.offset = baseOffset + i * sizeof(uint64_t);
      entry.constantID = baseSpecConstant + i * 2 + 0;

      // for EXT we have one 64-bit spec constant per address, for KHR we have a uvec2 - two
      // constants
      if(storageMode == EXT_bda)
      {
        entry.size = sizeof(uint64_t);
        specEntries.push_back(entry);
      }
      else
      {
        entry.size = sizeof(uint32_t);
        specEntries.push_back(entry);

        entry.offset += sizeof(uint32_t);
        entry.constantID++;

        entry.size = sizeof(uint32_t);
        specEntries.push_back(entry);
      }
    }

    specData.append((const byte *)addresses.data(), addresses.byteSize());
  }

  VkSpecializationInfo specInfo = {};
  specInfo.dataSize = specData.size();
  specInfo.pData = specData.data();
  specInfo.mapEntryCount = (uint32_t)specEntries.size();
  specInfo.pMapEntries = specEntries.data();

  compPipeInfo.stage.pSpecializationInfo = &specInfo;

  // create new pipeline
  VkPipeline pipe;
  vkr = m_pDriver->vkCreateComputePipelines(m_Device, VK_NULL_HANDLE, 1, &compPipeInfo, NULL, &pipe);

  if(vkr != VK_SUCCESS)
  {
    ret.vsout.status =
        StringFormat::Fmt("Failed to create patched compute pipeline: %s", ToStr(vkr).c_str());
    RDCERR("%s", ret.vsout.status.c_str());
    return;
  }

  // make copy of state to draw from
  VulkanRenderState modifiedstate = state;

  // bind created pipeline to partial replay state
  modifiedstate.compute.pipeline = GetResID(pipe);

  // move graphics descriptor sets onto the compute pipe.
  modifiedstate.compute.descSets = modifiedstate.graphics.descSets;

  if(!descSets.empty())
  {
    // replace descriptor set IDs with our temporary sets. The offsets we keep the same. If the
    // original draw had no sets, we ensure there's room (with no offsets needed)
    if(modifiedstate.compute.descSets.empty())
      modifiedstate.compute.descSets.resize(1);

    for(size_t i = 0; i < descSets.size(); i++)
    {
      modifiedstate.compute.descSets[i].pipeLayout = GetResID(pipeLayout);
      modifiedstate.compute.descSets[i].descSet = GetResID(descSets[i]);
    }
  }
  else
  {
    for(size_t i = 0; i < modifiedstate.compute.descSets.size(); i++)
      modifiedstate.compute.descSets[i].pipeLayout = GetResID(pipeLayout);
  }

  {
    VkCommandBuffer cmd = m_pDriver->GetNextCmd();

    if(cmd == VK_NULL_HANDLE)
      return;

    VkCommandBufferBeginInfo beginInfo = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, NULL,
                                          VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT};

    vkr = ObjDisp(dev)->BeginCommandBuffer(Unwrap(cmd), &beginInfo);
    CheckVkResult(vkr);

    // fill destination buffer with 0s to ensure unwritten vertices have sane data
    ObjDisp(dev)->CmdFillBuffer(Unwrap(cmd), Unwrap(meshBuffer), 0, bufSize, 0);

    VkBufferMemoryBarrier meshbufbarrier = {
        VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
        NULL,
        VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT,
        VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
        VK_QUEUE_FAMILY_IGNORED,
        VK_QUEUE_FAMILY_IGNORED,
    };

    meshbufbarrier.size = VK_WHOLE_SIZE;

    VkMemoryBarrier globalbarrier = {
        VK_STRUCTURE_TYPE_MEMORY_BARRIER,
        NULL,
        VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT,
        VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
    };

    // wait for uploads of index buffer (if used), compacted vertex buffers, and the above fill to
    // finish.
    DoPipelineBarrier(cmd, 1, &globalbarrier);

    // vkUpdateDescriptorSet desc set to point to buffer
    VkDescriptorBufferInfo fetchdesc = {0};
    fetchdesc.buffer = meshBuffer;
    fetchdesc.offset = 0;
    fetchdesc.range = bufSize;

    if(!descSets.empty())
    {
      VkWriteDescriptorSet write = {
          VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, NULL, descSets[0], 0,   0, 1,
          VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,      NULL, &fetchdesc,  NULL};
      m_pDriver->vkUpdateDescriptorSets(dev, 1, &write, 0, NULL);
    }

    // do single draw
    modifiedstate.BindPipeline(m_pDriver, cmd, VulkanRenderState::BindCompute, true);
    uint64_t totalVerts = numVerts * uint64_t(action->numInstances) * uint64_t(numViews);

    // the validation layers will probably complain about this dispatch saying some arrays aren't
    // fully updated. That's because they don't statically analyse that only fixed indices are
    // referred to. It's safe to leave unused array indices as invalid descriptors.
    ObjDisp(cmd)->CmdDispatch(Unwrap(cmd), uint32_t(totalVerts / MeshOutputDispatchWidth) + 1, 1, 1);

    // wait for mesh output writing to finish
    meshbufbarrier.buffer = Unwrap(meshBuffer);
    meshbufbarrier.size = bufSize;
    meshbufbarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
    meshbufbarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;

    DoPipelineBarrier(cmd, 1, &meshbufbarrier);

    VkBufferCopy bufcopy = {
        0,
        0,
        bufSize,
    };

    // copy to readback buffer
    ObjDisp(dev)->CmdCopyBuffer(Unwrap(cmd), Unwrap(meshBuffer), Unwrap(readbackBuffer), 1, &bufcopy);

    meshbufbarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    meshbufbarrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
    meshbufbarrier.buffer = Unwrap(readbackBuffer);

    // wait for copy to finish
    DoPipelineBarrier(cmd, 1, &meshbufbarrier);

    vkr = ObjDisp(dev)->EndCommandBuffer(Unwrap(cmd));
    CheckVkResult(vkr);

    // submit & flush so that we don't have to keep pipeline around for a while
    m_pDriver->SubmitCmds();
    m_pDriver->FlushQ();
  }

  for(CompactedAttrBuffer attrBuf : vbuffers)
  {
    m_pDriver->vkDestroyBuffer(dev, attrBuf.buf, NULL);
    m_pDriver->vkFreeMemory(dev, attrBuf.mem, NULL);
  }

  // readback mesh data
  byte *byteData = NULL;
  vkr = m_pDriver->vkMapMemory(m_Device, readbackMem, 0, VK_WHOLE_SIZE, 0, (void **)&byteData);
  CheckVkResult(vkr);
  if(vkr != VK_SUCCESS || !byteData)
  {
    if(!byteData)
    {
      RDCERR("Manually reporting failed memory map");
      CheckVkResult(VK_ERROR_MEMORY_MAP_FAILED);
    }
    ret.vsout.status = "Couldn't read back vertex output data from GPU";
    return;
  }

  VkMappedMemoryRange range = {
      VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, NULL, readbackMem, 0, VK_WHOLE_SIZE,
  };

  vkr = m_pDriver->vkInvalidateMappedMemoryRanges(m_Device, 1, &range);
  CheckVkResult(vkr);

  // do near/far calculations

  float nearp = 0.1f;
  float farp = 100.0f;

  Vec4f *pos0 = (Vec4f *)byteData;

  bool found = false;

  // expect position at the start of the buffer, as system values are sorted first
  // and position is the first value

  for(uint32_t i = 1;
      refl->outputSignature[0].systemValue == ShaderBuiltin::Position && !found && i < numVerts; i++)
  {
    Vec4f *pos = (Vec4f *)(byteData + i * bufStride);

    DeriveNearFar(*pos, *pos0, nearp, farp, found);

    if(found)
      break;
  }

  // if we didn't find anything, all z's and w's were identical.
  // If the z is positive and w greater for the first element then
  // we detect this projection as reversed z with infinite far plane
  if(!found && pos0->z > 0.0f && pos0->w > pos0->z)
  {
    nearp = pos0->z;
    farp = FLT_MAX;
  }

  m_pDriver->vkUnmapMemory(m_Device, readbackMem);

  // clean up temporary memories
  m_pDriver->vkDestroyBuffer(m_Device, readbackBuffer, NULL);
  m_pDriver->vkFreeMemory(m_Device, readbackMem, NULL);

  if(uniqIdxBuf != VK_NULL_HANDLE)
  {
    m_pDriver->vkDestroyBuffer(m_Device, uniqIdxBuf, NULL);
    m_pDriver->vkFreeMemory(m_Device, uniqIdxBufMem, NULL);
  }

  // fill out m_PostVS.Data
  ret.vsout.topo = MakePrimitiveTopology(state.primitiveTopology, state.patchControlPoints);
  ret.vsout.buf = meshBuffer;
  ret.vsout.bufmem = meshMem;

  ret.vsout.baseVertex = 0;

  ret.vsout.numViews = numViews;

  ret.vsout.vertStride = bufStride;
  ret.vsout.nearPlane = nearp;
  ret.vsout.farPlane = farp;

  ret.vsout.useIndices = bool(action->flags & ActionFlags::Indexed);
  ret.vsout.numVerts = action->numIndices;

  ret.vsout.instStride = 0;
  if(action->flags & ActionFlags::Instanced)
    ret.vsout.instStride = uint32_t(bufSize / (action->numInstances * numViews));

  ret.vsout.idxbuf = VK_NULL_HANDLE;
  if(ret.vsout.useIndices && state.ibuffer.buf != ResourceId())
  {
    VkIndexType type = VK_INDEX_TYPE_UINT16;
    if(idxsize == 4)
      type = VK_INDEX_TYPE_UINT32;
    else if(idxsize == 1)
      type = VK_INDEX_TYPE_UINT8_KHR;

    ret.vsout.idxbuf = rebasedIdxBuf;
    ret.vsout.idxbufmem = rebasedIdxBufMem;
    ret.vsout.idxFmt = type;
  }

  ret.vsout.hasPosOut = refl->outputSignature[0].systemValue == ShaderBuiltin::Position;
  ret.vsout.flipY = state.views.empty() ? false : state.views[0].height < 0.0f;

  if(descpool != VK_NULL_HANDLE)
  {
    // delete descriptors. Technically we don't have to free the descriptor sets, but our tracking
    // on replay doesn't handle destroying children of pooled objects so we do it explicitly anyway.
    m_pDriver->vkFreeDescriptorSets(dev, descpool, (uint32_t)descSets.size(), descSets.data());

    m_pDriver->vkDestroyDescriptorPool(dev, descpool, NULL);

    for(VkDescriptorSetLayout layout : setLayouts)
      m_pDriver->vkDestroyDescriptorSetLayout(dev, layout, NULL);
  }

  // delete pipeline layout
  m_pDriver->vkDestroyPipelineLayout(dev, pipeLayout, NULL);

  // delete pipeline
  m_pDriver->vkDestroyPipeline(dev, pipe, NULL);

  // delete shader/shader module
  m_pDriver->vkDestroyShaderModule(dev, module, NULL);
}