in renderdoc/driver/vulkan/vk_postvs.cpp [4069:5477]
void VulkanReplay::FetchVSOut(uint32_t eventId, VulkanRenderState &state)
{
VulkanCreationInfo &creationInfo = m_pDriver->m_CreationInfo;
const VulkanCreationInfo::Pipeline &pipeInfo = creationInfo.m_Pipeline[state.graphics.pipeline];
const ActionDescription *action = m_pDriver->GetAction(eventId);
const VulkanCreationInfo::ShaderModule &moduleInfo =
creationInfo.m_ShaderModule[pipeInfo.shaders[0].module];
ShaderReflection *refl = pipeInfo.shaders[0].refl;
VulkanPostVSData &ret = m_PostVS.Data[eventId];
// set defaults so that we don't try to fetch this output again if something goes wrong and the
// same event is selected again
{
ret.vsout.buf = VK_NULL_HANDLE;
ret.vsout.bufmem = VK_NULL_HANDLE;
ret.vsout.instStride = 0;
ret.vsout.vertStride = 0;
ret.vsout.numViews = 1;
ret.vsout.nearPlane = 0.0f;
ret.vsout.farPlane = 0.0f;
ret.vsout.useIndices = false;
ret.vsout.hasPosOut = false;
ret.vsout.flipY = false;
ret.vsout.idxbuf = VK_NULL_HANDLE;
ret.vsout.idxbufmem = VK_NULL_HANDLE;
ret.vsout.topo = MakePrimitiveTopology(state.primitiveTopology, state.patchControlPoints);
}
// no outputs from this shader? unexpected but theoretically possible (dummy VS before
// tessellation maybe). Just fill out an empty data set
if(refl->outputSignature.empty())
return;
// we go through the driver for all these creations since they need to be properly
// registered in order to be put in the partial replay state
VkResult vkr = VK_SUCCESS;
VkDevice dev = m_Device;
VkDescriptorPool descpool = VK_NULL_HANDLE;
rdcarray<VkDescriptorSetLayout> setLayouts;
rdcarray<VkDescriptorSet> descSets;
VkPipelineLayout pipeLayout = VK_NULL_HANDLE;
StorageMode storageMode = Binding;
if(m_pDriver->GetExtensions(NULL).ext_KHR_buffer_device_address)
{
storageMode = KHR_bda;
}
else if(m_pDriver->GetExtensions(NULL).ext_EXT_buffer_device_address)
{
storageMode = EXT_bda;
if(!m_pDriver->GetDeviceEnabledFeatures().shaderInt64)
{
static bool warned = false;
if(!warned)
{
warned = true;
RDCLOG(
"EXT_buffer_device_address is available but shaderInt64 isn't, falling back to binding "
"storage mode");
}
}
}
if(Vulkan_Debug_DisableBufferDeviceAddress() ||
m_pDriver->GetDriverInfo().BufferDeviceAddressBrokenDriver())
storageMode = Binding;
if(m_pDriver->GetDeviceProps().limits.maxPerStageDescriptorStorageBuffers - 2 <
MeshOutputBufferArraySize)
{
RDCWARN("Default buffer descriptor array size %u is over device limit, clamping to %u",
MeshOutputBufferArraySize,
m_pDriver->GetDeviceProps().limits.maxPerStageDescriptorStorageBuffers - 2);
MeshOutputBufferArraySize =
m_pDriver->GetDeviceProps().limits.maxPerStageDescriptorStorageBuffers - 2;
}
for(size_t i = 0; i < refl->inputSignature.size(); i++)
{
if(refl->inputSignature[i].regIndex >= MeshOutputBufferArraySize)
{
ret.vsout.status = StringFormat::Fmt(
"Input %s refers to attribute %u which is too large to be handled",
refl->inputSignature[i].varName.c_str(), refl->inputSignature[i].regIndex);
RDCERR("%s", ret.vsout.status.c_str());
return;
}
}
VkDescriptorSetLayoutBinding newBindings[] = {
// output buffer
{
0,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1,
VK_SHADER_STAGE_COMPUTE_BIT,
NULL,
},
// index buffer (if needed)
{
1,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
1,
VK_SHADER_STAGE_COMPUTE_BIT,
NULL,
},
// vertex buffers
{
2,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
MeshOutputBufferArraySize,
VK_SHADER_STAGE_COMPUTE_BIT,
NULL,
},
};
RDCCOMPILE_ASSERT(ARRAY_COUNT(newBindings) == MeshOutputReservedBindings,
"MeshOutputReservedBindings is wrong");
// the spec says only one push constant range may be used per stage, so at most one has
// VERTEX_BIT. Find it, and make it COMPUTE_BIT
VkPushConstantRange push;
uint32_t numPush = 0;
rdcarray<VkPushConstantRange> oldPush =
creationInfo.m_PipelineLayout[pipeInfo.vertLayout].pushRanges;
// ensure the push range is visible to the compute shader
for(const VkPushConstantRange &range : oldPush)
{
if(range.stageFlags & VK_SHADER_STAGE_VERTEX_BIT)
{
push = range;
push.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
numPush = 1;
break;
}
}
if(storageMode == Binding)
{
// create a duplicate set of descriptor sets, all visible to compute, with bindings shifted to
// account for new ones we need. This also copies the existing bindings into the new sets
PatchReservedDescriptors(state.graphics, descpool, setLayouts, descSets,
VK_SHADER_STAGE_COMPUTE_BIT, newBindings, ARRAY_COUNT(newBindings));
// if the pool failed due to limits, it will be NULL so bail now
if(descpool == VK_NULL_HANDLE)
{
ret.vsout.status =
"Couldn't allocate and patch compatible descriptors for vertex output fetch";
return;
}
VkPipelineLayoutCreateInfo pipeLayoutInfo = {
VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
NULL,
0,
(uint32_t)setLayouts.size(),
setLayouts.data(),
numPush,
&push,
};
vkr = m_pDriver->vkCreatePipelineLayout(dev, &pipeLayoutInfo, NULL, &pipeLayout);
CheckVkResult(vkr);
}
else
{
// using BDA we don't need to add any new bindings but we *do* need to patch the descriptor set
// layouts to be compute visible. However with update-after-bind descriptors in the mix we can't
// always reliably do this, as making a copy of the descriptor sets can't be done (in general).
//
// To get around this we patch descriptor set layouts at create time so that COMPUTE_BIT is
// present wherever VERTEX_BIT was, so we can use the application's descriptor sets and layouts
const rdcarray<ResourceId> &sets =
creationInfo.m_PipelineLayout[pipeInfo.vertLayout].descSetLayouts;
setLayouts.reserve(sets.size());
for(size_t i = 0; i < sets.size(); i++)
setLayouts.push_back(GetResourceManager()->GetCurrentHandle<VkDescriptorSetLayout>(sets[i]));
VkPipelineLayoutCreateInfo pipeLayoutInfo = {
VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
NULL,
0,
(uint32_t)setLayouts.size(),
setLayouts.data(),
numPush,
&push,
};
vkr = m_pDriver->vkCreatePipelineLayout(dev, &pipeLayoutInfo, NULL, &pipeLayout);
CheckVkResult(vkr);
// clear the array because it's not needed after and we want to avoid releasing real resources
setLayouts.clear();
}
VkBuffer meshBuffer = VK_NULL_HANDLE, readbackBuffer = VK_NULL_HANDLE;
VkDeviceMemory meshMem = VK_NULL_HANDLE, readbackMem = VK_NULL_HANDLE;
VkBuffer uniqIdxBuf = VK_NULL_HANDLE;
VkDeviceMemory uniqIdxBufMem = VK_NULL_HANDLE;
VkDescriptorBufferInfo uniqIdxBufDescriptor = {};
VkBuffer rebasedIdxBuf = VK_NULL_HANDLE;
VkDeviceMemory rebasedIdxBufMem = VK_NULL_HANDLE;
uint32_t numVerts = action->numIndices;
VkDeviceSize bufSize = 0;
uint32_t numViews = 1;
if(state.dynamicRendering.active)
{
numViews = RDCMAX(numViews, Log2Ceil(state.dynamicRendering.viewMask + 1));
}
else
{
const VulkanCreationInfo::RenderPass &rp = creationInfo.m_RenderPass[state.GetRenderPass()];
if(state.subpass < rp.subpasses.size())
{
numViews = RDCMAX(numViews, (uint32_t)rp.subpasses[state.subpass].multiviews.size());
}
else
{
RDCERR("Subpass is out of bounds to renderpass creation info");
}
}
uint32_t idxsize = state.ibuffer.bytewidth;
if(idxsize == 0)
idxsize = 4U;
uint32_t maxIndex = RDCMAX(action->baseVertex, 0) + numVerts - 1;
uint32_t maxInstance = action->instanceOffset + action->numInstances - 1;
const VkMemoryAllocateFlagsInfo memFlags = {
VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO,
NULL,
VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT,
};
if(action->flags & ActionFlags::Indexed)
{
const bool restart = state.primRestartEnable != VK_FALSE;
bytebuf idxdata;
rdcarray<uint32_t> indices;
uint8_t *idx8 = NULL;
uint16_t *idx16 = NULL;
uint32_t *idx32 = NULL;
// fetch ibuffer
if(state.ibuffer.buf != ResourceId())
GetBufferData(state.ibuffer.buf, state.ibuffer.offs + action->indexOffset * idxsize,
uint64_t(action->numIndices) * idxsize, idxdata);
// figure out what the maximum index could be, so we can clamp our index buffer to something
// sane
uint32_t maxIdx = 0;
// if there are no active bindings assume the vertex shader is generating its own data
// and don't clamp the indices
if(state.vertexBindings.empty())
maxIdx = ~0U;
for(uint32_t vb = 0; vb < state.vertexBindings.size(); vb++)
{
// only vertex inputs (not instance inputs) count
if(state.vertexBindings[vb].inputRate == VK_VERTEX_INPUT_RATE_VERTEX)
{
uint32_t b = state.vertexBindings[vb].binding;
if(b >= state.vbuffers.size())
continue;
ResourceId buf = state.vbuffers[b].buf;
VkDeviceSize offs = state.vbuffers[b].offs;
VkDeviceSize bufsize = creationInfo.m_Buffer[buf].size;
// the maximum valid index on this particular input is the one that reaches
// the end of the buffer. The maximum valid index at all is the one that reads
// off the end of ALL buffers (so we max it with any other maxindex value
// calculated).
if(state.vbuffers[b].stride > 0)
maxIdx = RDCMAX(maxIdx, uint32_t((bufsize - offs) / state.vbuffers[b].stride));
}
}
// in case the vertex buffers were set but had invalid stride (0), max with the number
// of vertices too. This is fine since the max here is just a conservative limit
maxIdx = RDCMAX(maxIdx, action->numIndices);
// do ibuffer rebasing/remapping
if(idxsize == 4)
idx32 = (uint32_t *)&idxdata[0];
else if(idxsize == 1)
idx8 = (uint8_t *)&idxdata[0];
else
idx16 = (uint16_t *)&idxdata[0];
// only read as many indices as were available in the buffer
uint32_t numIndices = RDCMIN(uint32_t(idxdata.size() / idxsize), action->numIndices);
uint32_t idxclamp = 0;
if(action->baseVertex < 0)
idxclamp = uint32_t(-action->baseVertex);
// grab all unique vertex indices referenced
for(uint32_t i = 0; i < numIndices; i++)
{
uint32_t i32 = 0;
if(idx32)
i32 = idx32[i];
else if(idx16)
i32 = uint32_t(idx16[i]);
else if(idx8)
i32 = uint32_t(idx8[i]);
// apply baseVertex but clamp to 0 (don't allow index to become negative)
if(i32 < idxclamp)
i32 = 0;
else if(action->baseVertex < 0)
i32 -= idxclamp;
else if(action->baseVertex > 0)
i32 += action->baseVertex;
// we clamp to maxIdx here, to avoid any invalid indices like 0xffffffff
// from filtering through. Worst case we index to the end of the vertex
// buffers which is generally much more reasonable
i32 = RDCMIN(maxIdx, i32);
// ignore primitive restart indices
if(restart && i32 == (0xffffffff >> ((4 - idxsize) * 8)))
continue;
auto it = std::lower_bound(indices.begin(), indices.end(), i32);
if(it != indices.end() && *it == i32)
continue;
indices.insert(it - indices.begin(), i32);
}
// if we read out of bounds, we'll also have a 0 index being referenced
// (as 0 is read). Don't insert 0 if we already have 0 though
if(numIndices < action->numIndices && (indices.empty() || indices[0] != 0))
indices.insert(0, 0);
maxIndex = indices.back();
// set numVerts
numVerts = (uint32_t)indices.size();
// An index buffer could be something like: 500, 501, 502, 501, 503, 502
// in which case we can't use the existing index buffer without filling 499 slots of vertex
// data with padding. Instead we rebase the indices based on the smallest vertex so it becomes
// 0, 1, 2, 1, 3, 2 and then that matches our stream-out'd buffer.
//
// Note that there could also be gaps, like: 500, 501, 502, 510, 511, 512
// which would become 0, 1, 2, 3, 4, 5 and so the old index buffer would no longer be valid.
// We just stream-out a tightly packed list of unique indices, and then remap the index buffer
// so that what did point to 500 points to 0 (accounting for rebasing), and what did point
// to 510 now points to 3 (accounting for the unique sort).
// we use a map here since the indices may be sparse. Especially considering if an index
// is 'invalid' like 0xcccccccc then we don't want an array of 3.4 billion entries.
std::map<uint32_t, size_t> indexRemap;
for(size_t i = 0; i < indices.size(); i++)
{
// by definition, this index will only appear once in indices[]
indexRemap[indices[i]] = i;
}
// create buffer with unique 0-based indices
VkBufferCreateInfo bufInfo = {
VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
NULL,
0,
indices.size() * sizeof(uint32_t),
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
};
// the flag is the same for KHR and EXT
if(storageMode != Binding)
bufInfo.usage |= VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT;
vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &uniqIdxBuf);
CheckVkResult(vkr);
uniqIdxBufDescriptor.buffer = uniqIdxBuf;
uniqIdxBufDescriptor.offset = 0;
uniqIdxBufDescriptor.range = VK_WHOLE_SIZE;
VkMemoryRequirements mrq = {0};
m_pDriver->vkGetBufferMemoryRequirements(dev, uniqIdxBuf, &mrq);
VkMemoryAllocateInfo allocInfo = {
VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
NULL,
mrq.size,
m_pDriver->GetUploadMemoryIndex(mrq.memoryTypeBits),
};
if(storageMode == KHR_bda)
allocInfo.pNext = &memFlags;
vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &uniqIdxBufMem);
if(vkr == VK_ERROR_OUT_OF_DEVICE_MEMORY || vkr == VK_ERROR_OUT_OF_HOST_MEMORY)
{
ret.vsout.status = StringFormat::Fmt("Failed to allocate %llu bytes", mrq.size);
RDCERR("%s", ret.vsout.status.c_str());
return;
}
CheckVkResult(vkr);
vkr = m_pDriver->vkBindBufferMemory(dev, uniqIdxBuf, uniqIdxBufMem, 0);
CheckVkResult(vkr);
byte *idxData = NULL;
vkr = m_pDriver->vkMapMemory(m_Device, uniqIdxBufMem, 0, VK_WHOLE_SIZE, 0, (void **)&idxData);
CheckVkResult(vkr);
if(vkr != VK_SUCCESS || !idxData)
{
if(!idxData)
{
RDCERR("Manually reporting failed memory map");
CheckVkResult(VK_ERROR_MEMORY_MAP_FAILED);
}
ret.vsout.status = "Couldn't read back vertex output data from GPU";
return;
}
memcpy(idxData, &indices[0], indices.size() * sizeof(uint32_t));
VkMappedMemoryRange range = {
VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, NULL, uniqIdxBufMem, 0, VK_WHOLE_SIZE,
};
vkr = m_pDriver->vkFlushMappedMemoryRanges(m_Device, 1, &range);
CheckVkResult(vkr);
m_pDriver->vkUnmapMemory(m_Device, uniqIdxBufMem);
// rebase existing index buffer to point to the right elements in our stream-out'd
// vertex buffer
for(uint32_t i = 0; i < numIndices; i++)
{
uint32_t i32 = 0;
if(idx32)
i32 = idx32[i];
else if(idx16)
i32 = uint32_t(idx16[i]);
else if(idx8)
i32 = uint32_t(idx8[i]);
// preserve primitive restart indices
if(restart && i32 == (0xffffffff >> ((4 - idxsize) * 8)))
continue;
// apply baseVertex but clamp to 0 (don't allow index to become negative)
if(i32 < idxclamp)
i32 = 0;
else if(action->baseVertex < 0)
i32 -= idxclamp;
else if(action->baseVertex > 0)
i32 += action->baseVertex;
if(idx32)
idx32[i] = uint32_t(indexRemap[i32]);
else if(idx16)
idx16[i] = uint16_t(indexRemap[i32]);
else if(idx8)
idx8[i] = uint8_t(indexRemap[i32]);
}
bufInfo.size = RDCMAX((VkDeviceSize)64, (VkDeviceSize)idxdata.size());
bufInfo.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &rebasedIdxBuf);
CheckVkResult(vkr);
m_pDriver->vkGetBufferMemoryRequirements(dev, rebasedIdxBuf, &mrq);
allocInfo.allocationSize = mrq.size;
allocInfo.memoryTypeIndex = m_pDriver->GetUploadMemoryIndex(mrq.memoryTypeBits);
vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &rebasedIdxBufMem);
if(vkr == VK_ERROR_OUT_OF_DEVICE_MEMORY || vkr == VK_ERROR_OUT_OF_HOST_MEMORY)
{
RDCWARN("Failed to allocate %llu bytes for rebased index buffer", mrq.size);
ret.vsout.status = StringFormat::Fmt("Failed to allocate %llu bytes", mrq.size);
return;
}
CheckVkResult(vkr);
vkr = m_pDriver->vkBindBufferMemory(dev, rebasedIdxBuf, rebasedIdxBufMem, 0);
CheckVkResult(vkr);
vkr = m_pDriver->vkMapMemory(m_Device, rebasedIdxBufMem, 0, VK_WHOLE_SIZE, 0, (void **)&idxData);
CheckVkResult(vkr);
if(vkr != VK_SUCCESS || !idxData)
{
if(!idxData)
{
RDCERR("Manually reporting failed memory map");
CheckVkResult(VK_ERROR_MEMORY_MAP_FAILED);
}
ret.vsout.status = "Couldn't read back vertex output data from GPU";
return;
}
memcpy(idxData, idxdata.data(), idxdata.size());
VkMappedMemoryRange rebasedRange = {
VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, NULL, rebasedIdxBufMem, 0, VK_WHOLE_SIZE,
};
vkr = m_pDriver->vkFlushMappedMemoryRanges(m_Device, 1, &rebasedRange);
CheckVkResult(vkr);
m_pDriver->vkUnmapMemory(m_Device, rebasedIdxBufMem);
}
uint32_t baseSpecConstant = 0;
bytebuf specData;
rdcarray<VkSpecializationMapEntry> specEntries;
VkGraphicsPipelineCreateInfo pipeCreateInfo;
// get pipeline create info
m_pDriver->GetShaderCache()->MakeGraphicsPipelineInfo(pipeCreateInfo, state.graphics.pipeline);
// copy over specialization info
for(uint32_t s = 0; s < pipeCreateInfo.stageCount; s++)
{
if(pipeCreateInfo.pStages[s].stage == VK_SHADER_STAGE_VERTEX_BIT)
{
if(pipeCreateInfo.pStages[s].pSpecializationInfo)
{
specData.assign((const byte *)pipeCreateInfo.pStages[s].pSpecializationInfo->pData,
pipeCreateInfo.pStages[s].pSpecializationInfo->dataSize);
specEntries.assign(pipeCreateInfo.pStages[s].pSpecializationInfo->pMapEntries,
pipeCreateInfo.pStages[s].pSpecializationInfo->mapEntryCount);
}
break;
}
}
// don't overlap with existing pipeline constants
for(const VkSpecializationMapEntry &specConst : specEntries)
baseSpecConstant = RDCMAX(baseSpecConstant, specConst.constantID + 1);
uint32_t bufStride = 0;
rdcarray<uint32_t> modSpirv = moduleInfo.spirv.GetSPIRV();
// Staging resources holding the compacted/expanded copy of a single vertex input attribute. The
// array of these is indexed by attribute location and sized to the full binding array, so slots
// for locations that the draw does not use are never written to.
//
// Every member is value-initialised so that an untouched slot is guaranteed to hold a null handle
// (value-initialising a Vulkan handle yields the same value as VK_NULL_HANDLE). The
// buffer-device-address path walks every slot and skips those whose buffer is null, so this must
// not depend on how the owning container happens to construct its elements.
struct CompactedAttrBuffer
{
  // device memory backing buf, allocated and bound right after the buffer is created
  VkDeviceMemory mem = {};
  // storage buffer that the expanded attribute data is uploaded into
  VkBuffer buf = {};
  // whole-buffer descriptor referring to buf, used for the storage buffer descriptor write when
  // patched descriptor sets are in use
  VkDescriptorBufferInfo descriptor = {};
};
rdcarray<uint32_t> attrInstDivisor;
rdcarray<CompactedAttrBuffer> vbuffers;
vbuffers.resize(MeshOutputBufferArraySize);
{
rdcarray<VkWriteDescriptorSet> descWrites;
descWrites.resize(MeshOutputBufferArraySize);
uint32_t numWrites = 0;
RDCASSERT(state.vertexAttributes.size() <= MeshOutputBufferArraySize);
// we fetch the vertex buffer data up front here since there's a very high chance of either
// overlap due to interleaved attributes, or no overlap and no wastage due to separate compact
// attributes.
rdcarray<bytebuf> origVBs;
origVBs.reserve(16);
for(uint32_t vb = 0; vb < state.vertexBindings.size(); vb++)
{
uint32_t binding = state.vertexBindings[vb].binding;
if(binding >= state.vbuffers.size())
{
origVBs.push_back(bytebuf());
continue;
}
VkDeviceSize offs = state.vbuffers[binding].offs;
VkDeviceSize stride = state.vbuffers[binding].stride;
uint64_t len = 0;
if(state.vertexBindings[vb].inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
{
len = (uint64_t(maxInstance) + 1) * stride;
offs += action->instanceOffset * stride;
}
else
{
len = (uint64_t(maxIndex) + 1) * stride;
offs += action->vertexOffset * stride;
}
len = RDCMIN(len, state.vbuffers[binding].size);
origVBs.push_back(bytebuf());
if(state.vbuffers[binding].buf != ResourceId())
GetBufferData(state.vbuffers[binding].buf, offs, len, origVBs.back());
}
for(uint32_t i = 0; i < state.vertexAttributes.size(); i++)
{
const VkVertexInputAttributeDescription2EXT &attrDesc = state.vertexAttributes[i];
uint32_t attr = attrDesc.location;
RDCASSERT(attr < 64);
if(attr >= vbuffers.size())
{
RDCERR("Attribute index too high! Resize array.");
continue;
}
uint32_t instDivisor = ~0U;
size_t stride = 1;
const byte *origVBBegin = NULL;
const byte *origVBEnd = NULL;
for(uint32_t vb = 0; vb < state.vertexBindings.size(); vb++)
{
const VkVertexInputBindingDescription2EXT &vbDesc = state.vertexBindings[vb];
if(vbDesc.binding == attrDesc.binding)
{
origVBBegin = origVBs[vb].data() + attrDesc.offset;
origVBEnd = origVBs[vb].data() + origVBs[vb].size();
if(origVBs[vb].empty())
origVBBegin = origVBEnd = NULL;
stride = vbDesc.stride;
if(vbDesc.inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
instDivisor = vbDesc.divisor;
else
instDivisor = ~0U;
break;
}
}
if(attrDesc.binding < state.vbuffers.size())
stride = (size_t)state.vbuffers[attrDesc.binding].stride;
// in some limited cases, provided we added the UNIFORM_TEXEL_BUFFER usage bit, we could use
// the original buffers here as-is and read out of them. However it is likely that the offset
// is not a multiple of the minimum texel buffer offset for at least some of the buffers if
// not all of them, so we simplify the code here by *always* reading back the vertex buffer
// data and uploading a compacted version.
// we also need to handle the case where the format is not natively supported as a texel
// buffer.
// we used to use expanded texel buffers (i.e. expand to uint4, float4, int4 etc from any
// smaller format) but since we want to support buffer_device_address to avoid descriptor
// patching entirely it's easier to have an SSBO-based path. For that reason we only upload
// this data as 16-byte strided data and read it out of a uint4[] then bitcast to int4 or
// float4. That way the uint4[] SSBO can be easily substituted for a buffer device address
VkFormat origFormat = attrDesc.format;
VkFormat expandedFormat = VK_FORMAT_R32G32B32A32_SFLOAT;
if(Is64BitFormat(origFormat))
expandedFormat = VK_FORMAT_R32G32B32A32_UINT;
else if(IsUIntFormat(origFormat))
expandedFormat = VK_FORMAT_R32G32B32A32_UINT;
else if(IsSIntFormat(origFormat))
expandedFormat = VK_FORMAT_R32G32B32A32_SINT;
uint32_t origElemSize = GetByteSize(1, 1, 1, origFormat, 0);
uint32_t elemSize = GetByteSize(1, 1, 1, expandedFormat, 0);
// 64-bit values are packed as uvec2
if(Is64BitFormat(origFormat))
elemSize *= 2;
// used for interpreting the original data, if we're upcasting
ResourceFormat fmt = MakeResourceFormat(origFormat);
{
VkBufferCreateInfo bufInfo = {
VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
NULL,
0,
elemSize * (maxIndex + 1),
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
};
if(instDivisor != ~0U)
bufInfo.size = elemSize * (maxInstance + 1);
// the flag is the same for KHR and EXT
if(storageMode != Binding)
bufInfo.usage |= VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT;
vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &vbuffers[attr].buf);
CheckVkResult(vkr);
VkMemoryRequirements mrq = {0};
m_pDriver->vkGetBufferMemoryRequirements(dev, vbuffers[attr].buf, &mrq);
VkMemoryAllocateInfo allocInfo = {
VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
NULL,
mrq.size,
m_pDriver->GetUploadMemoryIndex(mrq.memoryTypeBits),
};
if(storageMode == KHR_bda)
allocInfo.pNext = &memFlags;
vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &vbuffers[attr].mem);
if(vkr == VK_ERROR_OUT_OF_DEVICE_MEMORY || vkr == VK_ERROR_OUT_OF_HOST_MEMORY)
{
RDCWARN("Failed to allocate %llu bytes for patched vertex buffer", mrq.size);
ret.vsout.status = StringFormat::Fmt("Failed to allocate %llu bytes", mrq.size);
return;
}
CheckVkResult(vkr);
vkr = m_pDriver->vkBindBufferMemory(dev, vbuffers[attr].buf, vbuffers[attr].mem, 0);
CheckVkResult(vkr);
byte *dst = NULL;
vkr =
m_pDriver->vkMapMemory(m_Device, vbuffers[attr].mem, 0, VK_WHOLE_SIZE, 0, (void **)&dst);
CheckVkResult(vkr);
if(vkr != VK_SUCCESS || !dst)
{
if(!dst)
{
RDCERR("Manually reporting failed memory map");
CheckVkResult(VK_ERROR_MEMORY_MAP_FAILED);
}
ret.vsout.status = "Couldn't read back vertex output data from GPU";
return;
}
const byte *dstBase = dst;
(void)dstBase;
const byte *dstEnd = dst + bufInfo.size;
if(dst)
{
FloatVector defaultValue(0.0f, 0.0f, 0.0f, 1.0f);
if(fmt.compType == CompType::UInt || fmt.compType == CompType::SInt || fmt.compCount == 4)
defaultValue.w = 0.0f;
const byte *src = origVBBegin;
// fast memcpy compaction case for regular 32-bit types. Any type like R32G32B32 or so on
// can be memcpy'd into place and read, since we discard any unused components and there's
// no re-interpretation needed.
if(fmt.type == ResourceFormatType::Regular && fmt.compByteWidth == 4)
{
size_t expandedComponentBytes = sizeof(FloatVector) - origElemSize;
while(src < origVBEnd && dst < dstEnd)
{
if(expandedComponentBytes > 0)
memcpy(dst + origElemSize, ((byte *)&defaultValue) + origElemSize,
expandedComponentBytes);
memcpy(dst, src, origElemSize);
// advance by the *destination* element size of 16 bytes
dst += elemSize;
src += stride;
}
// fill the rest with default values
while(dst < dstEnd)
{
memcpy(dst, &defaultValue, sizeof(FloatVector));
dst += elemSize;
}
}
else
{
uint32_t zero = 0;
// upcasting path
if(Is64BitFormat(origFormat))
{
while(src < origVBEnd && dst < dstEnd)
{
// the 64-bit value (especially for doubles) is already in "packed uvec2" order,
// with least significant 32-bits first, so we can copy directly
memcpy(dst, src, sizeof(uint64_t) * fmt.compCount);
dst += sizeof(uint64_t) * fmt.compCount;
// fill up to *8* zeros not 4, since we're filling two for every component
for(uint8_t c = fmt.compCount * 2; c < 8; c++)
{
memcpy(dst, &zero, sizeof(uint32_t));
dst += sizeof(uint32_t);
}
src += stride;
}
}
else if(IsUIntFormat(expandedFormat))
{
while(src < origVBEnd && dst < dstEnd)
{
uint32_t val = 0;
const byte *s = src;
uint8_t c = 0;
for(; c < fmt.compCount; c++)
{
if(fmt.compByteWidth == 1)
val = *s;
else if(fmt.compByteWidth == 2)
val = *(uint16_t *)s;
else if(fmt.compByteWidth == 4)
val = *(uint32_t *)s;
memcpy(dst, &val, sizeof(uint32_t));
dst += sizeof(uint32_t);
s += fmt.compByteWidth;
}
for(; c < 4; c++)
{
memcpy(dst, &zero, sizeof(uint32_t));
dst += sizeof(uint32_t);
}
src += stride;
}
}
else if(IsSIntFormat(expandedFormat))
{
while(src < origVBEnd && dst < dstEnd)
{
int32_t val = 0;
const byte *s = src;
uint8_t c = 0;
for(; c < fmt.compCount; c++)
{
if(fmt.compByteWidth == 1)
val = *(int8_t *)s;
else if(fmt.compByteWidth == 2)
val = *(int16_t *)s;
else if(fmt.compByteWidth == 4)
val = *(int32_t *)s;
memcpy(dst, &val, sizeof(int32_t));
dst += sizeof(int32_t);
s += fmt.compByteWidth;
}
for(; c < 4; c++)
{
memcpy(dst, &zero, sizeof(uint32_t));
dst += sizeof(uint32_t);
}
src += stride;
}
}
else
{
while(src < origVBEnd && dst < dstEnd)
{
bool valid = false;
FloatVector vec = HighlightCache::InterpretVertex(src, 0, 0, fmt, origVBEnd, valid);
memcpy(dst, &vec, sizeof(FloatVector));
dst += sizeof(FloatVector);
src += stride;
}
// fill the rest with default values
while(dst < dstEnd)
{
memcpy(dst, &defaultValue, sizeof(FloatVector));
dst += elemSize;
}
}
}
}
VkMappedMemoryRange range = {
VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, NULL, vbuffers[attr].mem, 0, VK_WHOLE_SIZE,
};
vkr = m_pDriver->vkFlushMappedMemoryRanges(m_Device, 1, &range);
CheckVkResult(vkr);
m_pDriver->vkUnmapMemory(m_Device, vbuffers[attr].mem);
}
attrInstDivisor.resize(RDCMAX(attrInstDivisor.size(), size_t(attr + 1)));
attrInstDivisor[attr] = instDivisor;
vbuffers[attr].descriptor.buffer = vbuffers[attr].buf;
vbuffers[attr].descriptor.offset = 0;
vbuffers[attr].descriptor.range = VK_WHOLE_SIZE;
if(!descSets.empty())
{
descWrites[numWrites].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
descWrites[numWrites].dstSet = descSets[0];
descWrites[numWrites].dstBinding = 2;
descWrites[numWrites].dstArrayElement = attr;
descWrites[numWrites].descriptorCount = 1;
descWrites[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
descWrites[numWrites].pBufferInfo = &vbuffers[attr].descriptor;
numWrites++;
}
}
// add a write of the index buffer
if(uniqIdxBuf != VK_NULL_HANDLE && !descSets.empty())
{
descWrites[numWrites].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
descWrites[numWrites].dstSet = descSets[0];
descWrites[numWrites].dstBinding = 1;
descWrites[numWrites].dstArrayElement = 0;
descWrites[numWrites].descriptorCount = 1;
descWrites[numWrites].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
descWrites[numWrites].pBufferInfo = &uniqIdxBufDescriptor;
numWrites++;
}
if(numWrites > 0)
m_pDriver->vkUpdateDescriptorSets(dev, numWrites, descWrites.data(), 0, NULL);
}
if(!Vulkan_Debug_PostVSDumpDirPath().empty())
FileIO::WriteAll(Vulkan_Debug_PostVSDumpDirPath() + "/debug_postvs_vert.spv", modSpirv);
ConvertToMeshOutputCompute(*refl, *pipeInfo.shaders[0].patchData, pipeInfo.shaders[0].entryPoint,
storageMode, attrInstDivisor, action, numVerts, numViews,
baseSpecConstant, modSpirv, bufStride);
if(!Vulkan_Debug_PostVSDumpDirPath().empty())
FileIO::WriteAll(Vulkan_Debug_PostVSDumpDirPath() + "/debug_postvs_comp.spv", modSpirv);
{
// now that we know the stride, create buffer of sufficient size
// this can't just be bufStride * num unique indices per instance, as we don't
// have a compact 0-based index to index into the buffer. We must use
// index-minIndex which is 0-based but potentially sparse, so this buffer may
// be more or less wasteful
VkBufferCreateInfo bufInfo = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
// set bufSize
bufSize = bufInfo.size = uint64_t(numVerts) * uint64_t(action->numInstances) *
uint64_t(bufStride) * uint64_t(numViews);
bufInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
bufInfo.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT;
bufInfo.usage |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
bufInfo.usage |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
// the flag is the same for KHR and EXT
if(storageMode != Binding)
bufInfo.usage |= VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT;
vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &meshBuffer);
CheckVkResult(vkr);
bufInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT;
vkr = m_pDriver->vkCreateBuffer(dev, &bufInfo, NULL, &readbackBuffer);
CheckVkResult(vkr);
VkMemoryRequirements mrq = {0};
m_pDriver->vkGetBufferMemoryRequirements(dev, meshBuffer, &mrq);
if(mrq.size > m_pDriver->GetMaxMemoryAllocationSize())
{
ret.vsout.status = StringFormat::Fmt("OOM %llu bytes Max %llu bytes", mrq.size,
m_pDriver->GetMaxMemoryAllocationSize());
return;
}
VkMemoryAllocateInfo allocInfo = {
VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
NULL,
mrq.size,
m_pDriver->GetGPULocalMemoryIndex(mrq.memoryTypeBits),
};
if(storageMode == KHR_bda)
allocInfo.pNext = &memFlags;
vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &meshMem);
if(vkr == VK_ERROR_OUT_OF_DEVICE_MEMORY || vkr == VK_ERROR_OUT_OF_HOST_MEMORY)
{
RDCWARN("Failed to allocate %llu bytes for output vertex SSBO", mrq.size);
ret.vsout.status = StringFormat::Fmt("Failed to allocate %llu bytes", mrq.size);
return;
}
CheckVkResult(vkr);
vkr = m_pDriver->vkBindBufferMemory(dev, meshBuffer, meshMem, 0);
CheckVkResult(vkr);
m_pDriver->vkGetBufferMemoryRequirements(dev, readbackBuffer, &mrq);
allocInfo.pNext = NULL;
allocInfo.memoryTypeIndex = m_pDriver->GetReadbackMemoryIndex(mrq.memoryTypeBits);
vkr = m_pDriver->vkAllocateMemory(dev, &allocInfo, NULL, &readbackMem);
if(vkr == VK_ERROR_OUT_OF_DEVICE_MEMORY || vkr == VK_ERROR_OUT_OF_HOST_MEMORY)
{
RDCWARN("Failed to allocate %llu bytes for readback memory", mrq.size);
ret.vsout.status = StringFormat::Fmt("Failed to allocate %llu bytes", mrq.size);
return;
}
CheckVkResult(vkr);
vkr = m_pDriver->vkBindBufferMemory(dev, readbackBuffer, readbackMem, 0);
CheckVkResult(vkr);
}
VkComputePipelineCreateInfo compPipeInfo = {VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO};
// repoint pipeline layout
compPipeInfo.layout = pipeLayout;
// create vertex shader with modified code
VkShaderModuleCreateInfo moduleCreateInfo = {
VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, NULL, 0,
modSpirv.size() * sizeof(uint32_t), &modSpirv[0],
};
VkShaderModule module;
vkr = m_pDriver->vkCreateShaderModule(dev, &moduleCreateInfo, NULL, &module);
CheckVkResult(vkr);
compPipeInfo.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
compPipeInfo.stage.module = module;
compPipeInfo.stage.pName = PatchedMeshOutputEntryPoint;
compPipeInfo.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
// append our own if we're using BDA
if(storageMode != Binding)
{
// ensure we're 64-bit aligned first
specData.resize(AlignUp(specData.size(), (size_t)8));
uint32_t baseOffset = (uint32_t)specData.size();
rdcarray<uint64_t> addresses;
addresses.resize(MeshOutputBufferArraySize + 2);
for(uint32_t i = 0; i <= MeshOutputBufferArraySize + 1; i++)
{
RDCCOMPILE_ASSERT(VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO ==
VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO_EXT,
"KHR and EXT buffer_device_address should be interchangeable here.");
VkBufferDeviceAddressInfo getAddressInfo = {VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO};
if(i < MeshOutputBufferArraySize)
getAddressInfo.buffer = vbuffers[i].buf;
else if(i == MeshOutputBufferArraySize)
getAddressInfo.buffer = uniqIdxBuf;
else if(i == MeshOutputBufferArraySize + 1)
getAddressInfo.buffer = meshBuffer;
// skip
if(getAddressInfo.buffer == VK_NULL_HANDLE)
continue;
if(storageMode == KHR_bda)
addresses[i] = m_pDriver->vkGetBufferDeviceAddress(dev, &getAddressInfo);
else
addresses[i] = m_pDriver->vkGetBufferDeviceAddressEXT(dev, &getAddressInfo);
VkSpecializationMapEntry entry;
entry.offset = baseOffset + i * sizeof(uint64_t);
entry.constantID = baseSpecConstant + i * 2 + 0;
// for EXT we have one 64-bit spec constant per address, for KHR we have a uvec2 - two
// constants
if(storageMode == EXT_bda)
{
entry.size = sizeof(uint64_t);
specEntries.push_back(entry);
}
else
{
entry.size = sizeof(uint32_t);
specEntries.push_back(entry);
entry.offset += sizeof(uint32_t);
entry.constantID++;
entry.size = sizeof(uint32_t);
specEntries.push_back(entry);
}
}
specData.append((const byte *)addresses.data(), addresses.byteSize());
}
VkSpecializationInfo specInfo = {};
specInfo.dataSize = specData.size();
specInfo.pData = specData.data();
specInfo.mapEntryCount = (uint32_t)specEntries.size();
specInfo.pMapEntries = specEntries.data();
compPipeInfo.stage.pSpecializationInfo = &specInfo;
// create new pipeline
VkPipeline pipe;
vkr = m_pDriver->vkCreateComputePipelines(m_Device, VK_NULL_HANDLE, 1, &compPipeInfo, NULL, &pipe);
if(vkr != VK_SUCCESS)
{
ret.vsout.status =
StringFormat::Fmt("Failed to create patched compute pipeline: %s", ToStr(vkr).c_str());
RDCERR("%s", ret.vsout.status.c_str());
return;
}
// make copy of state to draw from
VulkanRenderState modifiedstate = state;
// bind created pipeline to partial replay state
modifiedstate.compute.pipeline = GetResID(pipe);
// move graphics descriptor sets onto the compute pipe.
modifiedstate.compute.descSets = modifiedstate.graphics.descSets;
if(!descSets.empty())
{
// replace descriptor set IDs with our temporary sets. The offsets we keep the same. If the
// original draw had no sets, we ensure there's room (with no offsets needed)
if(modifiedstate.compute.descSets.empty())
modifiedstate.compute.descSets.resize(1);
for(size_t i = 0; i < descSets.size(); i++)
{
modifiedstate.compute.descSets[i].pipeLayout = GetResID(pipeLayout);
modifiedstate.compute.descSets[i].descSet = GetResID(descSets[i]);
}
}
else
{
for(size_t i = 0; i < modifiedstate.compute.descSets.size(); i++)
modifiedstate.compute.descSets[i].pipeLayout = GetResID(pipeLayout);
}
{
VkCommandBuffer cmd = m_pDriver->GetNextCmd();
if(cmd == VK_NULL_HANDLE)
return;
VkCommandBufferBeginInfo beginInfo = {VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, NULL,
VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT};
vkr = ObjDisp(dev)->BeginCommandBuffer(Unwrap(cmd), &beginInfo);
CheckVkResult(vkr);
// fill destination buffer with 0s to ensure unwritten vertices have sane data
ObjDisp(dev)->CmdFillBuffer(Unwrap(cmd), Unwrap(meshBuffer), 0, bufSize, 0);
VkBufferMemoryBarrier meshbufbarrier = {
VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
NULL,
VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT,
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
VK_QUEUE_FAMILY_IGNORED,
VK_QUEUE_FAMILY_IGNORED,
};
meshbufbarrier.size = VK_WHOLE_SIZE;
VkMemoryBarrier globalbarrier = {
VK_STRUCTURE_TYPE_MEMORY_BARRIER,
NULL,
VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT,
VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
};
// wait for uploads of index buffer (if used), compacted vertex buffers, and the above fill to
// finish.
DoPipelineBarrier(cmd, 1, &globalbarrier);
// update the descriptor set (if we have one) to point to the mesh output buffer
VkDescriptorBufferInfo fetchdesc = {0};
fetchdesc.buffer = meshBuffer;
fetchdesc.offset = 0;
fetchdesc.range = bufSize;
if(!descSets.empty())
{
VkWriteDescriptorSet write = {
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, NULL, descSets[0], 0, 0, 1,
VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, NULL, &fetchdesc, NULL};
m_pDriver->vkUpdateDescriptorSets(dev, 1, &write, 0, NULL);
}
// do single dispatch
modifiedstate.BindPipeline(m_pDriver, cmd, VulkanRenderState::BindCompute, true);
uint64_t totalVerts = numVerts * uint64_t(action->numInstances) * uint64_t(numViews);
// the validation layers will probably complain about this dispatch saying some arrays aren't
// fully updated. That's because they don't statically analyse that only fixed indices are
// referred to. It's safe to leave unused array indices as invalid descriptors.
ObjDisp(cmd)->CmdDispatch(Unwrap(cmd), uint32_t(totalVerts / MeshOutputDispatchWidth) + 1, 1, 1);
// wait for mesh output writing to finish
meshbufbarrier.buffer = Unwrap(meshBuffer);
meshbufbarrier.size = bufSize;
meshbufbarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
meshbufbarrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
DoPipelineBarrier(cmd, 1, &meshbufbarrier);
VkBufferCopy bufcopy = {
0,
0,
bufSize,
};
// copy to readback buffer
ObjDisp(dev)->CmdCopyBuffer(Unwrap(cmd), Unwrap(meshBuffer), Unwrap(readbackBuffer), 1, &bufcopy);
meshbufbarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
meshbufbarrier.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
meshbufbarrier.buffer = Unwrap(readbackBuffer);
// wait for copy to finish
DoPipelineBarrier(cmd, 1, &meshbufbarrier);
vkr = ObjDisp(dev)->EndCommandBuffer(Unwrap(cmd));
CheckVkResult(vkr);
// submit & flush so that we don't have to keep pipeline around for a while
m_pDriver->SubmitCmds();
m_pDriver->FlushQ();
}
for(CompactedAttrBuffer attrBuf : vbuffers)
{
m_pDriver->vkDestroyBuffer(dev, attrBuf.buf, NULL);
m_pDriver->vkFreeMemory(dev, attrBuf.mem, NULL);
}
// readback mesh data
byte *byteData = NULL;
vkr = m_pDriver->vkMapMemory(m_Device, readbackMem, 0, VK_WHOLE_SIZE, 0, (void **)&byteData);
CheckVkResult(vkr);
if(vkr != VK_SUCCESS || !byteData)
{
if(!byteData)
{
RDCERR("Manually reporting failed memory map");
CheckVkResult(VK_ERROR_MEMORY_MAP_FAILED);
}
ret.vsout.status = "Couldn't read back vertex output data from GPU";
return;
}
VkMappedMemoryRange range = {
VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE, NULL, readbackMem, 0, VK_WHOLE_SIZE,
};
vkr = m_pDriver->vkInvalidateMappedMemoryRanges(m_Device, 1, &range);
CheckVkResult(vkr);
// do near/far calculations
float nearp = 0.1f;
float farp = 100.0f;
Vec4f *pos0 = (Vec4f *)byteData;
bool found = false;
// expect position at the start of the buffer, as system values are sorted first
// and position is the first value
for(uint32_t i = 1;
refl->outputSignature[0].systemValue == ShaderBuiltin::Position && !found && i < numVerts; i++)
{
Vec4f *pos = (Vec4f *)(byteData + i * bufStride);
DeriveNearFar(*pos, *pos0, nearp, farp, found);
if(found)
break;
}
// if we didn't find anything, all z's and w's were identical.
// If the z is positive and w greater for the first element then
// we detect this projection as reversed z with infinite far plane
if(!found && pos0->z > 0.0f && pos0->w > pos0->z)
{
nearp = pos0->z;
farp = FLT_MAX;
}
m_pDriver->vkUnmapMemory(m_Device, readbackMem);
// clean up temporary memories
m_pDriver->vkDestroyBuffer(m_Device, readbackBuffer, NULL);
m_pDriver->vkFreeMemory(m_Device, readbackMem, NULL);
if(uniqIdxBuf != VK_NULL_HANDLE)
{
m_pDriver->vkDestroyBuffer(m_Device, uniqIdxBuf, NULL);
m_pDriver->vkFreeMemory(m_Device, uniqIdxBufMem, NULL);
}
// fill out m_PostVS.Data
ret.vsout.topo = MakePrimitiveTopology(state.primitiveTopology, state.patchControlPoints);
ret.vsout.buf = meshBuffer;
ret.vsout.bufmem = meshMem;
ret.vsout.baseVertex = 0;
ret.vsout.numViews = numViews;
ret.vsout.vertStride = bufStride;
ret.vsout.nearPlane = nearp;
ret.vsout.farPlane = farp;
ret.vsout.useIndices = bool(action->flags & ActionFlags::Indexed);
ret.vsout.numVerts = action->numIndices;
ret.vsout.instStride = 0;
if(action->flags & ActionFlags::Instanced)
ret.vsout.instStride = uint32_t(bufSize / (action->numInstances * numViews));
ret.vsout.idxbuf = VK_NULL_HANDLE;
if(ret.vsout.useIndices && state.ibuffer.buf != ResourceId())
{
VkIndexType type = VK_INDEX_TYPE_UINT16;
if(idxsize == 4)
type = VK_INDEX_TYPE_UINT32;
else if(idxsize == 1)
type = VK_INDEX_TYPE_UINT8_KHR;
ret.vsout.idxbuf = rebasedIdxBuf;
ret.vsout.idxbufmem = rebasedIdxBufMem;
ret.vsout.idxFmt = type;
}
ret.vsout.hasPosOut = refl->outputSignature[0].systemValue == ShaderBuiltin::Position;
ret.vsout.flipY = state.views.empty() ? false : state.views[0].height < 0.0f;
if(descpool != VK_NULL_HANDLE)
{
// delete descriptors. Technically we don't have to free the descriptor sets, but our tracking
// on replay doesn't handle destroying children of pooled objects so we do it explicitly anyway.
m_pDriver->vkFreeDescriptorSets(dev, descpool, (uint32_t)descSets.size(), descSets.data());
m_pDriver->vkDestroyDescriptorPool(dev, descpool, NULL);
for(VkDescriptorSetLayout layout : setLayouts)
m_pDriver->vkDestroyDescriptorSetLayout(dev, layout, NULL);
}
// delete pipeline layout
m_pDriver->vkDestroyPipelineLayout(dev, pipeLayout, NULL);
// delete pipeline
m_pDriver->vkDestroyPipeline(dev, pipe, NULL);
// delete shader/shader module
m_pDriver->vkDestroyShaderModule(dev, module, NULL);
}