in renderdoc/driver/d3d12/d3d12_postvs.cpp [2810:3993]
void D3D12Replay::InitPostVSBuffers(uint32_t eventId)
{
// go through any aliasing
if(m_PostVSAlias.find(eventId) != m_PostVSAlias.end())
eventId = m_PostVSAlias[eventId];
if(m_PostVSData.find(eventId) != m_PostVSData.end())
return;
D3D12PostVSData &ret = m_PostVSData[eventId];
// we handle out-of-memory errors while processing postvs, don't treat it as a fatal error
ScopedOOMHandle12 oom(m_pDevice);
D3D12MarkerRegion postvs(m_pDevice->GetQueue(), StringFormat::Fmt("PostVS for %u", eventId));
D3D12CommandData *cmd = m_pDevice->GetQueue()->GetCommandData();
const D3D12RenderState &rs = cmd->m_RenderState;
if(rs.pipe == ResourceId())
{
ret.gsout.status = ret.vsout.status = "No pipeline bound";
return;
}
WrappedID3D12PipelineState *origPSO =
m_pDevice->GetResourceManager()->GetCurrentAs<WrappedID3D12PipelineState>(rs.pipe);
if(!origPSO || !origPSO->IsGraphics())
{
ret.gsout.status = ret.vsout.status = "No graphics pipeline bound";
return;
}
D3D12_EXPANDED_PIPELINE_STATE_STREAM_DESC psoDesc;
origPSO->Fill(psoDesc);
if(psoDesc.MS.BytecodeLength > 0)
{
InitPostMSBuffers(eventId);
return;
}
if(psoDesc.VS.BytecodeLength == 0)
{
ret.gsout.status = ret.vsout.status = "No vertex shader in pipeline";
return;
}
WrappedID3D12Shader *vs = origPSO->VS();
D3D_PRIMITIVE_TOPOLOGY topo = rs.topo;
ret.vsout.topo = MakePrimitiveTopology(topo);
const ActionDescription *action = m_pDevice->GetAction(eventId);
if(action->numIndices == 0)
{
ret.gsout.status = ret.vsout.status = "Empty drawcall (0 indices/vertices)";
return;
}
if(action->numInstances == 0)
{
ret.gsout.status = ret.vsout.status = "Empty drawcall (0 instances)";
return;
}
DXBC::DXBCContainer *dxbcVS = vs->GetDXBC();
RDCASSERT(dxbcVS);
DXBC::DXBCContainer *dxbcGS = NULL;
WrappedID3D12Shader *gs = origPSO->GS();
if(gs)
{
dxbcGS = gs->GetDXBC();
RDCASSERT(dxbcGS);
}
DXBC::DXBCContainer *dxbcDS = NULL;
WrappedID3D12Shader *ds = origPSO->DS();
if(ds)
{
dxbcDS = ds->GetDXBC();
RDCASSERT(dxbcDS);
}
DXBC::DXBCContainer *lastShader = dxbcDS;
if(dxbcGS)
lastShader = dxbcGS;
if(lastShader)
{
// put a general error in here in case anything goes wrong fetching VS outputs
ret.gsout.status =
"No geometry/tessellation output fetched due to error processing vertex stage.";
}
else
{
ret.gsout.status = "No geometry and no tessellation shader bound.";
}
ID3D12RootSignature *soSig = NULL;
HRESULT hr = S_OK;
{
WrappedID3D12RootSignature *sig =
m_pDevice->GetResourceManager()->GetCurrentAs<WrappedID3D12RootSignature>(rs.graphics.rootsig);
D3D12RootSignature rootsig = sig->sig;
// create a root signature that allows stream out, if necessary
if((rootsig.Flags & D3D12_ROOT_SIGNATURE_FLAG_ALLOW_STREAM_OUTPUT) == 0)
{
rootsig.Flags |= D3D12_ROOT_SIGNATURE_FLAG_ALLOW_STREAM_OUTPUT;
ID3DBlob *blob = m_pDevice->GetShaderCache()->MakeRootSig(rootsig);
hr = m_pDevice->CreateRootSignature(0, blob->GetBufferPointer(), blob->GetBufferSize(),
__uuidof(ID3D12RootSignature), (void **)&soSig);
if(FAILED(hr))
{
ret.vsout.status = StringFormat::Fmt(
"Couldn't enable stream-out in root signature: HRESULT: %s", ToStr(hr).c_str());
RDCERR("%s", ret.vsout.status.c_str());
return;
}
SAFE_RELEASE(blob);
}
}
rdcarray<D3D12_SO_DECLARATION_ENTRY> sodecls;
UINT stride = 0;
int posidx = -1;
int numPosComponents = 0;
if(!dxbcVS->GetReflection()->OutputSig.empty())
{
for(const SigParameter &sign : dxbcVS->GetReflection()->OutputSig)
{
D3D12_SO_DECLARATION_ENTRY decl;
decl.Stream = 0;
decl.OutputSlot = 0;
decl.SemanticName = sign.semanticName.c_str();
decl.SemanticIndex = sign.semanticIndex;
decl.StartComponent = 0;
decl.ComponentCount = sign.compCount & 0xff;
if(sign.systemValue == ShaderBuiltin::Position)
{
posidx = (int)sodecls.size();
numPosComponents = decl.ComponentCount = 4;
}
stride += decl.ComponentCount * sizeof(float);
sodecls.push_back(decl);
}
if(stride == 0)
{
RDCERR("Didn't get valid stride! Setting to 4 bytes");
stride = 4;
}
// shift position attribute up to first, keeping order otherwise
// the same
if(posidx > 0)
{
D3D12_SO_DECLARATION_ENTRY pos = sodecls[posidx];
sodecls.erase(posidx);
sodecls.insert(0, pos);
}
// set up stream output entries and buffers
psoDesc.StreamOutput.NumEntries = (UINT)sodecls.size();
psoDesc.StreamOutput.pSODeclaration = &sodecls[0];
psoDesc.StreamOutput.NumStrides = 1;
psoDesc.StreamOutput.pBufferStrides = &stride;
psoDesc.StreamOutput.RasterizedStream = D3D12_SO_NO_RASTERIZED_STREAM;
// disable all other shader stages
psoDesc.HS.BytecodeLength = 0;
psoDesc.HS.pShaderBytecode = NULL;
psoDesc.DS.BytecodeLength = 0;
psoDesc.DS.pShaderBytecode = NULL;
psoDesc.GS.BytecodeLength = 0;
psoDesc.GS.pShaderBytecode = NULL;
psoDesc.PS.BytecodeLength = 0;
psoDesc.PS.pShaderBytecode = NULL;
// disable any rasterization/use of output targets
psoDesc.DepthStencilState.DepthEnable = FALSE;
psoDesc.DepthStencilState.DepthWriteMask = D3D12_DEPTH_WRITE_MASK_ZERO;
psoDesc.DepthStencilState.StencilEnable = FALSE;
if(soSig)
psoDesc.pRootSignature = soSig;
// render as points
psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_POINT;
// disable MSAA
psoDesc.SampleDesc.Count = 1;
psoDesc.SampleDesc.Quality = 0;
// disable outputs
RDCEraseEl(psoDesc.RTVFormats);
psoDesc.DSVFormat = DXGI_FORMAT_UNKNOWN;
// for now disable view instancing, unclear if this is legal but it
psoDesc.ViewInstancing.Flags = D3D12_VIEW_INSTANCING_FLAG_NONE;
psoDesc.ViewInstancing.ViewInstanceCount = 0;
ID3D12PipelineState *pipe = NULL;
hr = m_pDevice->CreatePipeState(psoDesc, &pipe);
if(FAILED(hr))
{
SAFE_RELEASE(soSig);
ret.vsout.status = StringFormat::Fmt("Couldn't create patched graphics pipeline: HRESULT: %s",
ToStr(hr).c_str());
RDCERR("%s", ret.vsout.status.c_str());
return;
}
ID3D12Resource *idxBuf = NULL;
uint64_t idxBufSize = ~0ULL;
bool recreate = false;
// we add 64 to account for the stream-out data counter
uint64_t outputSize = uint64_t(action->numIndices) * action->numInstances * stride + 64;
if(m_SOBufferSize < outputSize)
{
uint64_t oldSize = m_SOBufferSize;
m_SOBufferSize = CalcMeshOutputSize(m_SOBufferSize, outputSize);
RDCWARN("Resizing stream-out buffer from %llu to %llu for output data", oldSize,
m_SOBufferSize);
recreate = true;
}
ID3D12GraphicsCommandListX *list = NULL;
if(!(action->flags & ActionFlags::Indexed))
{
if(recreate)
{
m_pDevice->GPUSync();
uint64_t newSize = m_SOBufferSize;
if(!CreateSOBuffers())
{
ret.vsout.status = StringFormat::Fmt(
"Vertex output generated %llu bytes of data which ran out of memory", newSize);
return;
}
}
list = GetDebugManager()->ResetDebugList();
rs.ApplyState(m_pDevice, list);
list->SetPipelineState(pipe);
if(soSig)
{
list->SetGraphicsRootSignature(soSig);
rs.ApplyGraphicsRootElements(list);
}
D3D12_STREAM_OUTPUT_BUFFER_VIEW view;
view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress();
view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64;
view.SizeInBytes = m_SOBufferSize - 64;
list->SOSetTargets(0, 1, &view);
list->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_POINTLIST);
list->DrawInstanced(action->numIndices, action->numInstances, action->vertexOffset,
action->instanceOffset);
}
else // drawcall is indexed
{
bytebuf idxdata;
if(rs.ibuffer.buf != ResourceId() && rs.ibuffer.size > 0)
GetBufferData(rs.ibuffer.buf, rs.ibuffer.offs + action->indexOffset * rs.ibuffer.bytewidth,
RDCMIN(action->numIndices * rs.ibuffer.bytewidth, rs.ibuffer.size), idxdata);
rdcarray<uint32_t> indices;
uint16_t *idx16 = (uint16_t *)&idxdata[0];
uint32_t *idx32 = (uint32_t *)&idxdata[0];
// only read as many indices as were available in the buffer
uint32_t numIndices =
RDCMIN(uint32_t(idxdata.size() / RDCMAX(1, rs.ibuffer.bytewidth)), action->numIndices);
// grab all unique vertex indices referenced
for(uint32_t i = 0; i < numIndices; i++)
{
uint32_t i32 = rs.ibuffer.bytewidth == 2 ? uint32_t(idx16[i]) : idx32[i];
auto it = std::lower_bound(indices.begin(), indices.end(), i32);
if(it != indices.end() && *it == i32)
continue;
indices.insert(it - indices.begin(), i32);
}
// if we read out of bounds, we'll also have a 0 index being referenced
// (as 0 is read). Don't insert 0 if we already have 0 though
if(numIndices < action->numIndices && (indices.empty() || indices[0] != 0))
indices.insert(0, 0);
// An index buffer could be something like: 500, 501, 502, 501, 503, 502
// in which case we can't use the existing index buffer without filling 499 slots of vertex
// data with padding. Instead we rebase the indices based on the smallest vertex so it becomes
// 0, 1, 2, 1, 3, 2 and then that matches our stream-out'd buffer.
//
// Note that there could also be gaps, like: 500, 501, 502, 510, 511, 512
// which would become 0, 1, 2, 3, 4, 5 and so the old index buffer would no longer be valid.
// We just stream-out a tightly packed list of unique indices, and then remap the index buffer
// so that what did point to 500 points to 0 (accounting for rebasing), and what did point
// to 510 now points to 3 (accounting for the unique sort).
// we use a map here since the indices may be sparse. Especially considering if an index
// is 'invalid' like 0xcccccccc then we don't want an array of 3.4 billion entries.
std::map<uint32_t, size_t> indexRemap;
for(size_t i = 0; i < indices.size(); i++)
{
// by definition, this index will only appear once in indices[]
indexRemap[indices[i]] = i;
}
outputSize = uint64_t(indices.size() * sizeof(uint32_t) * sizeof(Vec4f));
if(m_SOBufferSize < outputSize)
{
uint64_t oldSize = m_SOBufferSize;
m_SOBufferSize = CalcMeshOutputSize(m_SOBufferSize, outputSize);
RDCWARN("Resizing stream-out buffer from %llu to %llu for indices", oldSize, m_SOBufferSize);
recreate = true;
}
if(recreate)
{
m_pDevice->GPUSync();
uint64_t newSize = m_SOBufferSize;
if(!CreateSOBuffers())
{
ret.vsout.status = StringFormat::Fmt(
"Vertex output generated %llu bytes of data which ran out of memory", newSize);
return;
}
}
GetDebugManager()->FillBuffer(m_SOPatchedIndexBuffer, 0, &indices[0],
indices.size() * sizeof(uint32_t));
D3D12_INDEX_BUFFER_VIEW patchedIB;
patchedIB.BufferLocation = m_SOPatchedIndexBuffer->GetGPUVirtualAddress();
patchedIB.Format = DXGI_FORMAT_R32_UINT;
patchedIB.SizeInBytes = UINT(indices.size() * sizeof(uint32_t));
list = GetDebugManager()->ResetDebugList();
rs.ApplyState(m_pDevice, list);
list->SetPipelineState(pipe);
list->IASetIndexBuffer(&patchedIB);
if(soSig)
{
list->SetGraphicsRootSignature(soSig);
rs.ApplyGraphicsRootElements(list);
}
D3D12_STREAM_OUTPUT_BUFFER_VIEW view;
view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress();
view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64;
view.SizeInBytes = m_SOBufferSize - 64;
list->SOSetTargets(0, 1, &view);
list->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_POINTLIST);
list->DrawIndexedInstanced((UINT)indices.size(), action->numInstances, 0, action->baseVertex,
action->instanceOffset);
uint32_t stripCutValue = 0;
if(psoDesc.IBStripCutValue == D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFF)
stripCutValue = 0xffff;
else if(psoDesc.IBStripCutValue == D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFFFFFF)
stripCutValue = 0xffffffff;
// rebase existing index buffer to point to the right elements in our stream-out'd
// vertex buffer
for(uint32_t i = 0; i < numIndices; i++)
{
uint32_t i32 = rs.ibuffer.bytewidth == 2 ? uint32_t(idx16[i]) : idx32[i];
// preserve primitive restart indices
if(stripCutValue && i32 == stripCutValue)
continue;
if(rs.ibuffer.bytewidth == 2)
idx16[i] = uint16_t(indexRemap[i32]);
else
idx32[i] = uint32_t(indexRemap[i32]);
}
idxBuf = NULL;
if(!idxdata.empty())
{
D3D12_RESOURCE_DESC idxBufDesc;
idxBufDesc.Alignment = 0;
idxBufDesc.DepthOrArraySize = 1;
idxBufDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
idxBufDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
idxBufDesc.Format = DXGI_FORMAT_UNKNOWN;
idxBufDesc.Height = 1;
idxBufDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
idxBufDesc.MipLevels = 1;
idxBufDesc.SampleDesc.Count = 1;
idxBufDesc.SampleDesc.Quality = 0;
idxBufDesc.Width = idxdata.size();
D3D12_HEAP_PROPERTIES heapProps;
heapProps.Type = D3D12_HEAP_TYPE_UPLOAD;
heapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
heapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
heapProps.CreationNodeMask = 1;
heapProps.VisibleNodeMask = 1;
hr = m_pDevice->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &idxBufDesc,
D3D12_RESOURCE_STATE_GENERIC_READ, NULL,
__uuidof(ID3D12Resource), (void **)&idxBuf);
RDCASSERTEQUAL(hr, S_OK);
SetObjName(idxBuf, StringFormat::Fmt("PostVS idxBuf for %u", eventId));
GetDebugManager()->FillBuffer(idxBuf, 0, &idxdata[0], idxdata.size());
idxBufSize = idxdata.size();
}
}
D3D12_RESOURCE_BARRIER sobarr = {};
sobarr.Transition.pResource = m_SOBuffer;
sobarr.Transition.StateBefore = D3D12_RESOURCE_STATE_STREAM_OUT;
sobarr.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
list->ResourceBarrier(1, &sobarr);
list->CopyResource(m_SOStagingBuffer, m_SOBuffer);
// we're done with this after the copy, so we can discard it and reset
// the counter for the next stream-out
sobarr.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE;
sobarr.Transition.StateAfter = D3D12_RESOURCE_STATE_COMMON;
list->DiscardResource(m_SOBuffer, NULL);
list->ResourceBarrier(1, &sobarr);
GetDebugManager()->SetDescriptorHeaps(list, true, false);
UINT zeroes[4] = {0, 0, 0, 0};
list->ClearUnorderedAccessViewUint(GetDebugManager()->GetGPUHandle(STREAM_OUT_UAV),
GetDebugManager()->GetUAVClearHandle(STREAM_OUT_UAV),
m_SOBuffer, zeroes, 0, NULL);
list->Close();
ID3D12CommandList *l = list;
m_pDevice->GetQueue()->ExecuteCommandLists(1, &l);
m_pDevice->GPUSync();
GetDebugManager()->ResetDebugAlloc();
SAFE_RELEASE(pipe);
byte *byteData = NULL;
D3D12_RANGE range = {0, (SIZE_T)m_SOBufferSize};
hr = m_SOStagingBuffer->Map(0, &range, (void **)&byteData);
m_pDevice->CheckHRESULT(hr);
if(FAILED(hr))
{
RDCERR("Failed to map sobuffer HRESULT: %s", ToStr(hr).c_str());
ret.vsout.status = "Couldn't read back vertex output data from GPU";
SAFE_RELEASE(idxBuf);
SAFE_RELEASE(soSig);
return;
}
range.End = 0;
uint64_t numBytesWritten = *(uint64_t *)byteData;
if(numBytesWritten == 0)
{
ret = D3D12PostVSData();
SAFE_RELEASE(idxBuf);
SAFE_RELEASE(soSig);
ret.vsout.status = "Vertex output data from GPU contained no vertex data";
return;
}
// skip past the counter
byteData += 64;
uint64_t numPrims = numBytesWritten / stride;
ID3D12Resource *vsoutBuffer = NULL;
{
D3D12_RESOURCE_DESC vertBufDesc;
vertBufDesc.Alignment = 0;
vertBufDesc.DepthOrArraySize = 1;
vertBufDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
vertBufDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
vertBufDesc.Format = DXGI_FORMAT_UNKNOWN;
vertBufDesc.Height = 1;
vertBufDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
vertBufDesc.MipLevels = 1;
vertBufDesc.SampleDesc.Count = 1;
vertBufDesc.SampleDesc.Quality = 0;
vertBufDesc.Width = numBytesWritten;
D3D12_HEAP_PROPERTIES heapProps;
heapProps.Type = D3D12_HEAP_TYPE_UPLOAD;
heapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
heapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
heapProps.CreationNodeMask = 1;
heapProps.VisibleNodeMask = 1;
hr = m_pDevice->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &vertBufDesc,
D3D12_RESOURCE_STATE_GENERIC_READ, NULL,
__uuidof(ID3D12Resource), (void **)&vsoutBuffer);
RDCASSERTEQUAL(hr, S_OK);
if(vsoutBuffer)
{
SetObjName(vsoutBuffer, StringFormat::Fmt("PostVS vsoutBuffer for %u", eventId));
GetDebugManager()->FillBuffer(vsoutBuffer, 0, byteData, (size_t)numBytesWritten);
}
}
float nearp = 0.1f;
float farp = 100.0f;
Vec4f *pos0 = (Vec4f *)byteData;
bool found = false;
for(uint64_t i = 1; numPosComponents == 4 && i < numPrims; i++)
{
Vec4f *pos = (Vec4f *)(byteData + i * stride);
DeriveNearFar(*pos, *pos0, nearp, farp, found);
if(found)
break;
}
// if we didn't find anything, all z's and w's were identical.
// If the z is positive and w greater for the first element then
// we detect this projection as reversed z with infinite far plane
if(!found && pos0->z > 0.0f && pos0->w > pos0->z)
{
nearp = pos0->z;
farp = FLT_MAX;
}
m_SOStagingBuffer->Unmap(0, &range);
ret.vsout.buf = vsoutBuffer;
ret.vsout.vertStride = stride;
ret.vsout.nearPlane = nearp;
ret.vsout.farPlane = farp;
ret.vsout.bufSize = numBytesWritten;
ret.vsout.useIndices = bool(action->flags & ActionFlags::Indexed);
ret.vsout.numVerts = action->numIndices;
ret.vsout.instStride = 0;
if(action->flags & ActionFlags::Instanced)
ret.vsout.instStride = uint32_t(numBytesWritten / RDCMAX(1U, action->numInstances));
ret.vsout.idxBuf = NULL;
if(ret.vsout.useIndices && idxBuf)
{
ret.vsout.idxBuf = idxBuf;
ret.vsout.idxFmt = rs.ibuffer.bytewidth == 2 ? DXGI_FORMAT_R16_UINT : DXGI_FORMAT_R32_UINT;
ret.vsout.idxBufSize = idxBufSize;
}
ret.vsout.hasPosOut = posidx >= 0;
ret.vsout.topo = MakePrimitiveTopology(topo);
}
else
{
// empty vertex output signature
ret.vsout.buf = NULL;
ret.vsout.bufSize = ~0ULL;
ret.vsout.instStride = 0;
ret.vsout.vertStride = 0;
ret.vsout.nearPlane = 0.0f;
ret.vsout.farPlane = 0.0f;
ret.vsout.useIndices = false;
ret.vsout.hasPosOut = false;
ret.vsout.idxBuf = NULL;
ret.vsout.idxBufSize = ~0ULL;
ret.vsout.topo = MakePrimitiveTopology(topo);
}
if(lastShader)
{
ret.gsout.status.clear();
stride = 0;
posidx = -1;
numPosComponents = 0;
sodecls.clear();
for(const SigParameter &sign : lastShader->GetReflection()->OutputSig)
{
D3D12_SO_DECLARATION_ENTRY decl;
// skip streams that aren't rasterized, or if none are rasterized skip non-zero
if(psoDesc.StreamOutput.RasterizedStream == ~0U)
{
if(sign.stream != 0)
continue;
}
else
{
if(sign.stream != psoDesc.StreamOutput.RasterizedStream)
continue;
}
decl.Stream = 0;
decl.OutputSlot = 0;
decl.SemanticName = sign.semanticName.c_str();
decl.SemanticIndex = sign.semanticIndex;
decl.StartComponent = 0;
decl.ComponentCount = sign.compCount & 0xff;
if(sign.systemValue == ShaderBuiltin::Position)
{
posidx = (int)sodecls.size();
numPosComponents = decl.ComponentCount = 4;
}
stride += decl.ComponentCount * sizeof(float);
sodecls.push_back(decl);
}
// shift position attribute up to first, keeping order otherwise
// the same
if(posidx > 0)
{
D3D12_SO_DECLARATION_ENTRY pos = sodecls[posidx];
sodecls.erase(posidx);
sodecls.insert(0, pos);
}
// enable the other shader stages again
if(origPSO->DS())
psoDesc.DS = origPSO->DS()->GetDesc();
if(origPSO->HS())
psoDesc.HS = origPSO->HS()->GetDesc();
if(origPSO->GS())
psoDesc.GS = origPSO->GS()->GetDesc();
// configure new SO declarations
psoDesc.StreamOutput.NumEntries = (UINT)sodecls.size();
psoDesc.StreamOutput.pSODeclaration = &sodecls[0];
psoDesc.StreamOutput.NumStrides = 1;
psoDesc.StreamOutput.pBufferStrides = &stride;
// we're using the same topology this time
psoDesc.PrimitiveTopologyType = origPSO->graphics->PrimitiveTopologyType;
ID3D12PipelineState *pipe = NULL;
hr = m_pDevice->CreatePipeState(psoDesc, &pipe);
if(FAILED(hr))
{
SAFE_RELEASE(soSig);
ret.gsout.status = StringFormat::Fmt("Couldn't create patched graphics pipeline: HRESULT: %s",
ToStr(hr).c_str());
RDCERR("%s", ret.gsout.status.c_str());
return;
}
D3D12_STREAM_OUTPUT_BUFFER_VIEW view;
ID3D12GraphicsCommandListX *list = NULL;
view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress();
view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64;
view.SizeInBytes = m_SOBufferSize - 64;
// draws with multiple instances must be replayed one at a time so we can record the number of
// primitives from each action, as due to expansion this can vary per-instance.
if(action->numInstances > 1)
{
list = GetDebugManager()->ResetDebugList();
rs.ApplyState(m_pDevice, list);
list->SetPipelineState(pipe);
if(soSig)
{
list->SetGraphicsRootSignature(soSig);
rs.ApplyGraphicsRootElements(list);
}
view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress();
view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64;
view.SizeInBytes = m_SOBufferSize - 64;
// do a dummy draw to make sure we have enough space in the output buffer
list->SOSetTargets(0, 1, &view);
list->BeginQuery(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0);
// because the result is expanded we don't have to remap index buffers or anything
if(action->flags & ActionFlags::Indexed)
{
list->DrawIndexedInstanced(action->numIndices, action->numInstances, action->indexOffset,
action->baseVertex, action->instanceOffset);
}
else
{
list->DrawInstanced(action->numIndices, action->numInstances, action->vertexOffset,
action->instanceOffset);
}
list->EndQuery(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0);
list->ResolveQueryData(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0, 1,
m_SOStagingBuffer, 0);
list->Close();
ID3D12CommandList *l = list;
m_pDevice->GetQueue()->ExecuteCommandLists(1, &l);
m_pDevice->GPUSync();
// check that things are OK, and resize up if needed
D3D12_RANGE range;
range.Begin = 0;
range.End = (SIZE_T)sizeof(D3D12_QUERY_DATA_SO_STATISTICS);
D3D12_QUERY_DATA_SO_STATISTICS *data;
hr = m_SOStagingBuffer->Map(0, &range, (void **)&data);
m_pDevice->CheckHRESULT(hr);
if(FAILED(hr))
{
RDCERR("Couldn't get SO statistics data");
ret.gsout.status =
StringFormat::Fmt("Couldn't get stream-out statistics: HRESULT: %s", ToStr(hr).c_str());
return;
}
D3D12_QUERY_DATA_SO_STATISTICS result = *data;
range.End = 0;
m_SOStagingBuffer->Unmap(0, &range);
// reserve space for enough 'buffer filled size' locations
UINT64 SizeCounterBytes = AlignUp(uint64_t(action->numInstances * sizeof(UINT64)), 64ULL);
uint64_t outputSize = SizeCounterBytes + result.PrimitivesStorageNeeded * 3 * stride;
if(m_SOBufferSize < outputSize)
{
uint64_t oldSize = m_SOBufferSize;
m_SOBufferSize = CalcMeshOutputSize(m_SOBufferSize, outputSize);
RDCWARN("Resizing stream-out buffer from %llu to %llu for output", oldSize, m_SOBufferSize);
uint64_t newSize = m_SOBufferSize;
if(!CreateSOBuffers())
{
ret.gsout.status = StringFormat::Fmt(
"Geometry/tessellation output generated %llu bytes of data which ran out of memory",
newSize);
return;
}
}
GetDebugManager()->ResetDebugAlloc();
// now do the actual stream out
list = GetDebugManager()->ResetDebugList();
// first need to reset the counter byte values which may have either been written to above, or
// are newly created
{
D3D12_RESOURCE_BARRIER sobarr = {};
sobarr.Transition.pResource = m_SOBuffer;
sobarr.Transition.StateBefore = D3D12_RESOURCE_STATE_STREAM_OUT;
sobarr.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
list->ResourceBarrier(1, &sobarr);
GetDebugManager()->SetDescriptorHeaps(list, true, false);
UINT zeroes[4] = {0, 0, 0, 0};
list->ClearUnorderedAccessViewUint(GetDebugManager()->GetGPUHandle(STREAM_OUT_UAV),
GetDebugManager()->GetUAVClearHandle(STREAM_OUT_UAV),
m_SOBuffer, zeroes, 0, NULL);
std::swap(sobarr.Transition.StateBefore, sobarr.Transition.StateAfter);
list->ResourceBarrier(1, &sobarr);
}
rs.ApplyState(m_pDevice, list);
list->SetPipelineState(pipe);
if(soSig)
{
list->SetGraphicsRootSignature(soSig);
rs.ApplyGraphicsRootElements(list);
}
view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + SizeCounterBytes;
view.SizeInBytes = m_SOBufferSize - SizeCounterBytes;
// do incremental draws to get the output size. We have to do this O(N^2) style because
// there's no way to replay only a single instance. We have to replay 1, 2, 3, ... N instances
// and count the total number of verts each time, then we can see from the difference how much
// each instance wrote.
for(uint32_t inst = 1; inst <= action->numInstances; inst++)
{
if(action->flags & ActionFlags::Indexed)
{
view.BufferFilledSizeLocation =
m_SOBuffer->GetGPUVirtualAddress() + (inst - 1) * sizeof(UINT64);
list->SOSetTargets(0, 1, &view);
list->DrawIndexedInstanced(action->numIndices, inst, action->indexOffset,
action->baseVertex, action->instanceOffset);
}
else
{
view.BufferFilledSizeLocation =
m_SOBuffer->GetGPUVirtualAddress() + (inst - 1) * sizeof(UINT64);
list->SOSetTargets(0, 1, &view);
list->DrawInstanced(action->numIndices, inst, action->vertexOffset, action->instanceOffset);
}
// Instanced draws with a wild number of instances can hang the GPU, sync after every 1000
if((inst % 1000) == 0)
{
list->Close();
l = list;
m_pDevice->GetQueue()->ExecuteCommandLists(1, &l);
m_pDevice->GPUSync();
GetDebugManager()->ResetDebugAlloc();
list = GetDebugManager()->ResetDebugList();
rs.ApplyState(m_pDevice, list);
list->SetPipelineState(pipe);
if(soSig)
{
list->SetGraphicsRootSignature(soSig);
rs.ApplyGraphicsRootElements(list);
}
}
}
list->Close();
l = list;
m_pDevice->GetQueue()->ExecuteCommandLists(1, &l);
m_pDevice->GPUSync();
GetDebugManager()->ResetDebugAlloc();
// the last draw will have written the actual data we want into the buffer
}
else
{
// this only loops if we find from a query that we need to resize up
while(true)
{
list = GetDebugManager()->ResetDebugList();
rs.ApplyState(m_pDevice, list);
list->SetPipelineState(pipe);
if(soSig)
{
list->SetGraphicsRootSignature(soSig);
rs.ApplyGraphicsRootElements(list);
}
view.BufferFilledSizeLocation = m_SOBuffer->GetGPUVirtualAddress();
view.BufferLocation = m_SOBuffer->GetGPUVirtualAddress() + 64;
view.SizeInBytes = m_SOBufferSize - 64;
list->SOSetTargets(0, 1, &view);
list->BeginQuery(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0);
// because the result is expanded we don't have to remap index buffers or anything
if(action->flags & ActionFlags::Indexed)
{
list->DrawIndexedInstanced(action->numIndices, action->numInstances, action->indexOffset,
action->baseVertex, action->instanceOffset);
}
else
{
list->DrawInstanced(action->numIndices, action->numInstances, action->vertexOffset,
action->instanceOffset);
}
list->EndQuery(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0);
list->ResolveQueryData(m_SOQueryHeap, D3D12_QUERY_TYPE_SO_STATISTICS_STREAM0, 0, 1,
m_SOStagingBuffer, 0);
list->Close();
ID3D12CommandList *l = list;
m_pDevice->GetQueue()->ExecuteCommandLists(1, &l);
m_pDevice->GPUSync();
// check that things are OK, and resize up if needed
D3D12_RANGE range;
range.Begin = 0;
range.End = (SIZE_T)sizeof(D3D12_QUERY_DATA_SO_STATISTICS);
D3D12_QUERY_DATA_SO_STATISTICS *data;
hr = m_SOStagingBuffer->Map(0, &range, (void **)&data);
m_pDevice->CheckHRESULT(hr);
if(FAILED(hr))
{
RDCERR("Couldn't get SO statistics data");
ret.gsout.status = StringFormat::Fmt("Couldn't get stream-out statistics: HRESULT: %s",
ToStr(hr).c_str());
return;
}
uint64_t outputSize = data->PrimitivesStorageNeeded * 3 * stride;
if(m_SOBufferSize < outputSize)
{
uint64_t oldSize = m_SOBufferSize;
m_SOBufferSize = CalcMeshOutputSize(m_SOBufferSize, outputSize);
RDCWARN("Resizing stream-out buffer from %llu to %llu for output", oldSize, m_SOBufferSize);
uint64_t newSize = m_SOBufferSize;
if(!CreateSOBuffers())
{
ret.gsout.status = StringFormat::Fmt(
"Geometry/tessellation output generated %llu bytes of data which ran out of memory",
newSize);
return;
}
continue;
}
range.End = 0;
m_SOStagingBuffer->Unmap(0, &range);
GetDebugManager()->ResetDebugAlloc();
break;
}
}
list = GetDebugManager()->ResetDebugList();
D3D12_RESOURCE_BARRIER sobarr = {};
sobarr.Transition.pResource = m_SOBuffer;
sobarr.Transition.StateBefore = D3D12_RESOURCE_STATE_STREAM_OUT;
sobarr.Transition.StateAfter = D3D12_RESOURCE_STATE_COPY_SOURCE;
list->ResourceBarrier(1, &sobarr);
list->CopyResource(m_SOStagingBuffer, m_SOBuffer);
// we're done with this after the copy, so we can discard it and reset
// the counter for the next stream-out
sobarr.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_SOURCE;
sobarr.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
list->DiscardResource(m_SOBuffer, NULL);
list->ResourceBarrier(1, &sobarr);
GetDebugManager()->SetDescriptorHeaps(list, true, false);
UINT zeroes[4] = {0, 0, 0, 0};
list->ClearUnorderedAccessViewUint(GetDebugManager()->GetGPUHandle(STREAM_OUT_UAV),
GetDebugManager()->GetUAVClearHandle(STREAM_OUT_UAV),
m_SOBuffer, zeroes, 0, NULL);
list->Close();
ID3D12CommandList *l = list;
m_pDevice->GetQueue()->ExecuteCommandLists(1, &l);
m_pDevice->GPUSync();
GetDebugManager()->ResetDebugAlloc();
SAFE_RELEASE(pipe);
byte *byteData = NULL;
D3D12_RANGE range = {0, (SIZE_T)m_SOBufferSize};
hr = m_SOStagingBuffer->Map(0, &range, (void **)&byteData);
m_pDevice->CheckHRESULT(hr);
if(FAILED(hr))
{
RDCERR("Failed to map sobuffer HRESULT: %s", ToStr(hr).c_str());
ret.gsout.status = "Couldn't read back geometry/tessellation output data from GPU";
SAFE_RELEASE(soSig);
return;
}
range.End = 0;
uint64_t *counters = (uint64_t *)byteData;
uint64_t numBytesWritten = 0;
rdcarray<D3D12PostVSData::InstData> instData;
if(action->numInstances > 1)
{
uint64_t prevByteCount = 0;
for(uint32_t inst = 0; inst < action->numInstances; inst++)
{
uint64_t byteCount = counters[inst];
D3D12PostVSData::InstData d;
d.numVerts = uint32_t((byteCount - prevByteCount) / stride);
d.bufOffset = prevByteCount;
prevByteCount = byteCount;
instData.push_back(d);
}
numBytesWritten = prevByteCount;
}
else
{
numBytesWritten = counters[0];
}
if(numBytesWritten == 0)
{
SAFE_RELEASE(soSig);
ret.gsout.status = "No detectable output generated by geometry/tessellation shaders";
m_SOStagingBuffer->Unmap(0, &range);
return;
}
// skip past the counter(s)
byteData += (view.BufferLocation - m_SOBuffer->GetGPUVirtualAddress());
uint64_t numVerts = numBytesWritten / stride;
ID3D12Resource *gsoutBuffer = NULL;
{
D3D12_RESOURCE_DESC vertBufDesc;
vertBufDesc.Alignment = 0;
vertBufDesc.DepthOrArraySize = 1;
vertBufDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
vertBufDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
vertBufDesc.Format = DXGI_FORMAT_UNKNOWN;
vertBufDesc.Height = 1;
vertBufDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
vertBufDesc.MipLevels = 1;
vertBufDesc.SampleDesc.Count = 1;
vertBufDesc.SampleDesc.Quality = 0;
vertBufDesc.Width = numBytesWritten;
D3D12_HEAP_PROPERTIES heapProps;
heapProps.Type = D3D12_HEAP_TYPE_UPLOAD;
heapProps.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
heapProps.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
heapProps.CreationNodeMask = 1;
heapProps.VisibleNodeMask = 1;
hr = m_pDevice->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &vertBufDesc,
D3D12_RESOURCE_STATE_GENERIC_READ, NULL,
__uuidof(ID3D12Resource), (void **)&gsoutBuffer);
RDCASSERTEQUAL(hr, S_OK);
if(gsoutBuffer)
{
SetObjName(gsoutBuffer, StringFormat::Fmt("PostVS gsoutBuffer for %u", eventId));
GetDebugManager()->FillBuffer(gsoutBuffer, 0, byteData, (size_t)numBytesWritten);
}
}
float nearp = 0.1f;
float farp = 100.0f;
Vec4f *pos0 = (Vec4f *)byteData;
bool found = false;
for(UINT64 i = 1; numPosComponents == 4 && i < numVerts; i++)
{
Vec4f *pos = (Vec4f *)(byteData + i * stride);
DeriveNearFar(*pos, *pos0, nearp, farp, found);
if(found)
break;
}
// if we didn't find anything, all z's and w's were identical.
// If the z is positive and w greater for the first element then
// we detect this projection as reversed z with infinite far plane
if(!found && pos0->z > 0.0f && pos0->w > pos0->z)
{
nearp = pos0->z;
farp = FLT_MAX;
}
m_SOStagingBuffer->Unmap(0, &range);
ret.gsout.buf = gsoutBuffer;
ret.gsout.bufSize = numBytesWritten;
ret.gsout.instStride = 0;
if(action->flags & ActionFlags::Instanced)
ret.gsout.instStride = uint32_t(numBytesWritten / RDCMAX(1U, action->numInstances));
ret.gsout.vertStride = stride;
ret.gsout.nearPlane = nearp;
ret.gsout.farPlane = farp;
ret.gsout.useIndices = false;
ret.gsout.hasPosOut = posidx >= 0;
ret.gsout.idxBuf = NULL;
ret.gsout.idxBufSize = ~0ULL;
topo = lastShader->GetOutputTopology();
// streamout expands strips unfortunately
if(topo == D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP)
topo = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
else if(topo == D3D11_PRIMITIVE_TOPOLOGY_LINESTRIP)
topo = D3D11_PRIMITIVE_TOPOLOGY_LINELIST;
else if(topo == D3D11_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP_ADJ)
topo = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST_ADJ;
else if(topo == D3D11_PRIMITIVE_TOPOLOGY_LINESTRIP_ADJ)
topo = D3D11_PRIMITIVE_TOPOLOGY_LINELIST_ADJ;
ret.gsout.topo = MakePrimitiveTopology(topo);
ret.gsout.numVerts = (uint32_t)numVerts;
if(action->flags & ActionFlags::Instanced)
ret.gsout.numVerts /= RDCMAX(1U, action->numInstances);
ret.gsout.instData = instData;
}
SAFE_RELEASE(soSig);
}