in renderdoc/driver/shaders/dxbc/dxbc_bytecode_vendorext.cpp [186:1756]
void Program::PostprocessVendorExtensions()
{
const bool friendly = DXBC_Disassembly_FriendlyNaming();
uint32_t magicID = ~0U;
for(size_t i = 0; i < m_Declarations.size(); i++)
{
const Declaration &decl = m_Declarations[i];
if((decl.operand.indices.size() == 1 && decl.operand.indices[0].index == m_ShaderExt.second) ||
(decl.operand.indices.size() == 3 && decl.operand.indices[1].index == m_ShaderExt.second &&
decl.space == m_ShaderExt.first))
{
magicID = (uint32_t)decl.operand.indices[0].index;
m_Declarations.erase(i);
break;
}
}
// now we know the UAV, iterate the instructions looking for patterns to replace.
//
// AMD is nice and easy. Every instruction works on a scalar (vector versions repeat for each
// component) and is encoded into a single InterlockedCompareExchange on the UAV.
// So we can simply replace them in-place by decoding.
//
// NV's are not as nice. They are demarcated by IncrementCounter on the UAV so we know we'll see
// a linear stream without re-ordering, but they *can* be intermixed with other non-intrinsic
// instructions. Parameters and data are set by writing to specific offsets within the structure
//
// There are two types:
//
// Simpler, instructions that work purely on vars and not on resources. Shuffle/ballot/etc
//
// These come in the form:
// index = magicUAV.IncrementCounter()
// set params and opcode by writing to magicUAV[index].member...
// retval = magicUAV.IncrementCounter()
// [optional (see below): retval2 = magicUAV.IncrementCounter()]
//
// This type of instruction returns the result with the closing IncrementCounter(). There could be
// multiple results, so numOutputs is set before any, and then that many IncrementCounter() are
// emitted with each result.
//
// More complex, instructions that use UAVs. Mostly atomics
//
// index1 = magicUAV.IncrementCounter()
// magicUAV[index1].markUAV = 1;
// userUAV[index1] = 0; // or some variation of such
// index2 = magicUAV.IncrementCounter()
// set params and opcode as above in magicUAV[index2].member...
// retval = magicUAV[index2].dst
//
// Also note that if the shader doesn't use the return result of an atomic, the dst may never be
// read!
//
// The difficulty then is distinguishing between the two and knowing where the boundaries are.
// We do this with a simple state machine tracking where we are in an opcode:
//
// +----------> Nothing
// | v
// | |
// | IncrementCounter()
// Emit instruction |
// | v
// | Instruction >--write markUAV---> UAV instruction header
// | v (wait for other UAV write)
// | | v
// | | |
// | write opcode ] |
// | | ] |
// | v ] simple |
// | Instruction Body ] case |
// | v ] |
// | | ] |
// | IncrementCounter() ] |
// | | |
// +----<---------+ |
// | IncrementCounter()
// | |
// | UAV instruction body <------------------+
// | v
// | |
// | write opcode
// | |
// +--------------+
//
// so most state transitions are marked by an IncrementCounter(). The exceptions being
// Instruction where we wait for a write to either markUAV or opcode to move to either simple
// instruction body or to the UAV instruction header, and UAV instruction body which leaves
// when we see an opcode write.
//
// We assume that markUAV will be written BEFORE the fake UAV write. It's not entirely clear if
// this is guaranteed to not be re-ordered but it seems to be true and it's implied that NV's
// driver relies on this. This simplifies tracking since we can use it as a state transition.
//
// We also assume that multiple accesses to the UAV don't overlap. This should be guaranteed by
// the use of the index from the counter being used for access. However we don't actually check
// the index itself.
//
// all src/dst are uint4, others are all uint
// States of the pattern-matching state machine used below to recognise vendor extension
// instruction sequences that are encoded as accesses to the magic UAV.
enum class InstructionState
{
// if something goes wrong we enter this state and stop patching
Broken,
// we are not inside any vendor instruction. An IncrementCounter() on the magic UAV moves us to
// InstructionHeader.
Nothing,
// this is a state only used for AMD's UAV atomic op, which takes more parameters and uses the
// operation phases.
AMDUAVAtomic,
// this is the state when we're not sure what type we are. Either markUAV is written, in which
// case we move to UAVInstructionHeader1, or opcode is written, in which case we move to
// InstructionBody. We should see one or the other; an IncrementCounter() in this state is
// treated as an error.
//
// FP16 UAV instructions (NV_EXTN_OP_FP16_ATOMIC) that operate on float4 resources have two
// return values. Unfortunately we can't reliably detect this from the bytecode, so what
// happens is that when we see opcode get written if it's NV_EXTN_OP_FP16_ATOMIC then we jump
// straight to UAVInstructionBody and re-use the UAV instruction header from last time. We
// know this MUST be a continuation because otherwise NV_EXTN_OP_FP16_ATOMIC is always
// preceded by a UAV instruction header (via markUAV).
InstructionHeader,
// the simple (non-UAV) case after opcode has been written. Each IncrementCounter() seen here
// provides one output; once all needed outputs have been seen the vendor instruction is emitted
// and we return to Nothing.
InstructionBody,
// we move from InstructionHeader to this state when markUAV is written. The next UAV write is
// used to determine the 'target' UAV.
// We then move to header2 so we don't consume any other UAV writes.
// An IncrementCounter() in this state (before the other UAV write) is treated as an error.
UAVInstructionHeader1,
// here we do nothing but sit and wait for the IncrementCounter() so we can move to the UAV
// body state
UAVInstructionHeader2,
// in this state we aren't sure exactly when to leave it. We wait *at least* until opcode is
// written, but there may be more instructions after that to read from dst :(
// An IncrementCounter() in this state is treated as an error.
UAVInstructionBody,
};
// Identifies which member of the NV magic UAV structure a store_structured is writing. The value
// is compared against the literal offset operand of the write, so each enumerator is the offset
// of that member within the structure.
// NOTE(review): presumably these are byte offsets matching the layout of NVAPI's shader extension
// struct - confirm against the NVAPI HLSL extension headers.
// Note that the enumerators are not declared in increasing offset order: src3..src5 sit at lower
// offsets than src0..src2.
enum class NvUAVParam
{
// the NvShaderOpcode of the instruction being encoded
opcode = 0,
// source parameters, spaced 16 apart
src0 = 76,
src1 = 92,
src2 = 108,
src3 = 28,
src4 = 44,
src5 = 60,
// destination/result slot
dst = 124,
// written to flag that the following write to another UAV identifies the target UAV
markUAV = 140,
// number of results that will be returned via IncrementCounter()
numOutputs = 144,
};
InstructionState state = InstructionState::Nothing;
NvShaderOpcode nvopcode = NvShaderOpcode::Unknown;
Operand srcParam[8];
Operand dstParam[4];
Operand uavParam;
int numOutputs = 0, outputsNeeded = 0;
ToString flags = friendly ? ToString::FriendlyNameRegisters : ToString::None;
for(size_t i = 0; i < m_Instructions.size(); i++)
{
// reserve space for an added instruction so that curOp can stay valid even if we insert a new
// op. This only actually does work the first time (or after we've inserted a new
// instruction).
m_Instructions.reserve(m_Instructions.size() + 1);
Operation &curOp = m_Instructions[i];
if(state == InstructionState::Broken)
break;
if((curOp.operation == OPCODE_IMM_ATOMIC_CMP_EXCH &&
curOp.operands[1].indices[0].index == magicID) ||
(curOp.operation == OPCODE_ATOMIC_CMP_STORE && curOp.operands[0].indices[0].index == magicID))
{
// AMD operations where the return value isn't used becomes an atomic_cmp_store instead of
// imm_atomic_cmp_exch
const int32_t instructionIndex = curOp.operation == OPCODE_ATOMIC_CMP_STORE ? 1 : 2;
const int32_t param0Index = instructionIndex + 1;
const int32_t param1Index = param0Index + 1;
Operand dstOperand = curOp.operands[0];
// if we have a store there's no destination, so set it to null
if(curOp.operation == OPCODE_ATOMIC_CMP_STORE)
{
dstOperand = Operand();
dstOperand.type = TYPE_NULL;
dstOperand.setComps(0xff, 0xff, 0xff, 0xff);
}
// AMD operation
if(curOp.operands[instructionIndex].type != TYPE_IMMEDIATE32)
{
RDCERR(
"Expected literal value for AMD extension instruction. Was the shader compiled with "
"optimisations disabled?");
state = InstructionState::Broken;
break;
}
uint32_t instruction = curOp.operands[instructionIndex].values[0];
if(AMDInstruction::Magic.Get(instruction) == 5)
{
AMDInstruction::DX12Op amdop;
if(m_API == GraphicsAPI::D3D11)
amdop = AMDInstruction::convert(AMDInstruction::Opcode11.Get(instruction));
else
amdop = AMDInstruction::Opcode12.Get(instruction);
uint32_t phase = AMDInstruction::Phase.Get(instruction);
if(phase == 0)
{
srcParam[0] = curOp.operands[param0Index];
srcParam[1] = curOp.operands[param1Index];
}
else if(phase == 1)
{
srcParam[2] = curOp.operands[param0Index];
srcParam[3] = curOp.operands[param1Index];
}
else if(phase == 2)
{
srcParam[4] = curOp.operands[param0Index];
srcParam[5] = curOp.operands[param1Index];
}
else if(phase == 3)
{
srcParam[6] = curOp.operands[param0Index];
srcParam[7] = curOp.operands[param1Index];
}
Operation op;
switch(amdop)
{
case AMDInstruction::DX12Op::Readfirstlane:
{
op.operation = OPCODE_AMD_READFIRSTLANE;
op.operands.resize(2);
op.operands[0] = dstOperand;
op.operands[1].name = "src"_lit;
op.operands[1] = srcParam[0];
break;
}
case AMDInstruction::DX12Op::Readlane:
{
op.operation = OPCODE_AMD_READLANE;
op.operands.resize(3);
op.operands[0] = dstOperand;
op.operands[1].name = "src"_lit;
op.operands[1] = srcParam[0];
// lane is encoded in instruction data
op.operands[2].name = "lane"_lit;
op.operands[2].type = TYPE_IMMEDIATE32;
op.operands[2].numComponents = NUMCOMPS_1;
op.operands[2].values[0] = AMDInstruction::Data.Get(instruction);
break;
}
case AMDInstruction::DX12Op::LaneId:
{
op.operation = OPCODE_AMD_LANEID;
op.operands = {dstOperand};
break;
}
case AMDInstruction::DX12Op::Swizzle:
{
op.operation = OPCODE_AMD_SWIZZLE;
op.operands.resize(2);
op.operands[0] = dstOperand;
op.operands[1].name = "src"_lit;
op.operands[1] = srcParam[0];
break;
}
case AMDInstruction::DX12Op::Ballot:
{
if(phase == 0)
{
// srcParams already stored, store the dst for phase 0
dstParam[0] = dstOperand;
}
else if(phase == 1)
{
op.operation = OPCODE_AMD_BALLOT;
op.operands.resize(3);
op.operands[0] = dstParam[0];
op.operands[1] = dstOperand;
op.operands[2] = srcParam[0];
op.operands[2].name = "predicate"_lit;
}
break;
}
case AMDInstruction::DX12Op::MBCnt:
{
op.operation = OPCODE_AMD_MBCNT;
op.operands.resize(3);
op.operands[0] = dstOperand;
op.operands[1] = srcParam[0];
op.operands[2] = srcParam[1];
break;
}
case AMDInstruction::DX12Op::Min3U:
case AMDInstruction::DX12Op::Min3F:
case AMDInstruction::DX12Op::Med3U:
case AMDInstruction::DX12Op::Med3F:
case AMDInstruction::DX12Op::Max3U:
case AMDInstruction::DX12Op::Max3F:
{
if(phase == 0)
{
// don't need the output at all, it's just used to chain the instructions
}
else if(phase == 1)
{
switch(amdop)
{
case AMDInstruction::DX12Op::Min3U: op.operation = OPCODE_AMD_MIN3U; break;
case AMDInstruction::DX12Op::Min3F: op.operation = OPCODE_AMD_MIN3F; break;
case AMDInstruction::DX12Op::Med3U: op.operation = OPCODE_AMD_MED3U; break;
case AMDInstruction::DX12Op::Med3F: op.operation = OPCODE_AMD_MED3F; break;
case AMDInstruction::DX12Op::Max3U: op.operation = OPCODE_AMD_MAX3U; break;
case AMDInstruction::DX12Op::Max3F: op.operation = OPCODE_AMD_MAX3F; break;
default: break;
}
op.operands.resize(4);
op.operands[0] = dstOperand;
op.operands[1] = srcParam[0];
op.operands[2] = srcParam[1];
op.operands[3] = srcParam[2];
}
break;
}
case AMDInstruction::DX12Op::BaryCoord:
{
if(phase == 0)
{
// srcParams already stored, store the dst for phase 0
dstParam[0] = dstOperand;
}
else if(phase == 1)
{
if(AMDInstruction::BaryInterp.Get(instruction) != AMDInstruction::PerspPullModel)
{
// all modes except pull model have two outputs
op.operation = OPCODE_AMD_BARYCOORD;
op.operands.resize(2);
op.operands[0].name = "i"_lit;
op.operands[0] = dstParam[0];
op.operands[0].name = "j"_lit;
op.operands[1] = dstOperand;
}
else
{
dstParam[1] = dstOperand;
}
}
else if(phase == 2)
{
// all modes except pull model have two outputs
op.operation = OPCODE_AMD_BARYCOORD;
op.operands.resize(3);
op.operands[0].name = "invW"_lit;
op.operands[0] = dstParam[0];
op.operands[1].name = "invI"_lit;
op.operands[1] = dstParam[1];
op.operands[2].name = "invJ"_lit;
op.operands[2] = dstOperand;
}
break;
}
case AMDInstruction::DX12Op::VtxParam:
{
op.operation = OPCODE_AMD_VTXPARAM;
op.operands.resize(3);
op.operands[0] = dstOperand;
// vertexIndex is encoded in instruction data
op.operands[1].name = "vertexIndex"_lit;
op.operands[1].type = TYPE_IMMEDIATE32;
op.operands[1].numComponents = NUMCOMPS_1;
op.operands[1].values[0] = AMDInstruction::VtxParamVertex.Get(instruction);
// decode and pretty-ify the parameter index and component
op.operands[2].name = "parameter"_lit;
op.operands[2].type = TYPE_INPUT;
op.operands[2].numComponents = NUMCOMPS_1;
op.operands[2].indices.resize(1);
op.operands[2].indices[0].absolute = true;
op.operands[2].indices[0].index = AMDInstruction::VtxParamParameter.Get(instruction);
op.operands[2].setComps(AMDInstruction::VtxParamComponent.Get(instruction), 0xff, 0xff,
0xff);
break;
}
case AMDInstruction::DX12Op::ViewportIndex:
{
op.operation = OPCODE_AMD_GET_VIEWPORTINDEX;
op.operands = {dstOperand};
break;
}
case AMDInstruction::DX12Op::RtArraySlice:
{
op.operation = OPCODE_AMD_GET_RTARRAYSLICE;
op.operands = {dstOperand};
break;
}
case AMDInstruction::DX12Op::WaveReduce:
case AMDInstruction::DX12Op::WaveScan:
{
if(amdop == AMDInstruction::DX12Op::WaveReduce)
op.operation = OPCODE_AMD_WAVE_REDUCE;
else
op.operation = OPCODE_AMD_WAVE_SCAN;
op.preciseValues = AMDInstruction::WaveOp.Get(instruction);
break;
}
case AMDInstruction::DX12Op::LoadDwAtAddr:
{
if(phase == 0)
{
// don't need the output at all, it's just used to chain the instructions
}
else if(phase == 1)
{
op.operation = OPCODE_AMD_LOADDWATADDR;
op.operands.resize(4);
op.operands[0] = dstOperand;
op.operands[1] = srcParam[0];
op.operands[1].name = "gpuVaLoBits"_lit;
op.operands[2] = srcParam[1];
op.operands[2].name = "gpuVaHiBits"_lit;
op.operands[3] = srcParam[2];
op.operands[3].name = "offset"_lit;
}
break;
}
case AMDInstruction::DX12Op::DrawIndex:
{
op.operation = OPCODE_AMD_GET_DRAWINDEX;
op.operands = {dstOperand};
break;
}
case AMDInstruction::DX12Op::GetWaveSize:
{
op.operation = OPCODE_AMD_GET_WAVESIZE;
op.operands = {dstOperand};
break;
}
case AMDInstruction::DX12Op::BaseInstance:
{
op.operation = OPCODE_AMD_GET_BASEINSTANCE;
op.operands = {dstOperand};
break;
}
case AMDInstruction::DX12Op::BaseVertex:
{
op.operation = OPCODE_AMD_GET_BASEVERTEX;
op.operands = {dstOperand};
break;
}
case AMDInstruction::DX12Op::AtomicU64:
{
// if we're in the nothing state, move to the AMD UAV state so we watch for a UAV access
// and nop it out
if(state == InstructionState::Nothing)
state = InstructionState::AMDUAVAtomic;
VendorAtomicOp atomicop = convert(AMDInstruction::AtomicOp.Get(instruction));
op.preciseValues = atomicop;
bool isCAS = (atomicop == ATOMIC_OP_CAS);
// for CAS we have four phases, only exit the state when we're in phase 3. For all other
// instructions we have three phases so exit in phase 2.
if(phase == 3 || (phase == 2 && !isCAS))
{
op.operation = OPCODE_AMD_U64_ATOMIC;
state = InstructionState::Nothing;
// output values first
op.operands.push_back(dstParam[0]);
op.operands.push_back(dstOperand);
// then the saved UAV
op.operands.push_back(uavParam);
// then the address. This is in params [0], [1], [2]. If they all come from the same
// register we can compact this
if(srcParam[0].indices == srcParam[1].indices &&
srcParam[1].indices == srcParam[2].indices)
{
op.operands.push_back(srcParam[0]);
op.operands.back().setComps(srcParam[0].comps[0], srcParam[1].comps[0],
srcParam[2].comps[0], 0xff);
op.operands.back().name = "address"_lit;
// store in texelOffset whether the parameter is combined (1) or split (2)
op.texelOffset[0] = 1;
}
else
{
op.operands.push_back(srcParam[0]);
op.operands.back().name = "address.x"_lit;
op.operands.back().setComps(srcParam[0].comps[0], 0xff, 0xff, 0xff);
op.operands.push_back(srcParam[1]);
op.operands.back().name = "address.y"_lit;
op.operands.back().setComps(srcParam[1].comps[0], 0xff, 0xff, 0xff);
op.operands.push_back(srcParam[2]);
op.operands.back().name = "address.z"_lit;
op.operands.back().setComps(srcParam[2].comps[0], 0xff, 0xff, 0xff);
// store in texelOffset whether the parameter is combined (1) or split (2)
op.texelOffset[0] = 2;
}
// for CAS, the compare value next
if(isCAS)
{
if(srcParam[5].indices == srcParam[6].indices)
{
op.operands.push_back(srcParam[5]);
op.operands.back().setComps(srcParam[5].comps[0], srcParam[6].comps[0], 0xff, 0xff);
op.operands.back().values[1] = srcParam[6].values[0];
op.operands.back().name = "compare_value"_lit;
// store in texelOffset whether the parameter is combined (1) or split (2)
op.texelOffset[1] = 1;
}
else
{
op.operands.push_back(srcParam[5].reswizzle(0));
op.operands.back().name = "compare_value.x"_lit;
op.operands.back().setComps(srcParam[5].comps[0], 0xff, 0xff, 0xff);
op.operands.push_back(srcParam[6].reswizzle(0));
op.operands.back().name = "compare_value.y"_lit;
op.operands.back().setComps(srcParam[6].comps[0], 0xff, 0xff, 0xff);
// store in texelOffset whether the parameter is combined (1) or split (2)
op.texelOffset[1] = 2;
}
}
// then the value
if(srcParam[3].indices == srcParam[4].indices)
{
op.operands.push_back(srcParam[3]);
op.operands.back().setComps(srcParam[3].comps[0], srcParam[4].comps[0], 0xff, 0xff);
op.operands.back().values[1] = srcParam[4].values[0];
op.operands.back().name = "value"_lit;
// store in texelOffset whether the parameter is combined (1) or split (2)
op.texelOffset[2] = 1;
}
else
{
op.operands.push_back(srcParam[3].reswizzle(0));
op.operands.back().name = "value.x"_lit;
op.operands.back().setComps(srcParam[3].comps[0], 0xff, 0xff, 0xff);
op.operands.push_back(srcParam[4].reswizzle(0));
op.operands.back().name = "value.y"_lit;
op.operands.back().setComps(srcParam[4].comps[0], 0xff, 0xff, 0xff);
// store in texelOffset whether the parameter is combined (1) or split (2)
op.texelOffset[2] = 2;
}
}
// phase 0's destination is the first destination
if(phase == 0)
dstParam[0] = dstOperand;
break;
}
}
// if the operation wasn't set we're on an intermediate phase. operands were saved,
// wait until we have the full operation
if(op.operation != NUM_REAL_OPCODES)
{
op.offset = curOp.offset;
op.str = ToStr(op.operation);
if(op.operation == OPCODE_AMD_BARYCOORD)
{
switch(AMDInstruction::BaryInterp.Get(instruction))
{
case AMDInstruction::LinearCenter: op.str += "_linear_center"; break;
case AMDInstruction::LinearCentroid: op.str += "_linear_centroid"; break;
case AMDInstruction::LinearSample: op.str += "_linear_sample"; break;
case AMDInstruction::PerspCenter: op.str += "_persp_center"; break;
case AMDInstruction::PerspCentroid: op.str += "_persp_centroid"; break;
case AMDInstruction::PerspSample: op.str += "_persp_sample"; break;
case AMDInstruction::PerspPullModel: op.str += "_persp_pullmodel"; break;
default: op.str += "_unknown"; break;
}
}
else if(op.operation == OPCODE_AMD_SWIZZLE)
{
switch(AMDInstruction::SwizzleOp.Get(instruction))
{
case AMDInstruction::SwapX1: op.str += "_swap1"; break;
case AMDInstruction::SwapX2: op.str += "_swap2"; break;
case AMDInstruction::SwapX4: op.str += "_swap4"; break;
case AMDInstruction::SwapX8: op.str += "_swap8"; break;
case AMDInstruction::SwapX16: op.str += "_swap16"; break;
case AMDInstruction::ReverseX4: op.str += "_reverse4"; break;
case AMDInstruction::ReverseX8: op.str += "_reverse8"; break;
case AMDInstruction::ReverseX16: op.str += "_reverse16:"; break;
case AMDInstruction::ReverseX32: op.str += "_reverse32:"; break;
case AMDInstruction::BCastX2: op.str += "_bcast2"; break;
case AMDInstruction::BCastX4: op.str += "_bcast4"; break;
case AMDInstruction::BCastX8: op.str += "_bcast8"; break;
case AMDInstruction::BCastX16: op.str += "_bcast16"; break;
case AMDInstruction::BCastX32: op.str += "_bcast32"; break;
}
}
else if(op.operation == OPCODE_AMD_WAVE_REDUCE || op.operation == OPCODE_AMD_WAVE_SCAN)
{
switch((VendorWaveOp)op.preciseValues)
{
default: break;
case WAVE_OP_ADD_FLOAT: op.str += "_addf"; break;
case WAVE_OP_ADD_SINT: op.str += "_addi"; break;
case WAVE_OP_ADD_UINT: op.str += "_addu"; break;
case WAVE_OP_MUL_FLOAT: op.str += "_mulf"; break;
case WAVE_OP_MUL_SINT: op.str += "_muli"; break;
case WAVE_OP_MUL_UINT: op.str += "_mulu"; break;
case WAVE_OP_MIN_FLOAT: op.str += "_minf"; break;
case WAVE_OP_MIN_SINT: op.str += "_mini"; break;
case WAVE_OP_MIN_UINT: op.str += "_minu"; break;
case WAVE_OP_MAX_FLOAT: op.str += "_maxf"; break;
case WAVE_OP_MAX_SINT: op.str += "_maxi"; break;
case WAVE_OP_MAX_UINT: op.str += "_maxu"; break;
case WAVE_OP_AND: op.str += "_and"; break;
case WAVE_OP_OR: op.str += "_or"; break;
case WAVE_OP_XOR: op.str += "_xor"; break;
}
if(op.operation == OPCODE_AMD_WAVE_SCAN)
{
if(AMDInstruction::WaveOpFlags.Get(instruction) & 0x1)
op.str += "_incl";
if(AMDInstruction::WaveOpFlags.Get(instruction) & 0x2)
op.str += "_excl";
}
}
for(size_t a = 0; a < op.operands.size(); a++)
{
if(a == 0)
op.str += " ";
else
op.str += ", ";
op.str += op.operands[a].toString(m_Reflection, flags | ToString::ShowSwizzle);
}
m_Instructions.insert(i + 1, op);
}
}
else
{
RDCERR("Expected magic value of 5 in encoded AMD instruction %x", instruction);
state = InstructionState::Broken;
break;
}
if(state == InstructionState::Broken)
continue;
// remove this operation, but keep the old operation so we can undo this if things go
// wrong
curOp.stride = curOp.operation;
curOp.operation = OPCODE_VENDOR_REMOVED;
RDCCOMPILE_ASSERT(sizeof(curOp.stride) >= sizeof(curOp.operation),
"Hackily assuming stride is big enough to hold an operation");
}
else if(curOp.operation == OPCODE_IMM_ATOMIC_ALLOC &&
curOp.operands[1].indices[0].index == magicID)
{
// NV IncrementCounter()
switch(state)
{
case InstructionState::Broken:
case InstructionState::AMDUAVAtomic: break;
// in Nothing an increment marks the beginning of an instruction of some type
case InstructionState::Nothing:
{
state = InstructionState::InstructionHeader;
break;
}
case InstructionState::InstructionHeader:
{
// the transition from instruction to any other state should happen via a markUAV or
// opcode write, not with a counter increment
RDCERR(
"Expected either markUAV or opcode write before counter increment in unknown "
"instruction header!");
state = InstructionState::Broken;
break;
}
case InstructionState::InstructionBody:
{
outputsNeeded--;
if(outputsNeeded <= 0)
{
// once we've emitted all outputs, move to Nothing state
state = InstructionState::Nothing;
// and emit vendor instruction
Operation op;
switch(nvopcode)
{
case NvShaderOpcode::Shuffle:
case NvShaderOpcode::ShuffleUp:
case NvShaderOpcode::ShuffleDown:
case NvShaderOpcode::ShuffleXor:
{
if(nvopcode == NvShaderOpcode::Shuffle)
op.operation = OPCODE_NV_SHUFFLE;
else if(nvopcode == NvShaderOpcode::ShuffleUp)
op.operation = OPCODE_NV_SHUFFLE_UP;
else if(nvopcode == NvShaderOpcode::ShuffleDown)
op.operation = OPCODE_NV_SHUFFLE_DOWN;
else if(nvopcode == NvShaderOpcode::ShuffleXor)
op.operation = OPCODE_NV_SHUFFLE_XOR;
op.operands.resize(4);
op.operands[0] = curOp.operands[0];
op.operands[1].name = "value"_lit;
op.operands[1] = srcParam[0].reswizzle(0);
if(nvopcode == NvShaderOpcode::Shuffle)
op.operands[2].name = "srcLane"_lit;
else if(nvopcode == NvShaderOpcode::ShuffleXor)
op.operands[2].name = "laneMask"_lit;
else
op.operands[2].name = "delta"_lit;
op.operands[2] = srcParam[0].reswizzle(1);
op.operands[3].name = "width"_lit;
op.operands[3] = srcParam[0].reswizzle(3);
break;
}
case NvShaderOpcode::VoteAll:
case NvShaderOpcode::VoteAny:
case NvShaderOpcode::VoteBallot:
{
if(nvopcode == NvShaderOpcode::VoteAll)
op.operation = OPCODE_NV_VOTE_ALL;
else if(nvopcode == NvShaderOpcode::VoteAny)
op.operation = OPCODE_NV_VOTE_ANY;
else if(nvopcode == NvShaderOpcode::VoteBallot)
op.operation = OPCODE_NV_VOTE_BALLOT;
op.operands.resize(2);
op.operands[0] = curOp.operands[0];
op.operands[1] = srcParam[0];
op.operands[1].name = "predicate"_lit;
break;
}
case NvShaderOpcode::GetLaneId:
{
op.operation = OPCODE_NV_GET_LANEID;
op.operands = {curOp.operands[0]};
break;
}
case NvShaderOpcode::GetSpecial:
{
if(srcParam[0].type != TYPE_IMMEDIATE32)
{
RDCERR("Expected literal value for special subopcode");
state = InstructionState::Broken;
break;
}
NvShaderSpecial special = (NvShaderSpecial)srcParam[0].values[0];
if(special == NvShaderSpecial::ThreadLtMask)
{
op.operation = OPCODE_NV_GET_THREADLTMASK;
}
else if(special == NvShaderSpecial::FootprintSingleLOD)
{
op.operation = OPCODE_NV_GET_FOOTPRINT_SINGLELOD;
}
else
{
RDCERR("Unexpected special subopcode");
state = InstructionState::Broken;
break;
}
op.operands = {curOp.operands[0]};
break;
}
case NvShaderOpcode::MatchAny:
{
op.operation = OPCODE_NV_MATCH_ANY;
op.operands.resize(2);
op.operands[0] = curOp.operands[0];
op.operands[1] = srcParam[0];
// we don't need src1, it only indicates the number of components in the value,
// which we already have
break;
}
case NvShaderOpcode::GetShadingRate:
{
op.operation = OPCODE_NV_GET_SHADING_RATE;
if(dstParam[0].indices == curOp.operands[0].indices &&
dstParam[1].indices == curOp.operands[0].indices)
{
op.operands.push_back(curOp.operands[0]);
op.operands.back().name = "result"_lit;
// fixup the comps according to the shuffle
op.operands.back().setComps(
// x
dstParam[1].comps[0],
// y
dstParam[0].comps[0],
// z
curOp.operands[0].comps[0], 0xff);
}
else
{
// these are in reverse order because we read them as numOutputs was decrementing
op.operands.push_back(dstParam[1]);
op.operands.back().name = "result.x"_lit;
op.operands.push_back(dstParam[0]);
op.operands.back().name = "result.y"_lit;
// z is last
op.operands.push_back(curOp.operands[0]);
op.operands.back().name = "result.z"_lit;
}
break;
}
// all footprint ops are very similar
case NvShaderOpcode::Footprint:
case NvShaderOpcode::FootprintBias:
case NvShaderOpcode::FootprintLevel:
case NvShaderOpcode::FootprintGrad:
{
if(nvopcode == NvShaderOpcode::Footprint)
op.operation = OPCODE_NV_FOOTPRINT;
else if(nvopcode == NvShaderOpcode::FootprintBias)
op.operation = OPCODE_NV_FOOTPRINT_BIAS;
else if(nvopcode == NvShaderOpcode::FootprintLevel)
op.operation = OPCODE_NV_FOOTPRINT_LEVEL;
else if(nvopcode == NvShaderOpcode::FootprintGrad)
op.operation = OPCODE_NV_FOOTPRINT_GRAD;
// four output values, could be assigned to different registers depending on packing
// because they come back as scalars from increment counter. In general we have to
// have them separately, but see if they all neatly line up into one output first.
if(dstParam[0].indices == curOp.operands[0].indices &&
dstParam[1].indices == curOp.operands[0].indices &&
dstParam[2].indices == curOp.operands[0].indices)
{
op.operands.push_back(curOp.operands[0]);
op.operands.back().name = "result"_lit;
// fixup the comps according to the shuffle
op.operands.back().setComps(
// x
dstParam[2].comps[0],
// y
dstParam[1].comps[0],
// z
dstParam[0].comps[0],
// w
curOp.operands[0].comps[0]);
}
else
{
// these are in reverse order because we read them as numOutputs was decrementing
op.operands.push_back(dstParam[2]);
op.operands.back().name = "result.x"_lit;
op.operands.push_back(dstParam[1]);
op.operands.back().name = "result.y"_lit;
op.operands.push_back(dstParam[0]);
op.operands.back().name = "result.z"_lit;
// w is last
op.operands.push_back(curOp.operands[0]);
op.operands.back().name = "result.w"_lit;
}
// peel out the source parameters
op.operands.push_back(srcParam[3].reswizzle(0));
op.operands.back().name = "texSpace"_lit;
op.operands.push_back(srcParam[0].reswizzle(0));
op.operands.back().name = "texIndex"_lit;
op.operands.push_back(srcParam[3].reswizzle(1));
op.operands.back().name = "smpSpace"_lit;
op.operands.push_back(srcParam[0].reswizzle(1));
op.operands.back().name = "smpIndex"_lit;
op.operands.push_back(srcParam[3].reswizzle(2));
op.operands.back().name = "texType"_lit;
op.operands.push_back(srcParam[1]);
op.operands.back().comps[3] = 0xff; // location is a float3
op.operands.back().values[3] = 0;
op.operands.back().name = "location"_lit;
op.operands.push_back(srcParam[3].reswizzle(3));
op.operands.back().name = "coarse"_lit;
op.operands.push_back(srcParam[1].reswizzle(3));
op.operands.back().name = "gran"_lit;
if(nvopcode == NvShaderOpcode::FootprintBias)
{
op.operands.push_back(srcParam[2].reswizzle(0));
op.operands.back().name = "bias"_lit;
}
else if(nvopcode == NvShaderOpcode::FootprintLevel)
{
op.operands.push_back(srcParam[2].reswizzle(0));
op.operands.back().name = "lodLevel"_lit;
}
else if(nvopcode == NvShaderOpcode::FootprintGrad)
{
op.operands.push_back(srcParam[2]);
op.operands.back().name = "ddx"_lit;
op.operands.push_back(srcParam[5]);
op.operands.back().name = "ddy"_lit;
}
op.operands.push_back(srcParam[4]);
op.operands.back().name = "offset"_lit;
break;
}
case NvShaderOpcode::ShuffleGeneric:
{
op.operation = OPCODE_NV_SHUFFLE_GENERIC;
op.operands.resize(5);
// first output is the actual result
op.operands[0] = curOp.operands[0];
// second output is the laneValid we stored previously
op.operands[1] = dstParam[0];
op.operands[1].name = "out laneValid"_lit;
// we expect the params are packed into srcParam[0]
op.operands[2] = srcParam[0].reswizzle(0);
op.operands[2].name = "value"_lit;
op.operands[3] = srcParam[0].reswizzle(1);
op.operands[3].name = "srcLane"_lit;
op.operands[4] = srcParam[0].reswizzle(2);
op.operands[4].name = "width"_lit;
break;
}
case NvShaderOpcode::VPRSEvalAttribAtSample:
case NvShaderOpcode::VPRSEvalAttribSnapped:
{
if(nvopcode == NvShaderOpcode::VPRSEvalAttribAtSample)
op.operation = OPCODE_NV_VPRS_EVAL_ATTRIB_SAMPLE;
else if(nvopcode == NvShaderOpcode::VPRSEvalAttribSnapped)
op.operation = OPCODE_NV_VPRS_EVAL_ATTRIB_SNAPPED;
// up to four output values, could be assigned to different registers depending on
// packing because they come back as scalars from increment counter. In general we
// have to have them separately, but see if they all neatly line up into one output
// first.
bool allSameReg = true;
for(int o = 0; o < numOutputs - 1; o++)
{
if(!(dstParam[o].indices == curOp.operands[0].indices))
{
allSameReg = false;
break;
}
}
if(allSameReg)
{
op.operands.push_back(curOp.operands[0]);
op.operands.back().name = "result"_lit;
for(int o = 0; o < 4; o++)
{
if(o >= numOutputs)
op.operands.back().comps[o] = 0xff;
else if(o + 1 == numOutputs)
op.operands.back().comps[o] = curOp.operands[0].comps[0];
else
op.operands.back().comps[o] = dstParam[numOutputs - 2 - o].comps[0];
}
}
else
{
const char swz[] = "xyzw";
for(int o = 0; o < numOutputs - 1; o++)
{
// these are in reverse order because we read them as numOutputs was
// decrementing
op.operands.push_back(dstParam[numOutputs - 2 - o]);
op.operands.back().name = rdcstr("result.") + swz[o];
}
op.operands.push_back(curOp.operands[0]);
op.operands.back().name = rdcstr("result.") + swz[numOutputs - 1];
}
op.operands.push_back(srcParam[0]);
op.operands.back().name = "attrib"_lit;
if(nvopcode == NvShaderOpcode::VPRSEvalAttribAtSample)
{
op.operands.push_back(srcParam[1]);
op.operands.back().name = "sampleIndex"_lit;
op.operands.push_back(srcParam[2]);
op.operands.back().name = "pixelOffset"_lit;
}
else if(nvopcode == NvShaderOpcode::VPRSEvalAttribSnapped)
{
op.operands.push_back(srcParam[1]);
op.operands.back().name = "offset"_lit;
}
break;
}
default:
RDCERR("Unexpected non-UAV opcode %d.", nvopcode);
state = InstructionState::Broken;
break;
}
if(state == InstructionState::Broken)
break;
op.offset = curOp.offset;
op.str = ToStr(op.operation);
for(size_t a = 0; a < op.operands.size(); a++)
{
if(a == 0)
op.str += " ";
else
op.str += ", ";
op.str += op.operands[a].toString(m_Reflection, flags | ToString::ShowSwizzle);
}
m_Instructions.insert(i + 1, op);
}
else
{
dstParam[outputsNeeded - 1] = curOp.operands[0];
}
break;
}
case InstructionState::UAVInstructionHeader1:
{
RDCERR("Expected other UAV write before counter increment in UAV instruction header!");
state = InstructionState::Broken;
break;
}
case InstructionState::UAVInstructionHeader2:
{
// now that we've gotten the UAV, we can go to the body
state = InstructionState::UAVInstructionBody;
break;
}
case InstructionState::UAVInstructionBody:
{
RDCERR(
"Unexpected counter increment while processing UAV instruction body. Expected "
"opcode!");
state = InstructionState::Broken;
break;
}
}
if(state == InstructionState::Broken)
continue;
// remove this operation, but keep the old operation so we can undo this if things go
// wrong
curOp.stride = curOp.operation;
curOp.operation = OPCODE_VENDOR_REMOVED;
}
else if(curOp.operation == OPCODE_STORE_STRUCTURED &&
curOp.operands[0].indices[0].index == magicID)
{
if(curOp.operands[2].type != TYPE_IMMEDIATE32)
{
RDCERR("Expected literal value for UAV write offset");
state = InstructionState::Broken;
break;
}
// NV magic UAV write
NvUAVParam param = (NvUAVParam)curOp.operands[2].values[0];
switch(param)
{
case NvUAVParam::opcode:
{
if(curOp.operands[3].type != TYPE_IMMEDIATE32)
{
RDCERR(
"Expected literal value being written as opcode. Was the shader compiled with "
"optimisations disabled?");
state = InstructionState::Broken;
break;
}
nvopcode = (NvShaderOpcode)curOp.operands[3].values[0];
// if this is NV_EXTN_OP_FP16_ATOMIC we should have come here in UAVInstructionBody.
// That we're here now means this is the continuation of an earlier instruction.
if(state == InstructionState::InstructionHeader && nvopcode == NvShaderOpcode::FP16Atomic)
state = InstructionState::UAVInstructionBody;
// if we're in instruction, this is the simple case so move to the output
if(state == InstructionState::InstructionHeader)
{
// if we haven't gotten a number of outputs at all, set it to 1
if(outputsNeeded <= 0)
numOutputs = outputsNeeded = 1;
state = InstructionState::InstructionBody;
}
else if(state == InstructionState::UAVInstructionBody)
{
// emit the instruction now, writing to the index register (which we know is
// 'unused'). There might be nothing to read the result value. We'll look out for
// loads and post-patch it.
// once we've emitted all outputs, move to Nothing state
state = InstructionState::Nothing;
// and emit vendor instruction
Operation op;
// write to the index register at first. If there's a subsequent read of dst we'll patch
// this instruction with the destination for that.
op.operands.push_back(curOp.operands[1]);
// also include the UAV we noted elsewhere
op.operands.push_back(uavParam);
NvShaderAtomic atomicop = NvShaderAtomic::Unknown;
switch(nvopcode)
{
case NvShaderOpcode::FP16Atomic:
{
op.operation = OPCODE_NV_FP16_ATOMIC;
if(srcParam[2].type != TYPE_IMMEDIATE32)
{
RDCERR(
"Expected literal value as atomic opcode. Was the shader compiled with "
"optimisations disabled?");
state = InstructionState::Broken;
break;
}
atomicop = (NvShaderAtomic)srcParam[2].values[0];
op.operands.push_back(srcParam[0]);
op.operands.back().name = "address"_lit;
op.operands.push_back(srcParam[1]);
op.operands.back().name = "value"_lit;
break;
}
case NvShaderOpcode::FP32Atomic:
{
op.operation = OPCODE_NV_FP32_ATOMIC;
if(srcParam[2].type != TYPE_IMMEDIATE32)
{
RDCERR(
"Expected literal value as atomic opcode. Was the shader compiled with "
"optimisations disabled?");
state = InstructionState::Broken;
break;
}
atomicop = (NvShaderAtomic)srcParam[2].values[0];
op.operands.push_back(srcParam[0].reswizzle(0));
op.operands.back().name = "byteAddress"_lit;
op.operands.push_back(srcParam[1].reswizzle(0));
op.operands.back().name = "value"_lit;
break;
}
case NvShaderOpcode::U64Atomic:
{
op.operation = OPCODE_NV_U64_ATOMIC;
if(srcParam[2].type != TYPE_IMMEDIATE32)
{
RDCERR(
"Expected literal value as atomic opcode. Was the shader compiled with "
"optimisations disabled?");
state = InstructionState::Broken;
break;
}
// insert second dummy return value for high bits
op.operands.insert(0, curOp.operands[1]);
// make both of them NULL
op.operands[0].type = TYPE_NULL;
op.operands[0].setComps(0xff, 0xff, 0xff, 0xff);
op.operands[1].type = TYPE_NULL;
op.operands[1].setComps(0xff, 0xff, 0xff, 0xff);
atomicop = (NvShaderAtomic)srcParam[2].values[0];
op.operands.push_back(srcParam[0]);
op.operands.back().numComponents = NUMCOMPS_1;
op.operands.back().name = "address"_lit;
// store in texelOffset whether the parameter is combined (1) or split (2).
// on nv we assume the parameters are always combined
op.texelOffset[0] = 1;
op.texelOffset[1] = 1;
op.texelOffset[2] = 1;
if(atomicop == NvShaderAtomic::CompareAndSwap)
{
op.operands.push_back(srcParam[1]);
op.operands.back().numComponents = NUMCOMPS_4;
op.operands.back().setComps(srcParam[1].comps[0], srcParam[1].comps[1], 0xff, 0xff);
op.operands.back().values[1] = srcParam[1].values[1];
op.operands.back().name = "compareValue"_lit;
op.operands.push_back(srcParam[1]);
op.operands.back().numComponents = NUMCOMPS_4;
op.operands.back().setComps(srcParam[1].comps[2], srcParam[1].comps[3], 0xff, 0xff);
op.operands.back().values[1] = srcParam[1].values[3];
op.operands.back().name = "value"_lit;
}
else
{
op.operands.push_back(srcParam[1]);
op.operands.back().numComponents = NUMCOMPS_4;
op.operands.back().setComps(srcParam[1].comps[0], srcParam[1].comps[1], 0xff, 0xff);
op.operands.back().values[1] = srcParam[1].values[1];
op.operands.back().name = "value"_lit;
}
break;
}
default:
RDCERR("Unexpected UAV opcode %d.", nvopcode);
state = InstructionState::Broken;
break;
}
if(state == InstructionState::Broken)
break;
if(atomicop == NvShaderAtomic::Unknown)
{
RDCERR("Couldn't determine atomic op");
state = InstructionState::Broken;
break;
}
op.offset = curOp.offset;
op.preciseValues = (uint8_t)atomicop;
op.str = ToStr(op.operation);
switch(atomicop)
{
case NvShaderAtomic::Unknown: break;
case NvShaderAtomic::And:
op.str += "_and";
op.preciseValues = ATOMIC_OP_AND;
break;
case NvShaderAtomic::Or:
op.str += "_or";
op.preciseValues = ATOMIC_OP_OR;
break;
case NvShaderAtomic::Xor:
op.str += "_xor";
op.preciseValues = ATOMIC_OP_XOR;
break;
case NvShaderAtomic::Add:
op.str += "_add";
op.preciseValues = ATOMIC_OP_ADD;
break;
case NvShaderAtomic::Max:
op.str += "_max";
op.preciseValues = ATOMIC_OP_MAX;
break;
case NvShaderAtomic::Min:
op.str += "_min";
op.preciseValues = ATOMIC_OP_MIN;
break;
case NvShaderAtomic::Swap:
op.str += "_swap";
op.preciseValues = ATOMIC_OP_SWAP;
break;
case NvShaderAtomic::CompareAndSwap:
op.str += "_comp_swap";
op.preciseValues = ATOMIC_OP_CAS;
break;
}
for(size_t a = 0; a < op.operands.size(); a++)
{
if(a == 0)
op.str += " ";
else
op.str += ", ";
op.str += op.operands[a].toString(m_Reflection, flags | ToString::ShowSwizzle);
}
m_Instructions.insert(i + 1, op);
// move into nothing state
state = InstructionState::Nothing;
}
else
{
// no other state should be writing an opcode.
RDCERR("Writing opcode in unexpected state %d.", state);
state = InstructionState::Broken;
}
break;
}
case NvUAVParam::markUAV:
{
if(curOp.operands[3].type != TYPE_IMMEDIATE32 || curOp.operands[3].values[0] != 1)
{
RDCERR(
"Expected literal 1 being written to markUAV. Was the shader compiled with "
"optimisations disabled?");
state = InstructionState::Broken;
break;
}
if(state == InstructionState::InstructionHeader)
{
// start waiting for the user's UAV write
state = InstructionState::UAVInstructionHeader1;
}
else
{
// no other state should be writing markUAV.
RDCERR("Writing markUAV in unexpected state %d.", state);
state = InstructionState::Broken;
}
break;
}
// store the src params unconditionally, don't care about the state.
case NvUAVParam::src0:
{
srcParam[0] = curOp.operands[3];
break;
}
case NvUAVParam::src1:
{
srcParam[1] = curOp.operands[3];
break;
}
case NvUAVParam::src2:
{
srcParam[2] = curOp.operands[3];
break;
}
case NvUAVParam::src3:
{
srcParam[3] = curOp.operands[3];
break;
}
case NvUAVParam::src4:
{
srcParam[4] = curOp.operands[3];
break;
}
case NvUAVParam::src5:
{
srcParam[5] = curOp.operands[3];
break;
}
case NvUAVParam::dst:
{
RDCERR("Unexpected store to dst");
state = InstructionState::Broken;
break;
}
case NvUAVParam::numOutputs:
{
if(curOp.operands[3].type != TYPE_IMMEDIATE32)
{
RDCERR(
"Expected literal value being written as numOutputs. Was the shader compiled "
"with optimisations disabled?");
state = InstructionState::Broken;
break;
}
if(state == InstructionState::InstructionHeader ||
state == InstructionState::InstructionBody)
{
// allow writing the number of outputs in either the header or the body (before or after
// the simple opcode)
numOutputs = outputsNeeded = (int)curOp.operands[3].values[0];
}
else
{
// no other state should be writing numOutputs.
RDCERR("Writing numOutputs in unexpected state %d.", state);
state = InstructionState::Broken;
}
break;
}
default:
{
RDCERR("Unexpected offset %u in nvidia magic UAV write.", param);
state = InstructionState::Broken;
break;
}
}
if(state == InstructionState::Broken)
continue;
// remove this operation, but keep the old operation so we can undo this if things go
// wrong
curOp.stride = curOp.operation;
curOp.operation = OPCODE_VENDOR_REMOVED;
}
else if(curOp.operation == OPCODE_LD_STRUCTURED && curOp.operands[3].indices[0].index == magicID)
{
// NV magic UAV load. This should only be of dst, and only in the Nothing state after we've
// emitted a UAV instruction.
if(state == InstructionState::Nothing)
{
if(curOp.operands[2].type == TYPE_IMMEDIATE32)
{
// NV magic UAV read
NvUAVParam param = (NvUAVParam)curOp.operands[2].values[0];
if(param == NvUAVParam::dst)
{
// search backwards for the most recent vendor operation. That's the one we're reading
// from.
for(size_t j = i; j > 0; j--)
{
if(m_Instructions[j].operation >= OPCODE_VENDOR_FIRST)
{
// re-emit the instruction writing to the actual output now
Operation op = m_Instructions[j];
op.offset = curOp.offset;
op.operands[0] = curOp.operands[0];
op.str = ToStr(op.operation);
// if this is an atomic64, the low/high bits are separate operands
if(op.operation == OPCODE_NV_U64_ATOMIC)
{
op.operands[1] = curOp.operands[0];
op.operands[0].setComps(curOp.operands[0].comps[0], 0xff, 0xff, 0xff);
op.operands[1].setComps(curOp.operands[0].comps[1], 0xff, 0xff, 0xff);
}
switch((VendorAtomicOp)op.preciseValues)
{
case ATOMIC_OP_NONE: break;
case ATOMIC_OP_AND: op.str += "_and"; break;
case ATOMIC_OP_OR: op.str += "_or"; break;
case ATOMIC_OP_XOR: op.str += "_xor"; break;
case ATOMIC_OP_ADD: op.str += "_add"; break;
case ATOMIC_OP_MAX: op.str += "_max"; break;
case ATOMIC_OP_MIN: op.str += "_min"; break;
case ATOMIC_OP_SWAP: op.str += "_swap"; break;
case ATOMIC_OP_CAS: op.str += "_comp_swap"; break;
}
for(size_t a = 0; a < op.operands.size(); a++)
{
if(a == 0)
op.str += " ";
else
op.str += ", ";
op.str += op.operands[a].toString(m_Reflection, flags | ToString::ShowSwizzle);
}
m_Instructions.insert(i + 1, op);
// remove the old one, we've replaced it
m_Instructions[j].operation = OPCODE_VENDOR_REMOVED;
// if we break and try to revert this one, keep it removed
m_Instructions[j].stride = OPCODE_VENDOR_REMOVED;
// also remove the current one, but back up the original in case something goes wrong
curOp.stride = curOp.operation;
curOp.operation = OPCODE_VENDOR_REMOVED;
break;
}
}
}
else
{
RDCERR("Unexpected read of UAV at offset %d instead of dst (%d)", param, NvUAVParam::dst);
state = InstructionState::Broken;
}
}
else
{
RDCERR("Expected literal value for UAV read offset");
state = InstructionState::Broken;
}
}
else
{
RDCERR("Unexpected UAV read in state %d.", state);
state = InstructionState::Broken;
}
}
else if(state == InstructionState::UAVInstructionHeader1)
{
// while we're here the next UAV write is snooped
if(curOp.operation == OPCODE_STORE_RAW || curOp.operation == OPCODE_STORE_UAV_TYPED)
{
uavParam = curOp.operands[0];
state = InstructionState::UAVInstructionHeader2;
// remove this operation, but keep the old operation so we can undo this if things go
// wrong
curOp.stride = curOp.operation;
curOp.operation = OPCODE_VENDOR_REMOVED;
}
}
else if(state == InstructionState::AMDUAVAtomic)
{
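// similarly for AMD we store the UAV referenced, but we don't change state - that happens
// when we see the appropriate phase instruction.
// NOTE(review): the store branch below does assign state = UAVInstructionHeader2, which
// contradicts the statement above - confirm which of the two is intended.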
if(curOp.operation == OPCODE_STORE_RAW || curOp.operation == OPCODE_STORE_UAV_TYPED)
{
uavParam = curOp.operands[0];
state = InstructionState::UAVInstructionHeader2;
// remove this operation, but keep the old operation so we can undo this if things go
// wrong
curOp.stride = curOp.operation;
curOp.operation = OPCODE_VENDOR_REMOVED;
}
}
// any other operation we completely ignore
}
if(state == InstructionState::Broken)
{
// if we broke, restore the operations and remove any added vendor operations
for(size_t i = 0; i < m_Instructions.size(); i++)
{
if(m_Instructions[i].operation == OPCODE_VENDOR_REMOVED)
m_Instructions[i].operation = (OpcodeType)m_Instructions[i].stride;
else if(m_Instructions[i].operation >= OPCODE_VENDOR_FIRST)
m_Instructions[i].operation = OPCODE_VENDOR_REMOVED;
}
}
// erase any OPCODE_VENDOR_REMOVED instructions now
for(int32_t i = m_Instructions.count() - 1; i >= 0; i--)
{
if(m_Instructions[i].operation == OPCODE_VENDOR_REMOVED)
m_Instructions.erase(i);
}
}