in renderdoc/driver/shaders/dxbc/dxbc_debug.cpp [1956:4504]
void ThreadState::StepNext(ShaderDebugState *state, DebugAPIWrapper *apiWrapper,
const rdcarray<ThreadState> &prevWorkgroup)
{
if(nextInstruction >= program->GetNumInstructions())
return;
const Operation &op = program->GetInstruction((size_t)nextInstruction);
apiWrapper->SetCurrentInstruction(nextInstruction);
nextInstruction++;
if(nextInstruction >= program->GetNumInstructions())
nextInstruction--;
if(state && debug)
{
const Operation &nextOp = program->GetInstruction((size_t)nextInstruction);
debug->GetCallstack(nextInstruction, nextOp.offset, state->callstack);
}
rdcarray<ShaderVariable> srcOpers;
VarType optype = OperationType(op.operation);
for(size_t i = 1; i < op.operands.size(); i++)
srcOpers.push_back(GetSrc(op.operands[i], op));
switch(op.operation)
{
/////////////////////////////////////////////////////////////////////////////////////////////////////
// Math operations
case OPCODE_DADD:
case OPCODE_IADD:
case OPCODE_ADD:
SetDst(state, op.operands[0], op, add(srcOpers[0], srcOpers[1], optype));
break;
case OPCODE_DDIV:
case OPCODE_DIV:
SetDst(state, op.operands[0], op, div(srcOpers[0], srcOpers[1], optype));
break;
case OPCODE_UDIV:
{
ShaderVariable quot("", (uint32_t)0xffffffff, (uint32_t)0xffffffff, (uint32_t)0xffffffff,
(uint32_t)0xffffffff);
ShaderVariable rem("", (uint32_t)0xffffffff, (uint32_t)0xffffffff, (uint32_t)0xffffffff,
(uint32_t)0xffffffff);
for(size_t i = 0; i < 4; i++)
{
if(srcOpers[2].value.u32v[i] != 0)
{
quot.value.u32v[i] = srcOpers[1].value.u32v[i] / srcOpers[2].value.u32v[i];
rem.value.u32v[i] =
srcOpers[1].value.u32v[i] - (quot.value.u32v[i] * srcOpers[2].value.u32v[i]);
}
else
{
if(state)
state->flags |= ShaderEvents::GeneratedNanOrInf;
}
}
if(op.operands[0].type != TYPE_NULL)
{
SetDst(state, op.operands[0], op, quot);
}
if(op.operands[1].type != TYPE_NULL)
{
SetDst(state, op.operands[1], op, rem);
}
break;
}
case OPCODE_BFREV:
{
ShaderVariable ret("", 0U, 0U, 0U, 0U);
for(size_t i = 0; i < 4; i++)
{
ret.value.u32v[i] = BitwiseReverseLSB16(srcOpers[0].value.u32v[i]);
}
SetDst(state, op.operands[0], op, ret);
break;
}
case OPCODE_COUNTBITS:
{
ShaderVariable ret("", 0U, 0U, 0U, 0U);
for(size_t i = 0; i < 4; i++)
{
ret.value.u32v[i] = PopCount(srcOpers[0].value.u32v[i]);
}
SetDst(state, op.operands[0], op, ret);
break;
}
case OPCODE_FIRSTBIT_HI:
{
ShaderVariable ret("", 0U, 0U, 0U, 0U);
for(size_t i = 0; i < 4; i++)
{
unsigned char found = BitScanReverse((DWORD *)&ret.value.u32v[i], srcOpers[0].value.u32v[i]);
if(found == 0)
{
ret.value.u32v[i] = ~0U;
}
else
{
// firstbit_hi counts index 0 as the MSB, BitScanReverse counts index 0 as the LSB. So we
// need to invert
ret.value.u32v[i] = 31 - ret.value.u32v[i];
}
}
SetDst(state, op.operands[0], op, ret);
break;
}
case OPCODE_FIRSTBIT_LO:
{
ShaderVariable ret("", 0U, 0U, 0U, 0U);
for(size_t i = 0; i < 4; i++)
{
unsigned char found = BitScanForward((DWORD *)&ret.value.u32v[i], srcOpers[0].value.u32v[i]);
if(found == 0)
ret.value.u32v[i] = ~0U;
}
SetDst(state, op.operands[0], op, ret);
break;
}
case OPCODE_FIRSTBIT_SHI:
{
ShaderVariable ret("", 0U, 0U, 0U, 0U);
for(size_t i = 0; i < 4; i++)
{
uint32_t u = srcOpers[0].value.u32v[i];
if(srcOpers[0].value.s32v[i] < 0)
u = ~u;
unsigned char found = BitScanReverse((DWORD *)&ret.value.u32v[i], u);
if(found == 0)
{
ret.value.u32v[i] = ~0U;
}
else
{
// firstbit_shi counts index 0 as the MSB, BitScanReverse counts index 0 as the LSB. So we
// need to invert
ret.value.u32v[i] = 31 - ret.value.u32v[i];
}
}
SetDst(state, op.operands[0], op, ret);
break;
}
case OPCODE_IMUL:
case OPCODE_UMUL:
{
ShaderVariable hi("", 0U, 0U, 0U, 0U);
ShaderVariable lo("", 0U, 0U, 0U, 0U);
for(size_t i = 0; i < 4; i++)
{
if(op.operation == OPCODE_UMUL)
{
uint64_t res = uint64_t(srcOpers[1].value.u32v[i]) * uint64_t(srcOpers[2].value.u32v[i]);
hi.value.u32v[i] = uint32_t((res >> 32) & 0xffffffff);
lo.value.u32v[i] = uint32_t(res & 0xffffffff);
}
else if(op.operation == OPCODE_IMUL)
{
int64_t res = int64_t(srcOpers[1].value.s32v[i]) * int64_t(srcOpers[2].value.s32v[i]);
hi.value.u32v[i] = uint32_t((res >> 32) & 0xffffffff);
lo.value.u32v[i] = uint32_t(res & 0xffffffff);
}
}
if(op.operands[0].type != TYPE_NULL)
{
SetDst(state, op.operands[0], op, hi);
}
if(op.operands[1].type != TYPE_NULL)
{
SetDst(state, op.operands[1], op, lo);
}
break;
}
case OPCODE_DMUL:
case OPCODE_MUL:
SetDst(state, op.operands[0], op, mul(srcOpers[0], srcOpers[1], optype));
break;
case OPCODE_UADDC:
{
uint64_t src[4];
for(int i = 0; i < 4; i++)
src[i] = (uint64_t)srcOpers[1].value.u32v[i];
for(int i = 0; i < 4; i++)
src[i] = (uint64_t)srcOpers[2].value.u32v[i];
// set the rounded result
uint32_t dst[4];
for(int i = 0; i < 4; i++)
dst[i] = (uint32_t)(src[i] & 0xffffffff);
SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), dst[0], dst[1], dst[2], dst[3]));
// if not null, set the carry bits
if(op.operands[1].type != TYPE_NULL)
SetDst(state, op.operands[1], op,
ShaderVariable(rdcstr(), src[0] > 0xffffffff ? 1U : 0U, src[1] > 0xffffffff ? 1U : 0U,
src[2] > 0xffffffff ? 1U : 0U, src[3] > 0xffffffff ? 1U : 0U));
break;
}
case OPCODE_USUBB:
{
uint64_t src0[4];
uint64_t src1[4];
// add on a 'borrow' bit
for(int i = 0; i < 4; i++)
src0[i] = 0x100000000 | (uint64_t)srcOpers[1].value.u32v[i];
for(int i = 0; i < 4; i++)
src1[i] = (uint64_t)srcOpers[2].value.u32v[i];
// do the subtract
uint64_t result[4];
for(int i = 0; i < 4; i++)
result[i] = src0[i] - src1[i];
uint32_t dst[4];
for(int i = 0; i < 4; i++)
dst[i] = (uint32_t)(result[0] & 0xffffffff);
SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), dst[0], dst[1], dst[2], dst[3]));
// if not null, mark where the borrow bits were used
if(op.operands[1].type != TYPE_NULL)
SetDst(state, op.operands[1], op,
ShaderVariable(rdcstr(), result[0] <= 0xffffffff ? 1U : 0U,
result[1] <= 0xffffffff ? 1U : 0U, result[2] <= 0xffffffff ? 1U : 0U,
result[3] <= 0xffffffff ? 1U : 0U));
break;
}
case OPCODE_IMAD:
case OPCODE_UMAD:
case OPCODE_MAD:
case OPCODE_DFMA:
SetDst(state, op.operands[0], op,
add(mul(srcOpers[0], srcOpers[1], optype), srcOpers[2], optype));
break;
case OPCODE_DP2:
case OPCODE_DP3:
case OPCODE_DP4:
{
ShaderVariable dot = mul(srcOpers[0], srcOpers[1], optype);
float sum = dot.value.f32v[0];
sum += dot.value.f32v[1];
if(op.operation >= OPCODE_DP3)
sum += dot.value.f32v[2];
if(op.operation >= OPCODE_DP4)
sum += dot.value.f32v[3];
SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), sum, sum, sum, sum));
break;
}
case OPCODE_F16TOF32:
{
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
flush_denorm(ConvertFromHalf(srcOpers[0].value.u32v[0] & 0xffff)),
flush_denorm(ConvertFromHalf(srcOpers[0].value.u32v[1] & 0xffff)),
flush_denorm(ConvertFromHalf(srcOpers[0].value.u32v[2] & 0xffff)),
flush_denorm(ConvertFromHalf(srcOpers[0].value.u32v[3] & 0xffff))));
break;
}
case OPCODE_F32TOF16:
{
SetDst(
state, op.operands[0], op,
ShaderVariable(rdcstr(), (uint32_t)ConvertToHalf(flush_denorm(srcOpers[0].value.f32v[0])),
(uint32_t)ConvertToHalf(flush_denorm(srcOpers[0].value.f32v[1])),
(uint32_t)ConvertToHalf(flush_denorm(srcOpers[0].value.f32v[2])),
(uint32_t)ConvertToHalf(flush_denorm(srcOpers[0].value.f32v[3]))));
break;
}
case OPCODE_FRC:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), srcOpers[0].value.f32v[0] - floorf(srcOpers[0].value.f32v[0]),
srcOpers[0].value.f32v[1] - floorf(srcOpers[0].value.f32v[1]),
srcOpers[0].value.f32v[2] - floorf(srcOpers[0].value.f32v[2]),
srcOpers[0].value.f32v[3] - floorf(srcOpers[0].value.f32v[3])));
break;
// positive infinity
case OPCODE_ROUND_PI:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), ceilf(srcOpers[0].value.f32v[0]),
ceilf(srcOpers[0].value.f32v[1]), ceilf(srcOpers[0].value.f32v[2]),
ceilf(srcOpers[0].value.f32v[3])));
break;
// negative infinity
case OPCODE_ROUND_NI:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), floorf(srcOpers[0].value.f32v[0]),
floorf(srcOpers[0].value.f32v[1]), floorf(srcOpers[0].value.f32v[2]),
floorf(srcOpers[0].value.f32v[3])));
break;
// towards zero
case OPCODE_ROUND_Z:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
srcOpers[0].value.f32v[0] < 0 ? ceilf(srcOpers[0].value.f32v[0])
: floorf(srcOpers[0].value.f32v[0]),
srcOpers[0].value.f32v[1] < 0 ? ceilf(srcOpers[0].value.f32v[1])
: floorf(srcOpers[0].value.f32v[1]),
srcOpers[0].value.f32v[2] < 0 ? ceilf(srcOpers[0].value.f32v[2])
: floorf(srcOpers[0].value.f32v[2]),
srcOpers[0].value.f32v[3] < 0 ? ceilf(srcOpers[0].value.f32v[3])
: floorf(srcOpers[0].value.f32v[3])));
break;
// to nearest even int (banker's rounding)
case OPCODE_ROUND_NE:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), round_ne(srcOpers[0].value.f32v[0]),
round_ne(srcOpers[0].value.f32v[1]), round_ne(srcOpers[0].value.f32v[2]),
round_ne(srcOpers[0].value.f32v[3])));
break;
case OPCODE_INEG: SetDst(state, op.operands[0], op, neg(srcOpers[0], optype)); break;
case OPCODE_IMIN:
SetDst(state, op.operands[0], op,
ShaderVariable(
"",
srcOpers[0].value.s32v[0] < srcOpers[1].value.s32v[0] ? srcOpers[0].value.s32v[0]
: srcOpers[1].value.s32v[0],
srcOpers[0].value.s32v[1] < srcOpers[1].value.s32v[1] ? srcOpers[0].value.s32v[1]
: srcOpers[1].value.s32v[1],
srcOpers[0].value.s32v[2] < srcOpers[1].value.s32v[2] ? srcOpers[0].value.s32v[2]
: srcOpers[1].value.s32v[2],
srcOpers[0].value.s32v[3] < srcOpers[1].value.s32v[3] ? srcOpers[0].value.s32v[3]
: srcOpers[1].value.s32v[3]));
break;
case OPCODE_UMIN:
SetDst(state, op.operands[0], op,
ShaderVariable(
"",
srcOpers[0].value.u32v[0] < srcOpers[1].value.u32v[0] ? srcOpers[0].value.u32v[0]
: srcOpers[1].value.u32v[0],
srcOpers[0].value.u32v[1] < srcOpers[1].value.u32v[1] ? srcOpers[0].value.u32v[1]
: srcOpers[1].value.u32v[1],
srcOpers[0].value.u32v[2] < srcOpers[1].value.u32v[2] ? srcOpers[0].value.u32v[2]
: srcOpers[1].value.u32v[2],
srcOpers[0].value.u32v[3] < srcOpers[1].value.u32v[3] ? srcOpers[0].value.u32v[3]
: srcOpers[1].value.u32v[3]));
break;
case OPCODE_DMIN:
{
double src0[2], src1[2];
DoubleGet(srcOpers[0], src0);
DoubleGet(srcOpers[1], src1);
double dst[2];
dst[0] = dxbc_min(src0[0], src1[0]);
dst[1] = dxbc_min(src0[1], src1[1]);
ShaderVariable r("", 0U, 0U, 0U, 0U);
DoubleSet(r, dst);
SetDst(state, op.operands[0], op, r);
break;
}
case OPCODE_MIN:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), dxbc_min(srcOpers[0].value.f32v[0], srcOpers[1].value.f32v[0]),
dxbc_min(srcOpers[0].value.f32v[1], srcOpers[1].value.f32v[1]),
dxbc_min(srcOpers[0].value.f32v[2], srcOpers[1].value.f32v[2]),
dxbc_min(srcOpers[0].value.f32v[3], srcOpers[1].value.f32v[3])));
break;
case OPCODE_UMAX:
SetDst(state, op.operands[0], op,
ShaderVariable(
"",
srcOpers[0].value.u32v[0] >= srcOpers[1].value.u32v[0] ? srcOpers[0].value.u32v[0]
: srcOpers[1].value.u32v[0],
srcOpers[0].value.u32v[1] >= srcOpers[1].value.u32v[1] ? srcOpers[0].value.u32v[1]
: srcOpers[1].value.u32v[1],
srcOpers[0].value.u32v[2] >= srcOpers[1].value.u32v[2] ? srcOpers[0].value.u32v[2]
: srcOpers[1].value.u32v[2],
srcOpers[0].value.u32v[3] >= srcOpers[1].value.u32v[3] ? srcOpers[0].value.u32v[3]
: srcOpers[1].value.u32v[3]));
break;
case OPCODE_IMAX:
SetDst(state, op.operands[0], op,
ShaderVariable(
"",
srcOpers[0].value.s32v[0] >= srcOpers[1].value.s32v[0] ? srcOpers[0].value.s32v[0]
: srcOpers[1].value.s32v[0],
srcOpers[0].value.s32v[1] >= srcOpers[1].value.s32v[1] ? srcOpers[0].value.s32v[1]
: srcOpers[1].value.s32v[1],
srcOpers[0].value.s32v[2] >= srcOpers[1].value.s32v[2] ? srcOpers[0].value.s32v[2]
: srcOpers[1].value.s32v[2],
srcOpers[0].value.s32v[3] >= srcOpers[1].value.s32v[3] ? srcOpers[0].value.s32v[3]
: srcOpers[1].value.s32v[3]));
break;
case OPCODE_DMAX:
{
double src0[2], src1[2];
DoubleGet(srcOpers[0], src0);
DoubleGet(srcOpers[1], src1);
double dst[2];
dst[0] = dxbc_max(src0[0], src1[0]);
dst[1] = dxbc_max(src0[1], src1[1]);
ShaderVariable r("", 0U, 0U, 0U, 0U);
DoubleSet(r, dst);
SetDst(state, op.operands[0], op, r);
break;
}
case OPCODE_MAX:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), dxbc_max(srcOpers[0].value.f32v[0], srcOpers[1].value.f32v[0]),
dxbc_max(srcOpers[0].value.f32v[1], srcOpers[1].value.f32v[1]),
dxbc_max(srcOpers[0].value.f32v[2], srcOpers[1].value.f32v[2]),
dxbc_max(srcOpers[0].value.f32v[3], srcOpers[1].value.f32v[3])));
break;
case OPCODE_SQRT:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), sqrtf(srcOpers[0].value.f32v[0]),
sqrtf(srcOpers[0].value.f32v[1]), sqrtf(srcOpers[0].value.f32v[2]),
sqrtf(srcOpers[0].value.f32v[3])));
break;
case OPCODE_DRCP:
{
double ds[2] = {0.0, 0.0};
DoubleGet(srcOpers[0], ds);
ds[0] = 1.0f / ds[0];
ds[1] = 1.0f / ds[1];
ShaderVariable r("", 0U, 0U, 0U, 0U);
DoubleSet(r, ds);
SetDst(state, op.operands[0], op, r);
break;
}
case OPCODE_IBFE:
{
// bottom 5 bits
ShaderVariable width("", (int32_t)(srcOpers[0].value.s32v[0] & 0x1f),
(int32_t)(srcOpers[0].value.s32v[1] & 0x1f),
(int32_t)(srcOpers[0].value.s32v[2] & 0x1f),
(int32_t)(srcOpers[0].value.s32v[3] & 0x1f));
ShaderVariable offset("", (int32_t)(srcOpers[1].value.s32v[0] & 0x1f),
(int32_t)(srcOpers[1].value.s32v[1] & 0x1f),
(int32_t)(srcOpers[1].value.s32v[2] & 0x1f),
(int32_t)(srcOpers[1].value.s32v[3] & 0x1f));
ShaderVariable dest("", (int32_t)0, (int32_t)0, (int32_t)0, (int32_t)0);
for(int comp = 0; comp < 4; comp++)
{
if(width.value.s32v[comp] == 0)
{
dest.value.s32v[comp] = 0;
}
else if(width.value.s32v[comp] + offset.value.s32v[comp] < 32)
{
dest.value.s32v[comp] = srcOpers[2].value.s32v[comp]
<< (32 - (width.value.s32v[comp] + offset.value.s32v[comp]));
dest.value.s32v[comp] = dest.value.s32v[comp] >> (32 - width.value.s32v[comp]);
}
else
{
dest.value.s32v[comp] = srcOpers[2].value.s32v[comp] >> offset.value.s32v[comp];
}
}
SetDst(state, op.operands[0], op, dest);
break;
}
case OPCODE_UBFE:
{
// bottom 5 bits
ShaderVariable width("", (uint32_t)(srcOpers[0].value.u32v[0] & 0x1f),
(uint32_t)(srcOpers[0].value.u32v[1] & 0x1f),
(uint32_t)(srcOpers[0].value.u32v[2] & 0x1f),
(uint32_t)(srcOpers[0].value.u32v[3] & 0x1f));
ShaderVariable offset("", (uint32_t)(srcOpers[1].value.u32v[0] & 0x1f),
(uint32_t)(srcOpers[1].value.u32v[1] & 0x1f),
(uint32_t)(srcOpers[1].value.u32v[2] & 0x1f),
(uint32_t)(srcOpers[1].value.u32v[3] & 0x1f));
ShaderVariable dest("", (uint32_t)0, (uint32_t)0, (uint32_t)0, (uint32_t)0);
for(int comp = 0; comp < 4; comp++)
{
if(width.value.u32v[comp] == 0)
{
dest.value.u32v[comp] = 0;
}
else if(width.value.u32v[comp] + offset.value.u32v[comp] < 32)
{
dest.value.u32v[comp] = srcOpers[2].value.u32v[comp]
<< (32 - (width.value.u32v[comp] + offset.value.u32v[comp]));
dest.value.u32v[comp] = dest.value.u32v[comp] >> (32 - width.value.u32v[comp]);
}
else
{
dest.value.u32v[comp] = srcOpers[2].value.u32v[comp] >> offset.value.u32v[comp];
}
}
SetDst(state, op.operands[0], op, dest);
break;
}
case OPCODE_BFI:
{
// bottom 5 bits
ShaderVariable width("", (uint32_t)(srcOpers[0].value.u32v[0] & 0x1f),
(uint32_t)(srcOpers[0].value.u32v[1] & 0x1f),
(uint32_t)(srcOpers[0].value.u32v[2] & 0x1f),
(uint32_t)(srcOpers[0].value.u32v[3] & 0x1f));
ShaderVariable offset("", (uint32_t)(srcOpers[1].value.u32v[0] & 0x1f),
(uint32_t)(srcOpers[1].value.u32v[1] & 0x1f),
(uint32_t)(srcOpers[1].value.u32v[2] & 0x1f),
(uint32_t)(srcOpers[1].value.u32v[3] & 0x1f));
ShaderVariable dest("", (uint32_t)0, (uint32_t)0, (uint32_t)0, (uint32_t)0);
for(int comp = 0; comp < 4; comp++)
{
uint32_t bitmask =
(((1 << width.value.u32v[comp]) - 1) << offset.value.u32v[comp]) & 0xffffffff;
dest.value.u32v[comp] =
(uint32_t)(((srcOpers[2].value.u32v[comp] << offset.value.u32v[comp]) & bitmask) |
(srcOpers[3].value.u32v[comp] & ~bitmask));
}
SetDst(state, op.operands[0], op, dest);
break;
}
case OPCODE_ISHL:
{
uint32_t shifts[] = {
srcOpers[1].value.u32v[0] & 0x1f,
srcOpers[1].value.u32v[1] & 0x1f,
srcOpers[1].value.u32v[2] & 0x1f,
srcOpers[1].value.u32v[3] & 0x1f,
};
// if we were only given a single component, it's the form that shifts all components
// by the same amount
if(op.operands[2].numComponents == NUMCOMPS_1 ||
(op.operands[2].comps[2] < 4 && op.operands[2].comps[2] == 0xff))
shifts[3] = shifts[2] = shifts[1] = shifts[0];
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), srcOpers[0].value.s32v[0] << shifts[0],
srcOpers[0].value.s32v[1] << shifts[1],
srcOpers[0].value.s32v[2] << shifts[2],
srcOpers[0].value.s32v[3] << shifts[3]));
break;
}
case OPCODE_USHR:
{
uint32_t shifts[] = {
srcOpers[1].value.u32v[0] & 0x1f,
srcOpers[1].value.u32v[1] & 0x1f,
srcOpers[1].value.u32v[2] & 0x1f,
srcOpers[1].value.u32v[3] & 0x1f,
};
// if we were only given a single component, it's the form that shifts all components
// by the same amount
if(op.operands[2].numComponents == NUMCOMPS_1 ||
(op.operands[2].comps[2] < 4 && op.operands[2].comps[2] == 0xff))
shifts[3] = shifts[2] = shifts[1] = shifts[0];
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), srcOpers[0].value.u32v[0] >> shifts[0],
srcOpers[0].value.u32v[1] >> shifts[1],
srcOpers[0].value.u32v[2] >> shifts[2],
srcOpers[0].value.u32v[3] >> shifts[3]));
break;
}
case OPCODE_ISHR:
{
uint32_t shifts[] = {
srcOpers[1].value.u32v[0] & 0x1f,
srcOpers[1].value.u32v[1] & 0x1f,
srcOpers[1].value.u32v[2] & 0x1f,
srcOpers[1].value.u32v[3] & 0x1f,
};
// if we were only given a single component, it's the form that shifts all components
// by the same amount
if(op.operands[2].numComponents == NUMCOMPS_1 ||
(op.operands[2].comps[2] < 4 && op.operands[2].comps[2] == 0xff))
shifts[3] = shifts[2] = shifts[1] = shifts[0];
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), srcOpers[0].value.s32v[0] >> shifts[0],
srcOpers[0].value.s32v[1] >> shifts[1],
srcOpers[0].value.s32v[2] >> shifts[2],
srcOpers[0].value.s32v[3] >> shifts[3]));
break;
}
case OPCODE_AND:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), srcOpers[0].value.s32v[0] & srcOpers[1].value.s32v[0],
srcOpers[0].value.s32v[1] & srcOpers[1].value.s32v[1],
srcOpers[0].value.s32v[2] & srcOpers[1].value.s32v[2],
srcOpers[0].value.s32v[3] & srcOpers[1].value.s32v[3]));
break;
case OPCODE_OR:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), srcOpers[0].value.s32v[0] | srcOpers[1].value.s32v[0],
srcOpers[0].value.s32v[1] | srcOpers[1].value.s32v[1],
srcOpers[0].value.s32v[2] | srcOpers[1].value.s32v[2],
srcOpers[0].value.s32v[3] | srcOpers[1].value.s32v[3]));
break;
case OPCODE_XOR:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), srcOpers[0].value.u32v[0] ^ srcOpers[1].value.u32v[0],
srcOpers[0].value.u32v[1] ^ srcOpers[1].value.u32v[1],
srcOpers[0].value.u32v[2] ^ srcOpers[1].value.u32v[2],
srcOpers[0].value.u32v[3] ^ srcOpers[1].value.u32v[3]));
break;
case OPCODE_NOT:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), ~srcOpers[0].value.u32v[0], ~srcOpers[0].value.u32v[1],
~srcOpers[0].value.u32v[2], ~srcOpers[0].value.u32v[3]));
break;
/////////////////////////////////////////////////////////////////////////////////////////////////////
// transcendental functions with loose ULP requirements, so we pass them to the GPU to get
// more accurate (well, LESS accurate but more representative) answers.
case OPCODE_RCP:
case OPCODE_RSQ:
case OPCODE_EXP:
case OPCODE_LOG:
{
ShaderVariable calcResultA("calcA", 0.0f, 0.0f, 0.0f, 0.0f);
ShaderVariable calcResultB("calcB", 0.0f, 0.0f, 0.0f, 0.0f);
if(apiWrapper->CalculateMathIntrinsic(op.operation, srcOpers[0], calcResultA, calcResultB))
{
SetDst(state, op.operands[0], op, calcResultA);
}
else
{
return;
}
break;
}
case OPCODE_SINCOS:
{
ShaderVariable calcResultA("calcA", 0.0f, 0.0f, 0.0f, 0.0f);
ShaderVariable calcResultB("calcB", 0.0f, 0.0f, 0.0f, 0.0f);
if(apiWrapper->CalculateMathIntrinsic(OPCODE_SINCOS, srcOpers[1], calcResultA, calcResultB))
{
if(op.operands[0].type != TYPE_NULL)
SetDst(state, op.operands[0], op, calcResultA);
if(op.operands[1].type != TYPE_NULL)
SetDst(state, op.operands[1], op, calcResultB);
}
else
{
return;
}
break;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////
// Misc
case OPCODE_NOP:
case OPCODE_CUSTOMDATA:
case OPCODE_OPAQUE_CUSTOMDATA:
case OPCODE_SHADER_MESSAGE:
case OPCODE_DCL_IMMEDIATE_CONSTANT_BUFFER: break;
case OPCODE_SYNC: // might never need to implement this. Who knows!
break;
case OPCODE_DMOV:
case OPCODE_MOV: SetDst(state, op.operands[0], op, srcOpers[0]); break;
case OPCODE_DMOVC:
SetDst(
state, op.operands[0], op,
ShaderVariable(
"", srcOpers[0].value.u32v[0] ? srcOpers[1].value.u32v[0] : srcOpers[2].value.u32v[0],
srcOpers[0].value.u32v[0] ? srcOpers[1].value.u32v[1] : srcOpers[2].value.u32v[1],
srcOpers[0].value.u32v[1] ? srcOpers[1].value.u32v[2] : srcOpers[2].value.u32v[2],
srcOpers[0].value.u32v[1] ? srcOpers[1].value.u32v[3] : srcOpers[2].value.u32v[3]));
break;
case OPCODE_MOVC:
SetDst(
state, op.operands[0], op,
ShaderVariable(
"", srcOpers[0].value.s32v[0] ? srcOpers[1].value.s32v[0] : srcOpers[2].value.s32v[0],
srcOpers[0].value.s32v[1] ? srcOpers[1].value.s32v[1] : srcOpers[2].value.s32v[1],
srcOpers[0].value.s32v[2] ? srcOpers[1].value.s32v[2] : srcOpers[2].value.s32v[2],
srcOpers[0].value.s32v[3] ? srcOpers[1].value.s32v[3] : srcOpers[2].value.s32v[3]));
break;
case OPCODE_SWAPC:
SetDst(
state, op.operands[0], op,
ShaderVariable(
"", srcOpers[1].value.s32v[0] ? srcOpers[3].value.s32v[0] : srcOpers[2].value.s32v[0],
srcOpers[1].value.s32v[1] ? srcOpers[3].value.s32v[1] : srcOpers[2].value.s32v[1],
srcOpers[1].value.s32v[2] ? srcOpers[3].value.s32v[2] : srcOpers[2].value.s32v[2],
srcOpers[1].value.s32v[3] ? srcOpers[3].value.s32v[3] : srcOpers[2].value.s32v[3]));
SetDst(
state, op.operands[1], op,
ShaderVariable(
"", srcOpers[1].value.s32v[0] ? srcOpers[2].value.s32v[0] : srcOpers[3].value.s32v[0],
srcOpers[1].value.s32v[1] ? srcOpers[2].value.s32v[1] : srcOpers[3].value.s32v[1],
srcOpers[1].value.s32v[2] ? srcOpers[2].value.s32v[2] : srcOpers[3].value.s32v[2],
srcOpers[1].value.s32v[3] ? srcOpers[2].value.s32v[3] : srcOpers[3].value.s32v[3]));
break;
case OPCODE_ITOF:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), (float)srcOpers[0].value.s32v[0],
(float)srcOpers[0].value.s32v[1], (float)srcOpers[0].value.s32v[2],
(float)srcOpers[0].value.s32v[3]));
break;
case OPCODE_UTOF:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), (float)srcOpers[0].value.u32v[0],
(float)srcOpers[0].value.u32v[1], (float)srcOpers[0].value.u32v[2],
(float)srcOpers[0].value.u32v[3]));
break;
case OPCODE_FTOI:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), (int)srcOpers[0].value.f32v[0], (int)srcOpers[0].value.f32v[1],
(int)srcOpers[0].value.f32v[2], (int)srcOpers[0].value.f32v[3]));
break;
case OPCODE_FTOU:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(), (uint32_t)srcOpers[0].value.f32v[0],
(uint32_t)srcOpers[0].value.f32v[1], (uint32_t)srcOpers[0].value.f32v[2],
(uint32_t)srcOpers[0].value.f32v[3]));
break;
case OPCODE_ITOD:
case OPCODE_UTOD:
case OPCODE_FTOD:
{
double res[2];
if(op.operation == OPCODE_ITOD)
{
res[0] = (double)srcOpers[0].value.s32v[0];
res[1] = (double)srcOpers[0].value.s32v[1];
}
else if(op.operation == OPCODE_UTOD)
{
res[0] = (double)srcOpers[0].value.u32v[0];
res[1] = (double)srcOpers[0].value.u32v[1];
}
else if(op.operation == OPCODE_FTOD)
{
res[0] = (double)srcOpers[0].value.f32v[0];
res[1] = (double)srcOpers[0].value.f32v[1];
}
// if we only did a 1-wide double op, copy .xy into .zw so we can then
// swizzle into .xy or .zw freely on the destination operand.
// e.g. ftod r0.zw, r0.z - if we didn't do this, there'd be nothing valid in .zw
if(op.operands[1].comps[2] == 0xff)
res[1] = res[0];
ShaderVariable r("", 0U, 0U, 0U, 0U);
DoubleSet(r, res);
SetDst(state, op.operands[0], op, r);
break;
}
case OPCODE_DTOI:
case OPCODE_DTOU:
case OPCODE_DTOF:
{
double src[2];
DoubleGet(srcOpers[0], src);
// special behaviour for dest mask. if it's .xz then first goes into .x, second into .z.
// if the mask is .y then the first goes into .y and second goes nowhere.
// so we need to check the dest mask and put the results into the right place
ShaderVariable r("", 0U, 0U, 0U, 0U);
if(op.operation == OPCODE_DTOU)
{
if(op.operands[0].comps[1] == 0xff) // only one mask
{
r.value.u32v[op.operands[0].comps[0]] = uint32_t(src[0]);
}
else
{
r.value.u32v[op.operands[0].comps[0]] = uint32_t(src[0]);
r.value.u32v[op.operands[0].comps[1]] = uint32_t(src[1]);
}
}
else if(op.operation == OPCODE_DTOI)
{
if(op.operands[0].comps[1] == 0xff) // only one mask
{
r.value.s32v[op.operands[0].comps[0]] = int32_t(src[0]);
}
else
{
r.value.s32v[op.operands[0].comps[0]] = int32_t(src[0]);
r.value.s32v[op.operands[0].comps[1]] = int32_t(src[1]);
}
}
else if(op.operation == OPCODE_DTOF)
{
if(op.operands[0].comps[1] == 0xff) // only one mask
{
r.value.f32v[op.operands[0].comps[0]] = float(src[0]);
}
else
{
r.value.f32v[op.operands[0].comps[0]] = float(src[0]);
r.value.f32v[op.operands[0].comps[1]] = float(src[1]);
}
}
SetDst(state, op.operands[0], op, r);
break;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////
// Comparison
case OPCODE_EQ:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
(srcOpers[0].value.f32v[0] == srcOpers[1].value.f32v[0] ? ~0u : 0u),
(srcOpers[0].value.f32v[1] == srcOpers[1].value.f32v[1] ? ~0u : 0u),
(srcOpers[0].value.f32v[2] == srcOpers[1].value.f32v[2] ? ~0u : 0u),
(srcOpers[0].value.f32v[3] == srcOpers[1].value.f32v[3] ? ~0u : 0u)));
break;
case OPCODE_NE:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
(srcOpers[0].value.f32v[0] != srcOpers[1].value.f32v[0] ? ~0u : 0u),
(srcOpers[0].value.f32v[1] != srcOpers[1].value.f32v[1] ? ~0u : 0u),
(srcOpers[0].value.f32v[2] != srcOpers[1].value.f32v[2] ? ~0u : 0u),
(srcOpers[0].value.f32v[3] != srcOpers[1].value.f32v[3] ? ~0u : 0u)));
break;
case OPCODE_LT:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
(srcOpers[0].value.f32v[0] < srcOpers[1].value.f32v[0] ? ~0u : 0u),
(srcOpers[0].value.f32v[1] < srcOpers[1].value.f32v[1] ? ~0u : 0u),
(srcOpers[0].value.f32v[2] < srcOpers[1].value.f32v[2] ? ~0u : 0u),
(srcOpers[0].value.f32v[3] < srcOpers[1].value.f32v[3] ? ~0u : 0u)));
break;
case OPCODE_GE:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
(srcOpers[0].value.f32v[0] >= srcOpers[1].value.f32v[0] ? ~0u : 0u),
(srcOpers[0].value.f32v[1] >= srcOpers[1].value.f32v[1] ? ~0u : 0u),
(srcOpers[0].value.f32v[2] >= srcOpers[1].value.f32v[2] ? ~0u : 0u),
(srcOpers[0].value.f32v[3] >= srcOpers[1].value.f32v[3] ? ~0u : 0u)));
break;
case OPCODE_DEQ:
case OPCODE_DNE:
case OPCODE_DGE:
case OPCODE_DLT:
{
double src0[2], src1[2];
DoubleGet(srcOpers[0], src0);
DoubleGet(srcOpers[1], src1);
uint32_t cmp1 = 0;
uint32_t cmp2 = 0;
switch(op.operation)
{
case OPCODE_DEQ:
cmp1 = (src0[0] == src1[0] ? ~0l : 0l);
cmp2 = (src0[1] == src1[1] ? ~0l : 0l);
break;
case OPCODE_DNE:
cmp1 = (src0[0] != src1[0] ? ~0l : 0l);
cmp2 = (src0[1] != src1[1] ? ~0l : 0l);
break;
case OPCODE_DGE:
cmp1 = (src0[0] >= src1[0] ? ~0l : 0l);
cmp2 = (src0[1] >= src1[1] ? ~0l : 0l);
break;
case OPCODE_DLT:
cmp1 = (src0[0] < src1[0] ? ~0l : 0l);
cmp2 = (src0[1] < src1[1] ? ~0l : 0l);
break;
default: break;
}
// special behaviour for dest mask. if it's .xz then first comparison goes into .x, second
// into .z.
// if the mask is .y then the first comparison goes into .y and second goes nowhere.
// so we need to check the dest mask and put the comparison results into the right place
ShaderVariable r("", 0U, 0U, 0U, 0U);
if(op.operands[0].comps[1] == 0xff) // only one mask
{
r.value.u32v[op.operands[0].comps[0]] = cmp1;
}
else
{
r.value.u32v[op.operands[0].comps[0]] = cmp1;
r.value.u32v[op.operands[0].comps[1]] = cmp2;
}
SetDst(state, op.operands[0], op, r);
break;
}
case OPCODE_IEQ:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
(srcOpers[0].value.s32v[0] == srcOpers[1].value.s32v[0] ? ~0u : 0u),
(srcOpers[0].value.s32v[1] == srcOpers[1].value.s32v[1] ? ~0u : 0u),
(srcOpers[0].value.s32v[2] == srcOpers[1].value.s32v[2] ? ~0u : 0u),
(srcOpers[0].value.s32v[3] == srcOpers[1].value.s32v[3] ? ~0u : 0u)));
break;
case OPCODE_INE:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
(srcOpers[0].value.s32v[0] != srcOpers[1].value.s32v[0] ? ~0u : 0u),
(srcOpers[0].value.s32v[1] != srcOpers[1].value.s32v[1] ? ~0u : 0u),
(srcOpers[0].value.s32v[2] != srcOpers[1].value.s32v[2] ? ~0u : 0u),
(srcOpers[0].value.s32v[3] != srcOpers[1].value.s32v[3] ? ~0u : 0u)));
break;
case OPCODE_IGE:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
(srcOpers[0].value.s32v[0] >= srcOpers[1].value.s32v[0] ? ~0u : 0u),
(srcOpers[0].value.s32v[1] >= srcOpers[1].value.s32v[1] ? ~0u : 0u),
(srcOpers[0].value.s32v[2] >= srcOpers[1].value.s32v[2] ? ~0u : 0u),
(srcOpers[0].value.s32v[3] >= srcOpers[1].value.s32v[3] ? ~0u : 0u)));
break;
case OPCODE_ILT:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
(srcOpers[0].value.s32v[0] < srcOpers[1].value.s32v[0] ? ~0u : 0u),
(srcOpers[0].value.s32v[1] < srcOpers[1].value.s32v[1] ? ~0u : 0u),
(srcOpers[0].value.s32v[2] < srcOpers[1].value.s32v[2] ? ~0u : 0u),
(srcOpers[0].value.s32v[3] < srcOpers[1].value.s32v[3] ? ~0u : 0u)));
break;
case OPCODE_ULT:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
(srcOpers[0].value.u32v[0] < srcOpers[1].value.u32v[0] ? ~0u : 0u),
(srcOpers[0].value.u32v[1] < srcOpers[1].value.u32v[1] ? ~0u : 0u),
(srcOpers[0].value.u32v[2] < srcOpers[1].value.u32v[2] ? ~0u : 0u),
(srcOpers[0].value.u32v[3] < srcOpers[1].value.u32v[3] ? ~0u : 0u)));
break;
case OPCODE_UGE:
SetDst(state, op.operands[0], op,
ShaderVariable(rdcstr(),
(srcOpers[0].value.u32v[0] >= srcOpers[1].value.u32v[0] ? ~0u : 0u),
(srcOpers[0].value.u32v[1] >= srcOpers[1].value.u32v[1] ? ~0u : 0u),
(srcOpers[0].value.u32v[2] >= srcOpers[1].value.u32v[2] ? ~0u : 0u),
(srcOpers[0].value.u32v[3] >= srcOpers[1].value.u32v[3] ? ~0u : 0u)));
break;
/////////////////////////////////////////////////////////////////////////////////////////////////////
// Atomic instructions
case OPCODE_IMM_ATOMIC_ALLOC:
{
BindingSlot slot = GetBindingSlotForIdentifier(*program, TYPE_UNORDERED_ACCESS_VIEW,
srcOpers[0].value.u32v[0]);
GlobalState::UAVIterator uav = global.uavs.find(slot);
if(uav == global.uavs.end())
{
apiWrapper->FetchUAV(slot);
uav = global.uavs.find(slot);
}
MarkResourceAccess(state, TYPE_UNORDERED_ACCESS_VIEW, slot);
// if it's not a buffer or the buffer is empty this UAV is NULL/invalid, return 0 for the
// counter
uint32_t count = uav->second.data.empty() ? 0 : uav->second.hiddenCounter++;
SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), count, count, count, count));
break;
}
case OPCODE_IMM_ATOMIC_CONSUME:
{
BindingSlot slot = GetBindingSlotForIdentifier(*program, TYPE_UNORDERED_ACCESS_VIEW,
srcOpers[0].value.u32v[0]);
GlobalState::UAVIterator uav = global.uavs.find(slot);
if(uav == global.uavs.end())
{
apiWrapper->FetchUAV(slot);
uav = global.uavs.find(slot);
}
MarkResourceAccess(state, TYPE_UNORDERED_ACCESS_VIEW, slot);
// if it's not a buffer or the buffer is empty this UAV is NULL/invalid, return 0 for the
// counter
uint32_t count = uav->second.data.empty() ? 0 : --uav->second.hiddenCounter;
SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), count, count, count, count));
break;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////
// Derivative instructions
// don't differentiate, coarse, fine, whatever. The spec lets us implement it all as fine.
case OPCODE_DERIV_RTX:
case OPCODE_DERIV_RTX_COARSE:
case OPCODE_DERIV_RTX_FINE:
if(program->GetShaderType() != DXBC::ShaderType::Pixel || prevWorkgroup.size() != 4)
RDCERR(
"Attempt to use derivative instruction not in pixel shader. Undefined results will "
"occur!");
else
SetDst(state, op.operands[0], op,
DDX(op.operation == OPCODE_DERIV_RTX_FINE, prevWorkgroup, op.operands[1], op));
break;
case OPCODE_DERIV_RTY:
case OPCODE_DERIV_RTY_COARSE:
case OPCODE_DERIV_RTY_FINE:
if(program->GetShaderType() != DXBC::ShaderType::Pixel || prevWorkgroup.size() != 4)
RDCERR(
"Attempt to use derivative instruction not in pixel shader. Undefined results will "
"occur!");
else
SetDst(state, op.operands[0], op,
DDY(op.operation == OPCODE_DERIV_RTY_FINE, prevWorkgroup, op.operands[1], op));
break;
/////////////////////////////////////////////////////////////////////////////////////////////////////
// Buffer/Texture load and store
// handle atomic operations all together
case OPCODE_ATOMIC_IADD:
case OPCODE_ATOMIC_IMAX:
case OPCODE_ATOMIC_IMIN:
case OPCODE_ATOMIC_AND:
case OPCODE_ATOMIC_OR:
case OPCODE_ATOMIC_XOR:
case OPCODE_ATOMIC_CMP_STORE:
case OPCODE_ATOMIC_UMAX:
case OPCODE_ATOMIC_UMIN:
case OPCODE_IMM_ATOMIC_IADD:
case OPCODE_IMM_ATOMIC_IMAX:
case OPCODE_IMM_ATOMIC_IMIN:
case OPCODE_IMM_ATOMIC_AND:
case OPCODE_IMM_ATOMIC_OR:
case OPCODE_IMM_ATOMIC_XOR:
case OPCODE_IMM_ATOMIC_EXCH:
case OPCODE_IMM_ATOMIC_CMP_EXCH:
case OPCODE_IMM_ATOMIC_UMAX:
case OPCODE_IMM_ATOMIC_UMIN:
{
  // Every atomic read-modify-write opcode shares this implementation. The target is either a
  // UAV or group-shared memory. The imm_* variants additionally write the value that was in
  // memory before the operation to their first operand.
  //
  // srcOpers holds the evaluated operands[1..], so srcOpers[i] corresponds to operands[i + 1].
  Operand beforeResult;
  uint32_t resIndex = 0;
  ShaderVariable *dstAddress = NULL;
  ShaderVariable *src0 = NULL;
  ShaderVariable *src1 = NULL;
  bool gsm = false;

  if(op.operation == OPCODE_IMM_ATOMIC_IADD || op.operation == OPCODE_IMM_ATOMIC_IMAX ||
     op.operation == OPCODE_IMM_ATOMIC_IMIN || op.operation == OPCODE_IMM_ATOMIC_AND ||
     op.operation == OPCODE_IMM_ATOMIC_OR || op.operation == OPCODE_IMM_ATOMIC_XOR ||
     op.operation == OPCODE_IMM_ATOMIC_EXCH || op.operation == OPCODE_IMM_ATOMIC_CMP_EXCH ||
     op.operation == OPCODE_IMM_ATOMIC_UMAX || op.operation == OPCODE_IMM_ATOMIC_UMIN)
  {
    // imm_atomic_*: operands are (dst0, resource, address, src0[, src1])
    beforeResult = op.operands[0];
    resIndex = (uint32_t)op.operands[1].indices[0].index;
    gsm = (op.operands[1].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
    dstAddress = &srcOpers[1];
    src0 = &srcOpers[2];
    if(srcOpers.size() > 3)
      src1 = &srcOpers[3];
  }
  else
  {
    // atomic_*: operands are (resource, address, src0[, src1]), nothing is returned
    beforeResult.type = TYPE_NULL;
    resIndex = (uint32_t)op.operands[0].indices[0].index;
    gsm = (op.operands[0].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
    dstAddress = &srcOpers[0];
    src0 = &srcOpers[1];
    if(srcOpers.size() > 2)
      src1 = &srcOpers[2];
  }

  uint32_t stride = 4;
  uint32_t offset = 0;
  uint32_t numElems = 0;
  bool structured = false;
  byte *data = NULL;

  if(gsm)
  {
    offset = 0;
    // valid indices are [0, size), so an index equal to size() is already out of range
    if(resIndex >= global.groupshared.size())
    {
      numElems = 0;
      stride = 4;
      data = NULL;
    }
    else
    {
      numElems = global.groupshared[resIndex].count;
      stride = global.groupshared[resIndex].bytestride;
      data = &global.groupshared[resIndex].data[0];
      structured = global.groupshared[resIndex].structured;
    }
  }
  else
  {
    BindingSlot slot =
        GetBindingSlotForIdentifier(*program, TYPE_UNORDERED_ACCESS_VIEW, resIndex);
    GlobalState::UAVIterator uav = global.uavs.find(slot);
    if(uav == global.uavs.end())
    {
      apiWrapper->FetchUAV(slot);
      uav = global.uavs.find(slot);
    }

    MarkResourceAccess(state, TYPE_UNORDERED_ACCESS_VIEW, slot);

    offset = uav->second.firstElement;
    numElems = uav->second.numElements;
    data = &uav->second.data[0];

    // the element stride comes from the UAV's declaration: 4 bytes for raw buffers, the
    // declared struct stride for structured buffers. Anything else keeps the default of 4.
    const DXBCBytecode::Declaration *pDecl =
        program->FindDeclaration(TYPE_UNORDERED_ACCESS_VIEW, resIndex);
    if(pDecl)
    {
      if(pDecl->declaration == OPCODE_DCL_UNORDERED_ACCESS_VIEW_RAW)
      {
        stride = 4;
        structured = false;
      }
      else if(pDecl->declaration == OPCODE_DCL_UNORDERED_ACCESS_VIEW_STRUCTURED)
      {
        stride = pDecl->structured.stride;
        structured = true;
      }
    }
  }

  RDCASSERT(data);

  // seems like .x is element index, and .y is byte address, in the dstAddress operand
  //
  // "Out of bounds addressing on u# causes nothing to be written to memory, except if the
  // u# is structured, and byte offset into the struct (second component of the address) is
  // causing the out of bounds access, then the entire contents of the UAV become undefined."
  //
  // "The number of components taken from the address is determined by the dimensionality of dst
  // u# or g#."
  if(data)
  {
    data += (offset + dstAddress->value.u32v[0]) * stride;
    if(structured)
      data += dstAddress->value.u32v[1];
  }

  // if out of bounds, undefined result is returned to dst0 for immediate operands,
  // so we only need to care about the in-bounds case.
  // Also helper/inactive pixels are not allowed to modify UAVs
  if(data && offset + dstAddress->value.u32v[0] < numElems && !Finished())
  {
    uint32_t *udst = (uint32_t *)data;
    int32_t *idst = (int32_t *)data;

    // imm_* variants return the value from before the modification, broadcast to all components
    if(beforeResult.type != TYPE_NULL)
    {
      SetDst(state, beforeResult, op, ShaderVariable(rdcstr(), *udst, *udst, *udst, *udst));
    }

    uint32_t *usrc0 = src0->value.u32v.data();
    // src1 only exists for the compare-and-swap opcodes, so it must not be dereferenced
    // unconditionally - it is NULL for every two-source atomic.
    uint32_t *usrc1 = src1 ? src1->value.u32v.data() : NULL;

    int32_t *isrc0 = src0->value.s32v.data();

    switch(op.operation)
    {
      case OPCODE_IMM_ATOMIC_IADD:
      case OPCODE_ATOMIC_IADD: *udst = *udst + *usrc0; break;
      case OPCODE_IMM_ATOMIC_IMAX:
      case OPCODE_ATOMIC_IMAX: *idst = RDCMAX(*idst, *isrc0); break;
      case OPCODE_IMM_ATOMIC_IMIN:
      case OPCODE_ATOMIC_IMIN: *idst = RDCMIN(*idst, *isrc0); break;
      case OPCODE_IMM_ATOMIC_AND:
      case OPCODE_ATOMIC_AND: *udst = *udst & *usrc0; break;
      case OPCODE_IMM_ATOMIC_OR:
      case OPCODE_ATOMIC_OR: *udst = *udst | *usrc0; break;
      case OPCODE_IMM_ATOMIC_XOR:
      case OPCODE_ATOMIC_XOR: *udst = *udst ^ *usrc0; break;
      case OPCODE_IMM_ATOMIC_EXCH: *udst = *usrc0; break;
      case OPCODE_IMM_ATOMIC_CMP_EXCH:
      case OPCODE_ATOMIC_CMP_STORE:
        // per the D3D assembly reference src0 is the value to compare against memory and src1
        // is the value written when the comparison succeeds.
        if(usrc1 && *udst == *usrc0)
          *udst = *usrc1;
        break;
      case OPCODE_IMM_ATOMIC_UMAX:
      case OPCODE_ATOMIC_UMAX: *udst = RDCMAX(*udst, *usrc0); break;
      case OPCODE_IMM_ATOMIC_UMIN:
      case OPCODE_ATOMIC_UMIN: *udst = RDCMIN(*udst, *usrc0); break;
      default: break;
    }
  }

  break;
}
// store and load paths are mostly identical
case OPCODE_STORE_UAV_TYPED:
case OPCODE_STORE_RAW:
case OPCODE_STORE_STRUCTURED:
case OPCODE_LD_RAW:
case OPCODE_LD_UAV_TYPED:
case OPCODE_LD_STRUCTURED:
{
  // Shared implementation of the raw, structured and typed-UAV loads and stores. The backing
  // memory is an SRV, a UAV or group-shared memory.
  //
  // For loads the resource is the last operand and operands[0] is the destination register;
  // for stores the resource is operands[0]. srcOpers holds the evaluated operands[1..], so
  // srcOpers[0] is always the address operand.
  uint32_t resIndex = 0;
  uint32_t structOffset = 0;
  uint32_t elemIdx = 0;
  uint32_t texCoords[3] = {0, 0, 0};
  uint32_t stride = 0;
  bool srv = true;
  bool gsm = false;
  bool load = true;
  // swizzle of the resource operand, applied to loaded values. Identity unless overwritten.
  uint8_t resComps[4] = {0, 1, 2, 3};

  if(op.operation == OPCODE_STORE_UAV_TYPED || op.operation == OPCODE_STORE_RAW ||
     op.operation == OPCODE_STORE_STRUCTURED)
  {
    load = false;
  }

  if(load && state)
    state->flags |= ShaderEvents::SampleLoadGather;

  if(op.operation == OPCODE_LD_STRUCTURED || op.operation == OPCODE_STORE_STRUCTURED)
  {
    if(load)
    {
      resIndex = (uint32_t)op.operands[3].indices[0].index;
      srv = (op.operands[3].type == TYPE_RESOURCE);
      gsm = (op.operands[3].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
      memcpy(resComps, op.operands[3].comps, sizeof(resComps));
      stride = op.stride;
    }
    else
    {
      resIndex = (uint32_t)op.operands[0].indices[0].index;
      srv = false;
      gsm = (op.operands[0].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
    }

    // if the instruction itself did not provide a stride, take it from the group-shared
    // allocation or from the structured resource declaration
    if(stride == 0)
    {
      if(gsm && resIndex < global.groupshared.size())
      {
        stride = global.groupshared[resIndex].bytestride;
      }
      else if(!gsm)
      {
        OperandType declType = srv ? TYPE_RESOURCE : TYPE_UNORDERED_ACCESS_VIEW;
        OpcodeType declOpcode =
            srv ? OPCODE_DCL_RESOURCE_STRUCTURED : OPCODE_DCL_UNORDERED_ACCESS_VIEW_STRUCTURED;

        const DXBCBytecode::Declaration *pDecl = program->FindDeclaration(declType, resIndex);
        if(pDecl && pDecl->declaration == declOpcode)
          stride = pDecl->structured.stride;
      }
    }

    // structured addressing: element index followed by byte offset within the element
    structOffset = srcOpers[1].value.u32v[0];
    elemIdx = srcOpers[0].value.u32v[0];
  }
  else if(op.operation == OPCODE_LD_UAV_TYPED || op.operation == OPCODE_STORE_UAV_TYPED)
  {
    if(load)
    {
      resIndex = (uint32_t)op.operands[2].indices[0].index;
      gsm = (op.operands[2].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
      memcpy(resComps, op.operands[2].comps, sizeof(resComps));
    }
    else
    {
      resIndex = (uint32_t)op.operands[0].indices[0].index;
      gsm = (op.operands[0].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
    }

    elemIdx = srcOpers[0].value.u32v[0];

    // could be a tex load
    texCoords[0] = srcOpers[0].value.u32v[0];
    texCoords[1] = srcOpers[0].value.u32v[1];
    texCoords[2] = srcOpers[0].value.u32v[2];

    // provisional value, replaced by the view format's stride once the UAV has been fetched
    stride = 4;
    srv = false;
  }
  else if(op.operation == OPCODE_LD_RAW || op.operation == OPCODE_STORE_RAW)
  {
    if(load)
    {
      resIndex = (uint32_t)op.operands[2].indices[0].index;
      srv = (op.operands[2].type == TYPE_RESOURCE);
      gsm = (op.operands[2].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
      memcpy(resComps, op.operands[2].comps, sizeof(resComps));
    }
    else
    {
      resIndex = (uint32_t)op.operands[0].indices[0].index;
      srv = false;
      gsm = (op.operands[0].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
    }

    // the index is supposed to be a multiple of 4 but the behaviour seems to be to round down
    elemIdx = (srcOpers[0].value.u32v[0] & ~0x3);
    stride = 1;
  }

  RDCASSERT(stride != 0);

  byte *data = NULL;
  size_t dataSize = 0;
  bool texData = false;
  uint32_t rowPitch = 0;
  uint32_t depthPitch = 0;
  uint32_t firstElem = 0;
  uint32_t numElems = 0;
  GlobalState::ViewFmt fmt;

  if(gsm)
  {
    firstElem = 0;
    // valid indices are [0, size), so an index equal to size() is already out of range
    if(resIndex >= global.groupshared.size())
    {
      numElems = 0;
      stride = 4;
      data = NULL;
    }
    else
    {
      numElems = global.groupshared[resIndex].count;
      stride = global.groupshared[resIndex].bytestride;
      data = global.groupshared[resIndex].data.data();
      dataSize = global.groupshared[resIndex].data.size();
      // group-shared memory is treated as untyped 32-bit uints
      fmt.fmt = CompType::UInt;
      fmt.byteWidth = 4;
      fmt.numComps = global.groupshared[resIndex].bytestride / 4;
      fmt.stride = 0;
    }
    texData = false;
  }
  else
  {
    BindingSlot slot = GetBindingSlotForIdentifier(
        *program, srv ? TYPE_RESOURCE : TYPE_UNORDERED_ACCESS_VIEW, resIndex);

    if(srv)
    {
      GlobalState::SRVIterator srvIter = global.srvs.find(slot);
      if(srvIter == global.srvs.end())
      {
        apiWrapper->FetchSRV(slot);
        srvIter = global.srvs.find(slot);
      }

      MarkResourceAccess(state, TYPE_RESOURCE, slot);

      data = srvIter->second.data.data();
      dataSize = srvIter->second.data.size();
      firstElem = srvIter->second.firstElement;
      numElems = srvIter->second.numElements;
      fmt = srvIter->second.format;
    }
    else
    {
      GlobalState::UAVIterator uavIter = global.uavs.find(slot);
      if(uavIter == global.uavs.end())
      {
        apiWrapper->FetchUAV(slot);
        uavIter = global.uavs.find(slot);
      }

      MarkResourceAccess(state, TYPE_UNORDERED_ACCESS_VIEW, slot);

      data = uavIter->second.data.data();
      dataSize = uavIter->second.data.size();
      texData = uavIter->second.tex;
      rowPitch = uavIter->second.rowPitch;
      depthPitch = uavIter->second.depthPitch;
      firstElem = uavIter->second.firstElement;
      numElems = uavIter->second.numElements;
      fmt = uavIter->second.format;
    }

    if(op.operation == OPCODE_LD_UAV_TYPED || op.operation == OPCODE_STORE_UAV_TYPED)
      stride = fmt.Stride();
  }

  // indexing for raw views is in bytes, but firstElement/numElements is in format-sized
  // units. Multiply up by stride
  if(op.operation == OPCODE_LD_RAW || op.operation == OPCODE_STORE_RAW)
  {
    firstElem *= RDCMIN(4, fmt.byteWidth);
    numElems *= RDCMIN(4, fmt.byteWidth);
  }

  RDCASSERT(data);

  // byte offset of the addressed element from the start of the data
  size_t dataOffset = 0;

  if(texData)
  {
    dataOffset += texCoords[0] * fmt.Stride();
    dataOffset += texCoords[1] * rowPitch;
    dataOffset += texCoords[2] * depthPitch;
  }
  else
  {
    dataOffset += (firstElem + elemIdx) * stride;
    dataOffset += structOffset;
  }

  // missing data or an out of bounds access: loads return 0, stores are dropped
  if(!data || (!texData && elemIdx >= numElems) || (texData && dataOffset >= dataSize))
  {
    if(load)
      SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), 0U, 0U, 0U, 0U));
  }
  else
  {
    data += dataOffset;

    int maxIndex = fmt.numComps;

    // index into srcOpers of the value to store (unused for loads)
    uint32_t srcIdx = 1;
    if(op.operation == OPCODE_STORE_STRUCTURED || op.operation == OPCODE_LD_STRUCTURED)
    {
      srcIdx = 2;
      // only the dwords left in the struct after structOffset are accessible
      maxIndex = (stride - structOffset) / sizeof(uint32_t);
      fmt.byteWidth = 4;
      fmt.numComps = 4;
      // a single-component mask on operands[0] only touches one dword
      if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
         op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
        fmt.numComps = 1;
      fmt.fmt = CompType::UInt;
    }
    // raw loads/stores can come from any component (as long as it's within range of the data!)
    if(op.operation == OPCODE_LD_RAW || op.operation == OPCODE_STORE_RAW)
    {
      fmt.byteWidth = 4;

      // normally we can read 4 elements
      fmt.numComps = 4;
      // clamp to out of bounds based on numElems
      fmt.numComps = RDCMIN(fmt.numComps, int(numElems - elemIdx) / 4);
      maxIndex = fmt.numComps;

      if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
         op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
        fmt.numComps = 1;
      fmt.fmt = CompType::UInt;
    }

    if(load)
    {
      ShaderVariable result = TypedUAVLoad(fmt, data);

      // apply the swizzle on the resource operand
      ShaderVariable fetch("", 0U, 0U, 0U, 0U);

      for(int c = 0; c < 4; c++)
      {
        uint8_t comp = resComps[c];
        if(comp == 0xff)
          comp = 0;

        fetch.value.u32v[c] = result.value.u32v[comp];
      }

      if(op.operation != OPCODE_LD_RAW && op.operation != OPCODE_LD_STRUCTURED)
      {
        // if we are assigning into a scalar, SetDst expects the result to be in .x (as normally
        // we are assigning FROM a scalar also).
        // to match this expectation, propagate the component across.
        if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
           op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
          fetch.value.u32v[0] = fetch.value.u32v[op.operands[0].comps[0]];
      }

      SetDst(state, op.operands[0], op, fetch);
    }
    else if(!Finished())    // helper/inactive pixels can't modify UAVs
    {
      for(int i = 0; i < 4; i++)
      {
        uint8_t comp = op.operands[0].comps[i];
        // masks must be contiguous from x, if we reach the 'end' we're done
        if(comp == 0xff || comp >= maxIndex)
          break;

        TypedUAVStore(fmt, data, srcOpers[srcIdx]);
      }
    }
  }

  break;
}
case OPCODE_EVAL_CENTROID:
case OPCODE_EVAL_SAMPLE_INDEX:
case OPCODE_EVAL_SNAPPED:
{
// Evaluates an input at the centroid, at a given sample index, or at a snapped offset.
// The value is looked up in global.sampleEvalCache using a key built from the instruction;
// if no entry is found the input operand's own value (srcOpers[0]) is written instead.
// opcodes only seem to be supported for regular inputs
RDCASSERT(op.operands[1].type == TYPE_INPUT);
GlobalState::SampleEvalCacheKey key;
RDCASSERT(program->GetShaderType() == DXBC::ShaderType::Pixel);
key.quadIndex = workgroupIndex;
// if this is TYPE_INPUT we can look up the index directly
key.inputRegisterIndex = (int32_t)op.operands[1].indices[0].index;
// count the destination components up to the first unused (0xff) entry
for(int c = 0; c < 4; c++)
{
if(op.operands[0].comps[c] == 0xff)
break;
key.numComponents = c + 1;
}
// the input component selected by the source swizzle for the first destination component
key.firstComponent = op.operands[1].comps[op.operands[0].comps[0]];
if(op.operation == OPCODE_EVAL_SAMPLE_INDEX)
{
// srcOpers[1] is the operand after the input: the sample index
key.sample = srcOpers[1].value.s32v[0];
}
else if(op.operation == OPCODE_EVAL_SNAPPED)
{
// srcOpers[1] carries the x/y offset, each clamped to the range [-8, 7]
key.offsetx = RDCCLAMP(srcOpers[1].value.s32v[0], -8, 7);
key.offsety = RDCCLAMP(srcOpers[1].value.s32v[1], -8, 7);
}
else if(op.operation == OPCODE_EVAL_CENTROID)
{
// OPCODE_EVAL_CENTROID is the default, -1 sample and 0,0 offset
}
// look up this combination in the cache, if we get a hit then return that value.
auto it = global.sampleEvalCache.find(key);
if(it != global.sampleEvalCache.end())
{
// perform source operand swizzling
ShaderVariable var = it->second;
// components whose swizzle entry is not in 0..3 keep the cached value unswizzled
for(int i = 0; i < 4; i++)
if(op.operands[1].comps[i] < 4)
var.value.u32v[i] = it->second.value.u32v[op.operands[1].comps[i]];
SetDst(state, op.operands[0], op, var);
}
else
{
// if we got here, either the cache is empty (we're not rendering MSAA at all) so we should
// just return the interpolant, or something went wrong and the item we want isn't cached so
// the best we can do is return the interpolant.
if(!global.sampleEvalCache.empty())
{
// nextInstruction has already been advanced by StepNext, so the instruction being
// executed is normally nextInstruction - 1
apiWrapper->AddDebugMessage(
MessageCategory::Shaders, MessageSeverity::Medium, MessageSource::RuntimeWarning,
StringFormat::Fmt(
"Shader debugging %d: %s\n"
"No sample evaluate found in cache. Possible out-of-bounds sample index",
nextInstruction - 1, op.str.c_str()));
}
SetDst(state, op.operands[0], op, srcOpers[0]);
}
break;
}
case OPCODE_SAMPLE_INFO:
case OPCODE_SAMPLE_POS:
{
// sample_info returns the sample information obtained from the API wrapper for a resource or
// the rasterizer; sample_pos additionally converts a sample index into a sample position
// using the hard-coded pattern tables below.
// the expected number of indices on the resource operand is 2 for SM5.1 and 1 otherwise
size_t numIndices = program->IsShaderModel51() ? 2 : 1;
bool isAbsoluteResource =
(op.operands[1].indices.size() == numIndices && op.operands[1].indices[0].absolute &&
!op.operands[1].indices[0].relative);
// slot stays default-constructed when querying the rasterizer
BindingSlot slot;
if(op.operands[1].type != TYPE_RASTERIZER)
{
UINT identifier = (UINT)(op.operands[1].indices[0].index & 0xffffffff);
slot = GetBindingSlotForIdentifier(*program, op.operands[1].type, identifier);
MarkResourceAccess(state, op.operands[1].type, slot);
}
ShaderVariable result =
apiWrapper->GetSampleInfo(op.operands[1].type, isAbsoluteResource, slot, op.str.c_str());
// "If there is no resource bound to the specified slot, 0 is returned."
// lookup sample pos if we got a count from above
if(op.operation == OPCODE_SAMPLE_POS && result.value.u32v[0] > 0 &&
(op.operands[2].type == TYPE_IMMEDIATE32 || op.operands[2].type == TYPE_TEMP))
{
// assume standard sample pattern - this might not hold in all cases
// http://msdn.microsoft.com/en-us/library/windows/desktop/ff476218(v=vs.85).aspx
// srcOpers[1] is operands[2], the sample index; .x of the sample info is the sample count
uint32_t sampleIndex = srcOpers[1].value.u32v[0];
uint32_t sampleCount = result.value.u32v[0];
if(sampleIndex >= sampleCount)
{
// Per HLSL docs, if sampleIndex is out of bounds a zero vector is returned
RDCWARN("sample index %u is out of bounds on resource bound to sample_pos (%u samples)",
sampleIndex, sampleCount);
result.value.f32v[0] = 0.0f;
result.value.f32v[1] = 0.0f;
result.value.f32v[2] = 0.0f;
result.value.f32v[3] = 0.0f;
}
else
{
// each table below stores interleaved x,y pairs, one pair per sample
const float *sample_pattern = NULL;
// co-ordinates are given as (i,j) in 16ths of a pixel
#define _SMP(c) ((c) / 16.0f)
if(sampleCount == 1)
{
RDCWARN("Non-multisampled texture being passed to sample_pos");
apiWrapper->AddDebugMessage(
MessageCategory::Shaders, MessageSeverity::Medium, MessageSource::RuntimeWarning,
StringFormat::Fmt(
"Shader debugging %d: %s\nNon-multisampled texture being passed to sample_pos",
nextInstruction - 1, op.str.c_str()));
sample_pattern = NULL;
}
else if(sampleCount == 2)
{
static const float pattern_2x[] = {
_SMP(4.0f),
_SMP(4.0f),
_SMP(-4.0f),
_SMP(-4.0f),
};
sample_pattern = &pattern_2x[0];
}
else if(sampleCount == 4)
{
static const float pattern_4x[] = {
_SMP(-2.0f), _SMP(-6.0f), _SMP(6.0f), _SMP(-2.0f),
_SMP(-6.0f), _SMP(2.0f), _SMP(2.0f), _SMP(6.0f),
};
sample_pattern = &pattern_4x[0];
}
else if(sampleCount == 8)
{
static const float pattern_8x[] = {
_SMP(1.0f), _SMP(-3.0f), _SMP(-1.0f), _SMP(3.0f), _SMP(5.0f), _SMP(1.0f),
_SMP(-3.0f), _SMP(-5.0f), _SMP(-5.0f), _SMP(5.0f), _SMP(-7.0f), _SMP(-1.0f),
_SMP(3.0f), _SMP(7.0f), _SMP(7.0f), _SMP(-7.0f),
};
sample_pattern = &pattern_8x[0];
}
else if(sampleCount == 16)
{
static const float pattern_16x[] = {
_SMP(1.0f), _SMP(1.0f), _SMP(-1.0f), _SMP(-3.0f), _SMP(-3.0f), _SMP(2.0f),
_SMP(4.0f), _SMP(-1.0f), _SMP(-5.0f), _SMP(-2.0f), _SMP(2.0f), _SMP(5.0f),
_SMP(5.0f), _SMP(3.0f), _SMP(3.0f), _SMP(-5.0f), _SMP(-2.0f), _SMP(6.0f),
_SMP(0.0f), _SMP(-7.0f), _SMP(-4.0f), _SMP(-6.0f), _SMP(-6.0f), _SMP(4.0f),
_SMP(-8.0f), _SMP(0.0f), _SMP(7.0f), _SMP(-4.0f), _SMP(6.0f), _SMP(7.0f),
_SMP(-7.0f), _SMP(-8.0f),
};
sample_pattern = &pattern_16x[0];
}
else // unsupported sample count
{
RDCERR("Unsupported sample count on resource for sample_pos: %u", result.value.u32v[0]);
sample_pattern = NULL;
}
// with no usable pattern only x and y are zeroed; z and w keep whatever GetSampleInfo
// returned
if(sample_pattern == NULL)
{
result.value.f32v[0] = 0.0f;
result.value.f32v[1] = 0.0f;
}
else
{
result.value.f32v[0] = sample_pattern[sampleIndex * 2 + 0];
result.value.f32v[1] = sample_pattern[sampleIndex * 2 + 1];
}
}
#undef _SMP
}
// apply swizzle
// an unused (0xff) swizzle entry selects .x
ShaderVariable swizzled("", 0.0f, 0.0f, 0.0f, 0.0f);
for(int i = 0; i < 4; i++)
{
if(op.operands[1].comps[i] == 0xff)
swizzled.value.u32v[i] = result.value.u32v[0];
else
swizzled.value.u32v[i] = result.value.u32v[op.operands[1].comps[i]];
}
// apply ret type
// sample_pos is always float; sample_info converts the uint values to float only when the
// instruction asks for a float return type
if(op.operation == OPCODE_SAMPLE_POS)
{
result = swizzled;
result.type = VarType::Float;
}
else if(op.infoRetType == RETTYPE_FLOAT)
{
result.value.f32v[0] = (float)swizzled.value.u32v[0];
result.value.f32v[1] = (float)swizzled.value.u32v[1];
result.value.f32v[2] = (float)swizzled.value.u32v[2];
result.value.f32v[3] = (float)swizzled.value.u32v[3];
result.type = VarType::Float;
}
else
{
result = swizzled;
result.type = VarType::UInt;
}
// if we are assigning into a scalar, SetDst expects the result to be in .x (as normally we
// are assigning FROM a scalar also).
// to match this expectation, propagate the component across.
if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
result.value.u32v[0] = result.value.u32v[op.operands[0].comps[0]];
SetDst(state, op.operands[0], op, result);
break;
}
case OPCODE_BUFINFO:
{
  const Operand &dstOperand = op.operands[0];
  const Operand &bufOperand = op.operands[1];

  // the buffer operand is expected to carry 2 indices for SM5.1 and 1 otherwise, with the
  // first one being absolute and not relative
  const size_t expectedIndices = program->IsShaderModel51() ? 2 : 1;
  const bool directlyAddressed = bufOperand.indices.size() == expectedIndices &&
                                 bufOperand.indices[0].absolute &&
                                 !bufOperand.indices[0].relative;

  if(!directlyAddressed)
  {
    RDCERR("Unexpected relative addressing");
    SetDst(state, dstOperand, op, ShaderVariable(rdcstr(), 0.0f, 0.0f, 0.0f, 0.0f));
    break;
  }

  UINT identifier = (UINT)(bufOperand.indices[0].index & 0xffffffff);
  BindingSlot bufSlot = GetBindingSlotForIdentifier(*program, bufOperand.type, identifier);

  ShaderVariable info = apiWrapper->GetBufferInfo(bufOperand.type, bufSlot, op.str.c_str());
  MarkResourceAccess(state, bufOperand.type, bufSlot);

  // swizzle the queried values by the buffer operand; an unused (0xff) entry selects .x
  ShaderVariable shuffled("", 0.0f, 0.0f, 0.0f, 0.0f);
  for(int c = 0; c < 4; c++)
  {
    const uint8_t selector = bufOperand.comps[c];
    shuffled.value.u32v[c] = info.value.u32v[selector == 0xff ? 0 : selector];
  }

  info = shuffled;
  info.type = VarType::UInt;

  // SetDst expects the value for a single-component destination to live in .x, so when only
  // the first mask entry is used copy the selected component down into .x
  const bool scalarDst = dstOperand.comps[0] != 0xff && dstOperand.comps[1] == 0xff &&
                         dstOperand.comps[2] == 0xff && dstOperand.comps[3] == 0xff;
  if(scalarDst)
    info.value.u32v[0] = info.value.u32v[dstOperand.comps[0]];

  SetDst(state, dstOperand, op, info);
  break;
}
case OPCODE_RESINFO:
{
  // Queries resource dimensions through the API wrapper, swizzles them by the resource operand
  // and converts them to the return type requested on the instruction (uint, float, or
  // reciprocal float).
  //
  // spec says "srcMipLevel is read as an unsigned integer scalar"
  uint32_t mipLevel = srcOpers[0].value.u32v[0];

  // the expected number of indices on the resource operand is 2 for SM5.1 and 1 otherwise
  size_t numIndices = program->IsShaderModel51() ? 2 : 1;

  if(op.operands[2].indices.size() == numIndices && op.operands[2].indices[0].absolute &&
     !op.operands[2].indices[0].relative)
  {
    // dimensionality of the resource (1, 2 or 3), written by GetResourceInfo. Left at 0 it is
    // filled in from the declaration below.
    int dim = 0;
    UINT identifier = (UINT)(op.operands[2].indices[0].index & 0xffffffff);
    BindingSlot slot = GetBindingSlotForIdentifier(*program, op.operands[2].type, identifier);
    ShaderVariable result = apiWrapper->GetResourceInfo(op.operands[2].type, slot, mipLevel, dim);
    MarkResourceAccess(state, op.operands[2].type, slot);

    // need a valid dimension even if the resource was unbound, so
    // search for the declaration
    if(dim == 0)
    {
      const Declaration *pDecl =
          program->FindDeclaration(TYPE_RESOURCE, (uint32_t)op.operands[2].indices[0].index);
      if(pDecl && pDecl->declaration == OPCODE_DCL_RESOURCE)
      {
        switch(pDecl->resource.dim)
        {
          default:
          case RESOURCE_DIMENSION_UNKNOWN:
          case NUM_DIMENSIONS:
          case RESOURCE_DIMENSION_BUFFER:
          case RESOURCE_DIMENSION_RAW_BUFFER:
          case RESOURCE_DIMENSION_STRUCTURED_BUFFER:
          case RESOURCE_DIMENSION_TEXTURE1D:
          case RESOURCE_DIMENSION_TEXTURE1DARRAY: dim = 1; break;
          case RESOURCE_DIMENSION_TEXTURE2D:
          case RESOURCE_DIMENSION_TEXTURE2DMS:
          case RESOURCE_DIMENSION_TEXTURE2DARRAY:
          case RESOURCE_DIMENSION_TEXTURE2DMSARRAY:
          case RESOURCE_DIMENSION_TEXTURECUBE:
          case RESOURCE_DIMENSION_TEXTURECUBEARRAY: dim = 2; break;
          case RESOURCE_DIMENSION_TEXTURE3D: dim = 3; break;
        }
      }
    }

    // apply swizzle
    // an unused (0xff) swizzle entry selects .x
    ShaderVariable swizzled("", 0.0f, 0.0f, 0.0f, 0.0f);
    for(int i = 0; i < 4; i++)
    {
      if(op.operands[2].comps[i] == 0xff)
        swizzled.value.u32v[i] = result.value.u32v[0];
      else
        swizzled.value.u32v[i] = result.value.u32v[op.operands[2].comps[i]];
    }

    // apply ret type
    if(op.infoRetType == RETTYPE_FLOAT)
    {
      result.value.f32v[0] = (float)swizzled.value.u32v[0];
      result.value.f32v[1] = (float)swizzled.value.u32v[1];
      result.value.f32v[2] = (float)swizzled.value.u32v[2];
      result.value.f32v[3] = (float)swizzled.value.u32v[3];
      result.type = VarType::Float;
    }
    else if(op.infoRetType == RETTYPE_RCPFLOAT)
    {
      // only width/height/depth values we set are reciprocated, other values
      // are just left as is.
      //
      // Width/height/depth live in the first 'dim' components of the unswizzled query result,
      // so an output component is reciprocated exactly when the component it was swizzled
      // from has an index below dim. For the identity swizzle this reciprocates x for
      // dim >= 1, y for dim >= 2 and z for dim == 3, and never w.
      for(int i = 0; i < 4; i++)
      {
        uint8_t srcComp = op.operands[2].comps[i];
        if(srcComp == 0xff)
          srcComp = 0;

        const float asFloat = (float)swizzled.value.u32v[i];
        result.value.f32v[i] = ((int)srcComp < dim) ? 1.0f / asFloat : asFloat;
      }
      result.type = VarType::Float;
    }
    else if(op.infoRetType == RETTYPE_UINT)
    {
      result = swizzled;
      result.type = VarType::UInt;
    }

    // if we are assigning into a scalar, SetDst expects the result to be in .x (as normally we
    // are assigning FROM a scalar also).
    // to match this expectation, propagate the component across.
    if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
       op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
      result.value.u32v[0] = result.value.u32v[op.operands[0].comps[0]];

    SetDst(state, op.operands[0], op, result);
  }
  else
  {
    RDCERR("Unexpected relative addressing");
    SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), 0.0f, 0.0f, 0.0f, 0.0f));
  }

  break;
}
case OPCODE_SAMPLE:
case OPCODE_SAMPLE_L:
case OPCODE_SAMPLE_B:
case OPCODE_SAMPLE_D:
case OPCODE_SAMPLE_C:
case OPCODE_SAMPLE_C_LZ:
case OPCODE_LD:
case OPCODE_LD_MS:
case OPCODE_GATHER4:
case OPCODE_GATHER4_C:
case OPCODE_GATHER4_PO:
case OPCODE_GATHER4_PO_C:
case OPCODE_LOD:
{
  // Shared implementation of texture sampling, loads, gathers and LOD queries. The resource
  // and sampler declarations are looked up to find the bindings, then the actual lookup is
  // delegated to the API wrapper via CalculateSampleGather. A load from a buffer resource is
  // handled directly here by reading the cached SRV data.
  //
  // srcOpers holds the evaluated operands[1..], so srcOpers[i] corresponds to operands[i + 1].
  if(op.operation != OPCODE_LOD && state)
    state->flags |= ShaderEvents::SampleLoadGather;

  SamplerMode samplerMode = NUM_SAMPLERS;
  ResourceDimension resourceDim = RESOURCE_DIMENSION_UNKNOWN;
  DXBC::ResourceRetType resourceRetType = DXBC::RETURN_TYPE_UNKNOWN;
  int sampleCount = 0;

  // Default assumptions for bindings
  Operand destOperand = op.operands[0];
  Operand resourceOperand = op.operands[2];
  Operand samplerOperand;
  if(op.operands.size() > 3)
    samplerOperand = op.operands[3];

  // the programmable-offset gathers have an extra offset operand before the resource
  if(op.operation == OPCODE_GATHER4_PO || op.operation == OPCODE_GATHER4_PO_C)
  {
    resourceOperand = op.operands[3];
    samplerOperand = op.operands[4];
  }

  BindingSlot resourceBinding((uint32_t)resourceOperand.indices[0].index, 0);
  BindingSlot samplerBinding(0, 0);

  for(size_t i = 0; i < program->GetNumDeclarations(); i++)
  {
    const Declaration &decl = program->GetDeclaration(i);

    if(decl.declaration == OPCODE_DCL_SAMPLER && decl.operand.sameResource(samplerOperand))
    {
      samplerMode = decl.samplerMode;
      samplerBinding = GetBindingSlotForDeclaration(*program, decl);
    }
    if(op.operation == OPCODE_LD && decl.declaration == OPCODE_DCL_RESOURCE &&
       decl.resource.dim == RESOURCE_DIMENSION_BUFFER &&
       decl.operand.sameResource(resourceOperand))
    {
      // ld from a buffer: read straight out of the cached SRV contents and return from
      // StepNext without going through CalculateSampleGather
      resourceDim = decl.resource.dim;

      resourceBinding = GetBindingSlotForDeclaration(*program, decl);
      GlobalState::SRVIterator srv = global.srvs.find(resourceBinding);
      if(srv == global.srvs.end())
      {
        apiWrapper->FetchSRV(resourceBinding);
        srv = global.srvs.find(resourceBinding);
      }

      const byte *data = &srv->second.data[0];
      uint32_t offset = srv->second.firstElement;
      uint32_t numElems = srv->second.numElements;

      GlobalState::ViewFmt fmt = srv->second.format;

      data += fmt.Stride() * offset;

      ShaderVariable result;

      {
        // out of range loads return 0
        result = ShaderVariable(rdcstr(), 0.0f, 0.0f, 0.0f, 0.0f);

        if(srcOpers[0].value.u32v[0] < numElems &&
           data + srcOpers[0].value.u32v[0] * fmt.Stride() <= srv->second.data.end())
          result = TypedUAVLoad(fmt, data + srcOpers[0].value.u32v[0] * fmt.Stride());
      }

      // apply the swizzle on the resource operand, an unused (0xff) entry selects .x
      ShaderVariable fetch("", 0U, 0U, 0U, 0U);

      for(int c = 0; c < 4; c++)
      {
        uint8_t comp = resourceOperand.comps[c];
        if(resourceOperand.comps[c] == 0xff)
          comp = 0;

        fetch.value.u32v[c] = result.value.u32v[comp];
      }

      // if we are assigning into a scalar, SetDst expects the result to be in .x (as normally
      // we are assigning FROM a scalar also).
      // to match this expectation, propagate the component across.
      if(destOperand.comps[0] != 0xff && destOperand.comps[1] == 0xff &&
         destOperand.comps[2] == 0xff && destOperand.comps[3] == 0xff)
        fetch.value.u32v[0] = fetch.value.u32v[destOperand.comps[0]];

      SetDst(state, destOperand, op, fetch);

      MarkResourceAccess(state, TYPE_RESOURCE, resourceBinding);

      return;
    }
    if(decl.declaration == OPCODE_DCL_RESOURCE && decl.operand.sameResource(resourceOperand))
    {
      resourceDim = decl.resource.dim;
      resourceRetType = decl.resource.resType[0];
      sampleCount = decl.resource.sampleCount;

      resourceBinding = GetBindingSlotForDeclaration(*program, decl);

      // With SM5.1, resource arrays need to offset the shader register by the array index
      if(program->IsShaderModel51())
        resourceBinding.shaderRegister = srcOpers[1].value.u32v[1];

      // doesn't seem like these are ever less than four components, even if the texture is
      // declared <float3> for example.
      // shouldn't matter though is it just comes out in the wash.
      RDCASSERT(decl.resource.resType[0] == decl.resource.resType[1] &&
                decl.resource.resType[1] == decl.resource.resType[2] &&
                decl.resource.resType[2] == decl.resource.resType[3]);
      RDCASSERT(decl.resource.resType[0] != DXBC::RETURN_TYPE_CONTINUED &&
                decl.resource.resType[0] != DXBC::RETURN_TYPE_UNUSED &&
                decl.resource.resType[0] != DXBC::RETURN_TYPE_MIXED &&
                decl.resource.resType[0] >= 0 &&
                decl.resource.resType[0] < DXBC::NUM_RETURN_TYPES);
    }
  }

  // for lod operation, it's only defined for certain resources - otherwise just returns 0
  if(op.operation == OPCODE_LOD && resourceDim != RESOURCE_DIMENSION_TEXTURE1D &&
     resourceDim != RESOURCE_DIMENSION_TEXTURE1DARRAY &&
     resourceDim != RESOURCE_DIMENSION_TEXTURE2D &&
     resourceDim != RESOURCE_DIMENSION_TEXTURE2DARRAY &&
     resourceDim != RESOURCE_DIMENSION_TEXTURE3D && resourceDim != RESOURCE_DIMENSION_TEXTURECUBE)
  {
    ShaderVariable invalidResult("tex", 0.0f, 0.0f, 0.0f, 0.0f);

    SetDst(state, destOperand, op, invalidResult);
    break;
  }

  ShaderVariable uv = srcOpers[0];
  ShaderVariable ddxCalc;
  ShaderVariable ddyCalc;

  // these ops need DDX/DDY
  if(op.operation == OPCODE_SAMPLE || op.operation == OPCODE_SAMPLE_B ||
     op.operation == OPCODE_SAMPLE_C || op.operation == OPCODE_LOD)
  {
    if(program->GetShaderType() != DXBC::ShaderType::Pixel || prevWorkgroup.size() != 4)
    {
      RDCERR(
          "Attempt to use derivative instruction not in pixel shader. Undefined results will "
          "occur!");
    }
    else
    {
      // texture samples use coarse derivatives
      ddxCalc = DDX(false, prevWorkgroup, op.operands[1], op);
      ddyCalc = DDY(false, prevWorkgroup, op.operands[1], op);
    }
  }
  else if(op.operation == OPCODE_SAMPLE_D)
  {
    // sample_d supplies its derivatives explicitly after the sampler operand
    ddxCalc = srcOpers[3];
    ddyCalc = srcOpers[4];
  }

  int multisampleIndex = 0;
  if(srcOpers.size() >= 3)
    multisampleIndex = srcOpers[2].value.s32v[0];
  float lodOrCompareValue = 0.0f;
  if(srcOpers.size() >= 4)
    lodOrCompareValue = srcOpers[3].value.f32v[0];
  // gather4_po_c has six operands (dst, address, offset, resource, sampler, reference value).
  // srcOpers skips the destination, so the reference value is srcOpers[4]; srcOpers[5] would
  // be one past the end of the array.
  if(op.operation == OPCODE_GATHER4_PO_C)
    lodOrCompareValue = srcOpers[4].value.f32v[0];

  // resource swizzle handed to the API wrapper, an unused (0xff) entry selects .x
  uint8_t swizzle[4] = {0};
  for(int i = 0; i < 4; i++)
  {
    if(resourceOperand.comps[i] == 0xff)
      swizzle[i] = 0;
    else
      swizzle[i] = resourceOperand.comps[i];
  }

  // for gathers the channel to fetch is encoded as the sampler operand's first component
  GatherChannel gatherChannel = GatherChannel::Red;
  if(op.operation == OPCODE_GATHER4 || op.operation == OPCODE_GATHER4_C ||
     op.operation == OPCODE_GATHER4_PO || op.operation == OPCODE_GATHER4_PO_C)
  {
    gatherChannel = (GatherChannel)samplerOperand.comps[0];
  }

  // for bias instruction we can't do a SampleGradBias, so add the bias into the sampler state.
  float samplerBias = 0.0f;
  if(op.operation == OPCODE_SAMPLE_B)
    samplerBias = srcOpers[3].value.f32v[0];

  SampleGatherResourceData resourceData;
  resourceData.dim = resourceDim;
  resourceData.retType = resourceRetType;
  resourceData.sampleCount = sampleCount;
  resourceData.binding = resourceBinding;

  SampleGatherSamplerData samplerData;
  samplerData.mode = samplerMode;
  samplerData.binding = samplerBinding;
  samplerData.bias = samplerBias;

  MarkResourceAccess(state, TYPE_RESOURCE, resourceBinding);

  ShaderVariable lookupResult("tex", 0.0f, 0.0f, 0.0f, 0.0f);
  if(apiWrapper->CalculateSampleGather(op.operation, resourceData, samplerData, uv, ddxCalc,
                                       ddyCalc, op.texelOffset, multisampleIndex,
                                       lodOrCompareValue, swizzle, gatherChannel,
                                       op.str.c_str(), lookupResult))
  {
    // should be a better way of doing this
    // for a single-component destination, move the selected component down into .x
    if(destOperand.comps[1] == 0xff)
      lookupResult.value.s32v[0] = lookupResult.value.s32v[destOperand.comps[0]];

    SetDst(state, destOperand, op, lookupResult);
  }
  else
  {
    // the lookup failed: leave the destination untouched and return from StepNext
    return;
  }
  break;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////
// Flow control
case OPCODE_SWITCH:
{
uint32_t switchValue = GetSrc(op.operands[0], op).value.u32v[0];
int depth = 0;
uint32_t jumpLocation = 0;
uint32_t search = nextInstruction;
for(; search < (uint32_t)program->GetNumInstructions(); search++)
{
const Operation &nextOp = program->GetInstruction((size_t)search);
// track nested switch statements to ensure we don't accidentally pick the case from a
// different switch
if(nextOp.operation == OPCODE_SWITCH)
{
depth++;
}
else if(depth > 0 && nextOp.operation == OPCODE_ENDSWITCH)
{
depth--;
}
else if(depth == 0)
{
// note the default: location as jumpLocation if we haven't found a matching
// case yet. If we find one later, this will be overridden
if(nextOp.operation == OPCODE_DEFAULT)
jumpLocation = search;
// reached end of our switch statement
if(nextOp.operation == OPCODE_ENDSWITCH)
break;
if(nextOp.operation == OPCODE_CASE)
{
uint32_t caseValue = GetSrc(nextOp.operands[0], nextOp).value.u32v[0];
// comparison is defined to be bitwise
if(caseValue == switchValue)
{
// we've found our case, break out
jumpLocation = search;
break;
}
}
}
}
// jumpLocation points to the case we're taking, either a matching case or default
if(jumpLocation == 0)
{
RDCERR("Didn't find matching case or default: for switch(%u)!", switchValue);
}
else
{
// skip straight past any case or default labels as we don't want to step to them; we want
// nextInstruction to point at the next executable instruction (which might be a break if
// we're doing nothing)
for(; jumpLocation < (uint32_t)program->GetNumInstructions(); jumpLocation++)
{
const Operation &nextOp = program->GetInstruction(jumpLocation);
if(nextOp.operation != OPCODE_CASE && nextOp.operation != OPCODE_DEFAULT)
break;
}
nextInstruction = jumpLocation;
}
break;
}
case OPCODE_CASE:
case OPCODE_DEFAULT:
case OPCODE_LOOP:
case OPCODE_ENDSWITCH:
case OPCODE_ENDIF:
// do nothing. Basically just an anonymous label that is used elsewhere
// (IF/ELSE/SWITCH/ENDLOOP/BREAK)
break;
case OPCODE_CONTINUE:
case OPCODE_CONTINUEC:
case OPCODE_ENDLOOP:
{
int depth = 0;
int32_t test = op.operation == OPCODE_CONTINUEC ? GetSrc(op.operands[0], op).value.s32v[0] : 0;
if(op.operation == OPCODE_CONTINUE || op.operation == OPCODE_CONTINUEC)
depth = 1;
if((test == 0 && !op.nonzero()) || (test != 0 && op.nonzero()) ||
op.operation == OPCODE_CONTINUE || op.operation == OPCODE_ENDLOOP)
{
// skip back one to the endloop that we're processing
nextInstruction--;
for(; nextInstruction >= 0; nextInstruction--)
{
if(program->GetInstruction(nextInstruction).operation == OPCODE_ENDLOOP)
depth++;
if(program->GetInstruction(nextInstruction).operation == OPCODE_LOOP)
depth--;
if(depth == 0)
{
break;
}
}
RDCASSERT(nextInstruction >= 0);
}
break;
}
case OPCODE_BREAK:
case OPCODE_BREAKC:
{
int32_t test = op.operation == OPCODE_BREAKC ? GetSrc(op.operands[0], op).value.s32v[0] : 0;
if((test == 0 && !op.nonzero()) || (test != 0 && op.nonzero()) || op.operation == OPCODE_BREAK)
{
// break out (jump to next endloop/endswitch)
int depth = 1;
for(; nextInstruction < (int)program->GetNumInstructions(); nextInstruction++)
{
if(program->GetInstruction(nextInstruction).operation == OPCODE_LOOP ||
program->GetInstruction(nextInstruction).operation == OPCODE_SWITCH)
depth++;
if(program->GetInstruction(nextInstruction).operation == OPCODE_ENDLOOP ||
program->GetInstruction(nextInstruction).operation == OPCODE_ENDSWITCH)
depth--;
if(depth == 0)
{
break;
}
}
RDCASSERT(program->GetInstruction(nextInstruction).operation == OPCODE_ENDLOOP ||
program->GetInstruction(nextInstruction).operation == OPCODE_ENDSWITCH);
// don't want to process the endloop and jump again!
nextInstruction++;
}
break;
}
case OPCODE_IF:
{
int32_t test = GetSrc(op.operands[0], op).value.s32v[0];
if((test == 0 && !op.nonzero()) || (test != 0 && op.nonzero()))
{
// nothing, we go into the if.
}
else
{
// jump to after the next matching else/endif
int depth = 0;
// skip back one to the if that we're processing
nextInstruction--;
for(; nextInstruction < (int)program->GetNumInstructions(); nextInstruction++)
{
if(program->GetInstruction(nextInstruction).operation == OPCODE_IF)
depth++;
// only step out on an else if it's the matching depth to our starting if (depth == 1)
if(depth == 1 && program->GetInstruction(nextInstruction).operation == OPCODE_ELSE)
depth--;
if(program->GetInstruction(nextInstruction).operation == OPCODE_ENDIF)
depth--;
if(depth == 0)
{
break;
}
}
RDCASSERT(program->GetInstruction(nextInstruction).operation == OPCODE_ELSE ||
program->GetInstruction(nextInstruction).operation == OPCODE_ENDIF);
// step to next instruction after the else/endif (processing an else would skip that block)
nextInstruction++;
}
break;
}
case OPCODE_ELSE:
{
// if we hit an else then we've just processed the if() bracket and need to break out (jump to
// next endif)
int depth = 1;
for(; nextInstruction < (int)program->GetNumInstructions(); nextInstruction++)
{
if(program->GetInstruction(nextInstruction).operation == OPCODE_IF)
depth++;
if(program->GetInstruction(nextInstruction).operation == OPCODE_ENDIF)
depth--;
if(depth == 0)
{
break;
}
}
RDCASSERT(program->GetInstruction(nextInstruction).operation == OPCODE_ENDIF);
// step to next instruction after the else/endif (for consistency with handling in the if
// block)
nextInstruction++;
break;
}
case OPCODE_DISCARD:
{
int32_t test = GetSrc(op.operands[0], op).value.s32v[0];
if((test != 0 && !op.nonzero()) || (test == 0 && op.nonzero()))
{
// don't discard
break;
}
// discarding.
done = true;
break;
}
case OPCODE_RET:
case OPCODE_RETC:
{
int32_t test = op.operation == OPCODE_RETC ? GetSrc(op.operands[0], op).value.s32v[0] : 0;
if((test == 0 && !op.nonzero()) || (test != 0 && op.nonzero()) || op.operation == OPCODE_RET)
{
// assumes not in a function call
done = true;
}
break;
}
//////////////////////////////////////////////////////////////////////////
// Vendor extensions
//////////////////////////////////////////////////////////////////////////
case OPCODE_AMD_U64_ATOMIC:
case OPCODE_NV_U64_ATOMIC:
{
VendorAtomicOp atomicOp = (VendorAtomicOp)op.preciseValues;
uint32_t resIndex = (uint32_t)op.operands[2].indices[0].index;
ShaderVariable dstAddress, compare, value;
int param = 2;
if(op.texelOffset[0] == 1)
{
// single operand for address - simple
dstAddress = srcOpers[param++];
}
else if(op.texelOffset[0] == 2)
{
dstAddress = srcOpers[param++];
dstAddress.value.u32v[1] = srcOpers[param++].value.u32v[0];
dstAddress.value.u32v[2] = srcOpers[param++].value.u32v[2];
}
else
{
RDCERR("Unexpected parameter compression value %d ", op.texelOffset[0]);
break;
}
if(atomicOp == ATOMIC_OP_CAS)
{
if(op.texelOffset[1] == 1)
{
compare = srcOpers[param++];
}
else if(op.texelOffset[1] == 2)
{
compare = srcOpers[param++];
compare.value.u32v[1] = srcOpers[param++].value.u32v[0];
compare.value.u32v[2] = srcOpers[param++].value.u32v[2];
}
else
{
RDCERR("Unexpected parameter compression value %d ", op.texelOffset[1]);
break;
}
}
if(op.texelOffset[2] == 1)
{
value = srcOpers[param++];
}
else if(op.texelOffset[2] == 2)
{
value = srcOpers[param++];
value.value.u32v[1] = srcOpers[param++].value.u32v[0];
value.value.u32v[2] = srcOpers[param++].value.u32v[2];
}
else
{
RDCERR("Unexpected parameter compression value %d ", op.texelOffset[2]);
break;
}
BindingSlot slot = GetBindingSlotForIdentifier(*program, TYPE_UNORDERED_ACCESS_VIEW, resIndex);
GlobalState::UAVIterator uav = global.uavs.find(slot);
if(uav == global.uavs.end())
{
apiWrapper->FetchUAV(slot);
uav = global.uavs.find(slot);
}
MarkResourceAccess(state, TYPE_UNORDERED_ACCESS_VIEW, slot);
const uint32_t stride = sizeof(uint64_t);
byte *data = &uav->second.data[0];
RDCASSERT(data);
if(data)
{
if(uav->second.tex)
{
data += dstAddress.value.u32v[0] * stride;
data += dstAddress.value.u32v[1] * uav->second.rowPitch;
data += dstAddress.value.u32v[2] * uav->second.depthPitch;
}
else
{
data += uav->second.firstElement * stride + dstAddress.value.u32v[0];
}
}
if(data && data < uav->second.data.end() && !Finished())
{
ShaderVariable result(rdcstr(), 0U, 0U, 0U, 0U);
uint64_t *data64 = (uint64_t *)data;
result.value.u32v[0] = uint32_t(*data64);
SetDst(state, op.operands[0], op, result);
result.value.u32v[0] = uint32_t((*data64) >> 32U);
SetDst(state, op.operands[1], op, result);
uint64_t compare64 = compare.value.u64v[0];
uint64_t value64 = value.value.u64v[0];
switch(atomicOp)
{
case ATOMIC_OP_NONE: break;
case ATOMIC_OP_AND: *data64 = *data64 & value64; break;
case ATOMIC_OP_OR: *data64 = *data64 | value64; break;
case ATOMIC_OP_XOR: *data64 = *data64 ^ value64; break;
case ATOMIC_OP_ADD: *data64 = *data64 + value64; break;
case ATOMIC_OP_MAX: *data64 = RDCMAX(*data64, value64); break;
case ATOMIC_OP_MIN: *data64 = RDCMIN(*data64, value64); break;
case ATOMIC_OP_SWAP: *data64 = value64; break;
case ATOMIC_OP_CAS:
if(*data64 == compare64)
*data64 = value64;
break;
}
}
break;
}
//////////////////////////////////////////////////////////////////////////
//
//////////////////////////////////////////////////////////////////////////
default:
{
RDCERR("Unsupported operation %d in assembly debugging", op.operation);
break;
}
}
}