void ThreadState::StepNext()

in renderdoc/driver/shaders/dxbc/dxbc_debug.cpp [1956:4504]


void ThreadState::StepNext(ShaderDebugState *state, DebugAPIWrapper *apiWrapper,
                           const rdcarray<ThreadState> &prevWorkgroup)
{
  if(nextInstruction >= program->GetNumInstructions())
    return;

  const Operation &op = program->GetInstruction((size_t)nextInstruction);

  apiWrapper->SetCurrentInstruction(nextInstruction);
  nextInstruction++;

  if(nextInstruction >= program->GetNumInstructions())
    nextInstruction--;

  if(state && debug)
  {
    const Operation &nextOp = program->GetInstruction((size_t)nextInstruction);
    debug->GetCallstack(nextInstruction, nextOp.offset, state->callstack);
  }

  rdcarray<ShaderVariable> srcOpers;

  VarType optype = OperationType(op.operation);

  for(size_t i = 1; i < op.operands.size(); i++)
    srcOpers.push_back(GetSrc(op.operands[i], op));

  switch(op.operation)
  {
      /////////////////////////////////////////////////////////////////////////////////////////////////////
      // Math operations

    case OPCODE_DADD:
    case OPCODE_IADD:
    case OPCODE_ADD:
      SetDst(state, op.operands[0], op, add(srcOpers[0], srcOpers[1], optype));
      break;
    case OPCODE_DDIV:
    case OPCODE_DIV:
      SetDst(state, op.operands[0], op, div(srcOpers[0], srcOpers[1], optype));
      break;
    case OPCODE_UDIV:
    {
      ShaderVariable quot("", (uint32_t)0xffffffff, (uint32_t)0xffffffff, (uint32_t)0xffffffff,
                          (uint32_t)0xffffffff);
      ShaderVariable rem("", (uint32_t)0xffffffff, (uint32_t)0xffffffff, (uint32_t)0xffffffff,
                         (uint32_t)0xffffffff);

      for(size_t i = 0; i < 4; i++)
      {
        if(srcOpers[2].value.u32v[i] != 0)
        {
          quot.value.u32v[i] = srcOpers[1].value.u32v[i] / srcOpers[2].value.u32v[i];
          rem.value.u32v[i] =
              srcOpers[1].value.u32v[i] - (quot.value.u32v[i] * srcOpers[2].value.u32v[i]);
        }
        else
        {
          if(state)
            state->flags |= ShaderEvents::GeneratedNanOrInf;
        }
      }

      if(op.operands[0].type != TYPE_NULL)
      {
        SetDst(state, op.operands[0], op, quot);
      }
      if(op.operands[1].type != TYPE_NULL)
      {
        SetDst(state, op.operands[1], op, rem);
      }
      break;
    }
    case OPCODE_BFREV:
    {
      ShaderVariable ret("", 0U, 0U, 0U, 0U);

      for(size_t i = 0; i < 4; i++)
      {
        ret.value.u32v[i] = BitwiseReverseLSB16(srcOpers[0].value.u32v[i]);
      }

      SetDst(state, op.operands[0], op, ret);

      break;
    }
    case OPCODE_COUNTBITS:
    {
      ShaderVariable ret("", 0U, 0U, 0U, 0U);

      for(size_t i = 0; i < 4; i++)
      {
        ret.value.u32v[i] = PopCount(srcOpers[0].value.u32v[i]);
      }

      SetDst(state, op.operands[0], op, ret);
      break;
    }
    case OPCODE_FIRSTBIT_HI:
    {
      ShaderVariable ret("", 0U, 0U, 0U, 0U);

      for(size_t i = 0; i < 4; i++)
      {
        unsigned char found = BitScanReverse((DWORD *)&ret.value.u32v[i], srcOpers[0].value.u32v[i]);
        if(found == 0)
        {
          ret.value.u32v[i] = ~0U;
        }
        else
        {
          // firstbit_hi counts index 0 as the MSB, BitScanReverse counts index 0 as the LSB. So we
          // need to invert
          ret.value.u32v[i] = 31 - ret.value.u32v[i];
        }
      }

      SetDst(state, op.operands[0], op, ret);
      break;
    }
    case OPCODE_FIRSTBIT_LO:
    {
      ShaderVariable ret("", 0U, 0U, 0U, 0U);

      for(size_t i = 0; i < 4; i++)
      {
        unsigned char found = BitScanForward((DWORD *)&ret.value.u32v[i], srcOpers[0].value.u32v[i]);
        if(found == 0)
          ret.value.u32v[i] = ~0U;
      }

      SetDst(state, op.operands[0], op, ret);
      break;
    }
    case OPCODE_FIRSTBIT_SHI:
    {
      ShaderVariable ret("", 0U, 0U, 0U, 0U);

      for(size_t i = 0; i < 4; i++)
      {
        uint32_t u = srcOpers[0].value.u32v[i];
        if(srcOpers[0].value.s32v[i] < 0)
          u = ~u;

        unsigned char found = BitScanReverse((DWORD *)&ret.value.u32v[i], u);

        if(found == 0)
        {
          ret.value.u32v[i] = ~0U;
        }
        else
        {
          // firstbit_shi counts index 0 as the MSB, BitScanReverse counts index 0 as the LSB. So we
          // need to invert
          ret.value.u32v[i] = 31 - ret.value.u32v[i];
        }
      }

      SetDst(state, op.operands[0], op, ret);
      break;
    }
    case OPCODE_IMUL:
    case OPCODE_UMUL:
    {
      ShaderVariable hi("", 0U, 0U, 0U, 0U);
      ShaderVariable lo("", 0U, 0U, 0U, 0U);

      for(size_t i = 0; i < 4; i++)
      {
        if(op.operation == OPCODE_UMUL)
        {
          uint64_t res = uint64_t(srcOpers[1].value.u32v[i]) * uint64_t(srcOpers[2].value.u32v[i]);

          hi.value.u32v[i] = uint32_t((res >> 32) & 0xffffffff);
          lo.value.u32v[i] = uint32_t(res & 0xffffffff);
        }
        else if(op.operation == OPCODE_IMUL)
        {
          int64_t res = int64_t(srcOpers[1].value.s32v[i]) * int64_t(srcOpers[2].value.s32v[i]);

          hi.value.u32v[i] = uint32_t((res >> 32) & 0xffffffff);
          lo.value.u32v[i] = uint32_t(res & 0xffffffff);
        }
      }

      if(op.operands[0].type != TYPE_NULL)
      {
        SetDst(state, op.operands[0], op, hi);
      }
      if(op.operands[1].type != TYPE_NULL)
      {
        SetDst(state, op.operands[1], op, lo);
      }
      break;
    }
    case OPCODE_DMUL:
    case OPCODE_MUL:
      SetDst(state, op.operands[0], op, mul(srcOpers[0], srcOpers[1], optype));
      break;
    case OPCODE_UADDC:
    {
      uint64_t src[4];
      for(int i = 0; i < 4; i++)
        src[i] = (uint64_t)srcOpers[1].value.u32v[i];
      for(int i = 0; i < 4; i++)
        src[i] = (uint64_t)srcOpers[2].value.u32v[i];

      // set the rounded result
      uint32_t dst[4];

      for(int i = 0; i < 4; i++)
        dst[i] = (uint32_t)(src[i] & 0xffffffff);

      SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), dst[0], dst[1], dst[2], dst[3]));

      // if not null, set the carry bits
      if(op.operands[1].type != TYPE_NULL)
        SetDst(state, op.operands[1], op,
               ShaderVariable(rdcstr(), src[0] > 0xffffffff ? 1U : 0U, src[1] > 0xffffffff ? 1U : 0U,
                              src[2] > 0xffffffff ? 1U : 0U, src[3] > 0xffffffff ? 1U : 0U));

      break;
    }
    case OPCODE_USUBB:
    {
      uint64_t src0[4];
      uint64_t src1[4];

      // add on a 'borrow' bit
      for(int i = 0; i < 4; i++)
        src0[i] = 0x100000000 | (uint64_t)srcOpers[1].value.u32v[i];
      for(int i = 0; i < 4; i++)
        src1[i] = (uint64_t)srcOpers[2].value.u32v[i];

      // do the subtract
      uint64_t result[4];
      for(int i = 0; i < 4; i++)
        result[i] = src0[i] - src1[i];

      uint32_t dst[4];
      for(int i = 0; i < 4; i++)
        dst[i] = (uint32_t)(result[0] & 0xffffffff);

      SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), dst[0], dst[1], dst[2], dst[3]));

      // if not null, mark where the borrow bits were used
      if(op.operands[1].type != TYPE_NULL)
        SetDst(state, op.operands[1], op,
               ShaderVariable(rdcstr(), result[0] <= 0xffffffff ? 1U : 0U,
                              result[1] <= 0xffffffff ? 1U : 0U, result[2] <= 0xffffffff ? 1U : 0U,
                              result[3] <= 0xffffffff ? 1U : 0U));

      break;
    }
    case OPCODE_IMAD:
    case OPCODE_UMAD:
    case OPCODE_MAD:
    case OPCODE_DFMA:
      SetDst(state, op.operands[0], op,
             add(mul(srcOpers[0], srcOpers[1], optype), srcOpers[2], optype));
      break;
    case OPCODE_DP2:
    case OPCODE_DP3:
    case OPCODE_DP4:
    {
      ShaderVariable dot = mul(srcOpers[0], srcOpers[1], optype);

      float sum = dot.value.f32v[0];
      sum += dot.value.f32v[1];
      if(op.operation >= OPCODE_DP3)
        sum += dot.value.f32v[2];
      if(op.operation >= OPCODE_DP4)
        sum += dot.value.f32v[3];

      SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), sum, sum, sum, sum));
      break;
    }
    case OPCODE_F16TOF32:
    {
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(),
                            flush_denorm(ConvertFromHalf(srcOpers[0].value.u32v[0] & 0xffff)),
                            flush_denorm(ConvertFromHalf(srcOpers[0].value.u32v[1] & 0xffff)),
                            flush_denorm(ConvertFromHalf(srcOpers[0].value.u32v[2] & 0xffff)),
                            flush_denorm(ConvertFromHalf(srcOpers[0].value.u32v[3] & 0xffff))));
      break;
    }
    case OPCODE_F32TOF16:
    {
      SetDst(
          state, op.operands[0], op,
          ShaderVariable(rdcstr(), (uint32_t)ConvertToHalf(flush_denorm(srcOpers[0].value.f32v[0])),
                         (uint32_t)ConvertToHalf(flush_denorm(srcOpers[0].value.f32v[1])),
                         (uint32_t)ConvertToHalf(flush_denorm(srcOpers[0].value.f32v[2])),
                         (uint32_t)ConvertToHalf(flush_denorm(srcOpers[0].value.f32v[3]))));
      break;
    }
    case OPCODE_FRC:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), srcOpers[0].value.f32v[0] - floorf(srcOpers[0].value.f32v[0]),
                            srcOpers[0].value.f32v[1] - floorf(srcOpers[0].value.f32v[1]),
                            srcOpers[0].value.f32v[2] - floorf(srcOpers[0].value.f32v[2]),
                            srcOpers[0].value.f32v[3] - floorf(srcOpers[0].value.f32v[3])));
      break;
    // positive infinity
    case OPCODE_ROUND_PI:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), ceilf(srcOpers[0].value.f32v[0]),
                            ceilf(srcOpers[0].value.f32v[1]), ceilf(srcOpers[0].value.f32v[2]),
                            ceilf(srcOpers[0].value.f32v[3])));
      break;
    // negative infinity
    case OPCODE_ROUND_NI:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), floorf(srcOpers[0].value.f32v[0]),
                            floorf(srcOpers[0].value.f32v[1]), floorf(srcOpers[0].value.f32v[2]),
                            floorf(srcOpers[0].value.f32v[3])));
      break;
    // towards zero
    case OPCODE_ROUND_Z:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(),
                            srcOpers[0].value.f32v[0] < 0 ? ceilf(srcOpers[0].value.f32v[0])
                                                          : floorf(srcOpers[0].value.f32v[0]),
                            srcOpers[0].value.f32v[1] < 0 ? ceilf(srcOpers[0].value.f32v[1])
                                                          : floorf(srcOpers[0].value.f32v[1]),
                            srcOpers[0].value.f32v[2] < 0 ? ceilf(srcOpers[0].value.f32v[2])
                                                          : floorf(srcOpers[0].value.f32v[2]),
                            srcOpers[0].value.f32v[3] < 0 ? ceilf(srcOpers[0].value.f32v[3])
                                                          : floorf(srcOpers[0].value.f32v[3])));
      break;
    // to nearest even int (banker's rounding)
    case OPCODE_ROUND_NE:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), round_ne(srcOpers[0].value.f32v[0]),
                            round_ne(srcOpers[0].value.f32v[1]), round_ne(srcOpers[0].value.f32v[2]),
                            round_ne(srcOpers[0].value.f32v[3])));
      break;
    case OPCODE_INEG: SetDst(state, op.operands[0], op, neg(srcOpers[0], optype)); break;
    case OPCODE_IMIN:
      SetDst(state, op.operands[0], op,
             ShaderVariable(
                 "",
                 srcOpers[0].value.s32v[0] < srcOpers[1].value.s32v[0] ? srcOpers[0].value.s32v[0]
                                                                       : srcOpers[1].value.s32v[0],
                 srcOpers[0].value.s32v[1] < srcOpers[1].value.s32v[1] ? srcOpers[0].value.s32v[1]
                                                                       : srcOpers[1].value.s32v[1],
                 srcOpers[0].value.s32v[2] < srcOpers[1].value.s32v[2] ? srcOpers[0].value.s32v[2]
                                                                       : srcOpers[1].value.s32v[2],
                 srcOpers[0].value.s32v[3] < srcOpers[1].value.s32v[3] ? srcOpers[0].value.s32v[3]
                                                                       : srcOpers[1].value.s32v[3]));
      break;
    case OPCODE_UMIN:
      SetDst(state, op.operands[0], op,
             ShaderVariable(
                 "",
                 srcOpers[0].value.u32v[0] < srcOpers[1].value.u32v[0] ? srcOpers[0].value.u32v[0]
                                                                       : srcOpers[1].value.u32v[0],
                 srcOpers[0].value.u32v[1] < srcOpers[1].value.u32v[1] ? srcOpers[0].value.u32v[1]
                                                                       : srcOpers[1].value.u32v[1],
                 srcOpers[0].value.u32v[2] < srcOpers[1].value.u32v[2] ? srcOpers[0].value.u32v[2]
                                                                       : srcOpers[1].value.u32v[2],
                 srcOpers[0].value.u32v[3] < srcOpers[1].value.u32v[3] ? srcOpers[0].value.u32v[3]
                                                                       : srcOpers[1].value.u32v[3]));
      break;
    case OPCODE_DMIN:
    {
      double src0[2], src1[2];
      DoubleGet(srcOpers[0], src0);
      DoubleGet(srcOpers[1], src1);

      double dst[2];
      dst[0] = dxbc_min(src0[0], src1[0]);
      dst[1] = dxbc_min(src0[1], src1[1]);

      ShaderVariable r("", 0U, 0U, 0U, 0U);
      DoubleSet(r, dst);

      SetDst(state, op.operands[0], op, r);
      break;
    }
    case OPCODE_MIN:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), dxbc_min(srcOpers[0].value.f32v[0], srcOpers[1].value.f32v[0]),
                            dxbc_min(srcOpers[0].value.f32v[1], srcOpers[1].value.f32v[1]),
                            dxbc_min(srcOpers[0].value.f32v[2], srcOpers[1].value.f32v[2]),
                            dxbc_min(srcOpers[0].value.f32v[3], srcOpers[1].value.f32v[3])));
      break;
    case OPCODE_UMAX:
      SetDst(state, op.operands[0], op,
             ShaderVariable(
                 "",
                 srcOpers[0].value.u32v[0] >= srcOpers[1].value.u32v[0] ? srcOpers[0].value.u32v[0]
                                                                        : srcOpers[1].value.u32v[0],
                 srcOpers[0].value.u32v[1] >= srcOpers[1].value.u32v[1] ? srcOpers[0].value.u32v[1]
                                                                        : srcOpers[1].value.u32v[1],
                 srcOpers[0].value.u32v[2] >= srcOpers[1].value.u32v[2] ? srcOpers[0].value.u32v[2]
                                                                        : srcOpers[1].value.u32v[2],
                 srcOpers[0].value.u32v[3] >= srcOpers[1].value.u32v[3] ? srcOpers[0].value.u32v[3]
                                                                        : srcOpers[1].value.u32v[3]));
      break;
    case OPCODE_IMAX:
      SetDst(state, op.operands[0], op,
             ShaderVariable(
                 "",
                 srcOpers[0].value.s32v[0] >= srcOpers[1].value.s32v[0] ? srcOpers[0].value.s32v[0]
                                                                        : srcOpers[1].value.s32v[0],
                 srcOpers[0].value.s32v[1] >= srcOpers[1].value.s32v[1] ? srcOpers[0].value.s32v[1]
                                                                        : srcOpers[1].value.s32v[1],
                 srcOpers[0].value.s32v[2] >= srcOpers[1].value.s32v[2] ? srcOpers[0].value.s32v[2]
                                                                        : srcOpers[1].value.s32v[2],
                 srcOpers[0].value.s32v[3] >= srcOpers[1].value.s32v[3] ? srcOpers[0].value.s32v[3]
                                                                        : srcOpers[1].value.s32v[3]));
      break;
    case OPCODE_DMAX:
    {
      double src0[2], src1[2];
      DoubleGet(srcOpers[0], src0);
      DoubleGet(srcOpers[1], src1);

      double dst[2];
      dst[0] = dxbc_max(src0[0], src1[0]);
      dst[1] = dxbc_max(src0[1], src1[1]);

      ShaderVariable r("", 0U, 0U, 0U, 0U);
      DoubleSet(r, dst);

      SetDst(state, op.operands[0], op, r);
      break;
    }
    case OPCODE_MAX:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), dxbc_max(srcOpers[0].value.f32v[0], srcOpers[1].value.f32v[0]),
                            dxbc_max(srcOpers[0].value.f32v[1], srcOpers[1].value.f32v[1]),
                            dxbc_max(srcOpers[0].value.f32v[2], srcOpers[1].value.f32v[2]),
                            dxbc_max(srcOpers[0].value.f32v[3], srcOpers[1].value.f32v[3])));
      break;
    case OPCODE_SQRT:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), sqrtf(srcOpers[0].value.f32v[0]),
                            sqrtf(srcOpers[0].value.f32v[1]), sqrtf(srcOpers[0].value.f32v[2]),
                            sqrtf(srcOpers[0].value.f32v[3])));
      break;
    case OPCODE_DRCP:
    {
      double ds[2] = {0.0, 0.0};
      DoubleGet(srcOpers[0], ds);
      ds[0] = 1.0f / ds[0];
      ds[1] = 1.0f / ds[1];

      ShaderVariable r("", 0U, 0U, 0U, 0U);
      DoubleSet(r, ds);

      SetDst(state, op.operands[0], op, r);
      break;
    }

    case OPCODE_IBFE:
    {
      // bottom 5 bits
      ShaderVariable width("", (int32_t)(srcOpers[0].value.s32v[0] & 0x1f),
                           (int32_t)(srcOpers[0].value.s32v[1] & 0x1f),
                           (int32_t)(srcOpers[0].value.s32v[2] & 0x1f),
                           (int32_t)(srcOpers[0].value.s32v[3] & 0x1f));
      ShaderVariable offset("", (int32_t)(srcOpers[1].value.s32v[0] & 0x1f),
                            (int32_t)(srcOpers[1].value.s32v[1] & 0x1f),
                            (int32_t)(srcOpers[1].value.s32v[2] & 0x1f),
                            (int32_t)(srcOpers[1].value.s32v[3] & 0x1f));

      ShaderVariable dest("", (int32_t)0, (int32_t)0, (int32_t)0, (int32_t)0);

      for(int comp = 0; comp < 4; comp++)
      {
        if(width.value.s32v[comp] == 0)
        {
          dest.value.s32v[comp] = 0;
        }
        else if(width.value.s32v[comp] + offset.value.s32v[comp] < 32)
        {
          dest.value.s32v[comp] = srcOpers[2].value.s32v[comp]
                                  << (32 - (width.value.s32v[comp] + offset.value.s32v[comp]));
          dest.value.s32v[comp] = dest.value.s32v[comp] >> (32 - width.value.s32v[comp]);
        }
        else
        {
          dest.value.s32v[comp] = srcOpers[2].value.s32v[comp] >> offset.value.s32v[comp];
        }
      }

      SetDst(state, op.operands[0], op, dest);
      break;
    }
    case OPCODE_UBFE:
    {
      // bottom 5 bits
      ShaderVariable width("", (uint32_t)(srcOpers[0].value.u32v[0] & 0x1f),
                           (uint32_t)(srcOpers[0].value.u32v[1] & 0x1f),
                           (uint32_t)(srcOpers[0].value.u32v[2] & 0x1f),
                           (uint32_t)(srcOpers[0].value.u32v[3] & 0x1f));
      ShaderVariable offset("", (uint32_t)(srcOpers[1].value.u32v[0] & 0x1f),
                            (uint32_t)(srcOpers[1].value.u32v[1] & 0x1f),
                            (uint32_t)(srcOpers[1].value.u32v[2] & 0x1f),
                            (uint32_t)(srcOpers[1].value.u32v[3] & 0x1f));

      ShaderVariable dest("", (uint32_t)0, (uint32_t)0, (uint32_t)0, (uint32_t)0);

      for(int comp = 0; comp < 4; comp++)
      {
        if(width.value.u32v[comp] == 0)
        {
          dest.value.u32v[comp] = 0;
        }
        else if(width.value.u32v[comp] + offset.value.u32v[comp] < 32)
        {
          dest.value.u32v[comp] = srcOpers[2].value.u32v[comp]
                                  << (32 - (width.value.u32v[comp] + offset.value.u32v[comp]));
          dest.value.u32v[comp] = dest.value.u32v[comp] >> (32 - width.value.u32v[comp]);
        }
        else
        {
          dest.value.u32v[comp] = srcOpers[2].value.u32v[comp] >> offset.value.u32v[comp];
        }
      }

      SetDst(state, op.operands[0], op, dest);
      break;
    }
    case OPCODE_BFI:
    {
      // bottom 5 bits
      ShaderVariable width("", (uint32_t)(srcOpers[0].value.u32v[0] & 0x1f),
                           (uint32_t)(srcOpers[0].value.u32v[1] & 0x1f),
                           (uint32_t)(srcOpers[0].value.u32v[2] & 0x1f),
                           (uint32_t)(srcOpers[0].value.u32v[3] & 0x1f));
      ShaderVariable offset("", (uint32_t)(srcOpers[1].value.u32v[0] & 0x1f),
                            (uint32_t)(srcOpers[1].value.u32v[1] & 0x1f),
                            (uint32_t)(srcOpers[1].value.u32v[2] & 0x1f),
                            (uint32_t)(srcOpers[1].value.u32v[3] & 0x1f));

      ShaderVariable dest("", (uint32_t)0, (uint32_t)0, (uint32_t)0, (uint32_t)0);

      for(int comp = 0; comp < 4; comp++)
      {
        uint32_t bitmask =
            (((1 << width.value.u32v[comp]) - 1) << offset.value.u32v[comp]) & 0xffffffff;
        dest.value.u32v[comp] =
            (uint32_t)(((srcOpers[2].value.u32v[comp] << offset.value.u32v[comp]) & bitmask) |
                       (srcOpers[3].value.u32v[comp] & ~bitmask));
      }

      SetDst(state, op.operands[0], op, dest);
      break;
    }
    case OPCODE_ISHL:
    {
      uint32_t shifts[] = {
          srcOpers[1].value.u32v[0] & 0x1f,
          srcOpers[1].value.u32v[1] & 0x1f,
          srcOpers[1].value.u32v[2] & 0x1f,
          srcOpers[1].value.u32v[3] & 0x1f,
      };

      // if we were only given a single component, it's the form that shifts all components
      // by the same amount
      if(op.operands[2].numComponents == NUMCOMPS_1 ||
         (op.operands[2].comps[2] < 4 && op.operands[2].comps[2] == 0xff))
        shifts[3] = shifts[2] = shifts[1] = shifts[0];

      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), srcOpers[0].value.s32v[0] << shifts[0],
                            srcOpers[0].value.s32v[1] << shifts[1],
                            srcOpers[0].value.s32v[2] << shifts[2],
                            srcOpers[0].value.s32v[3] << shifts[3]));
      break;
    }
    case OPCODE_USHR:
    {
      uint32_t shifts[] = {
          srcOpers[1].value.u32v[0] & 0x1f,
          srcOpers[1].value.u32v[1] & 0x1f,
          srcOpers[1].value.u32v[2] & 0x1f,
          srcOpers[1].value.u32v[3] & 0x1f,
      };

      // if we were only given a single component, it's the form that shifts all components
      // by the same amount
      if(op.operands[2].numComponents == NUMCOMPS_1 ||
         (op.operands[2].comps[2] < 4 && op.operands[2].comps[2] == 0xff))
        shifts[3] = shifts[2] = shifts[1] = shifts[0];

      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), srcOpers[0].value.u32v[0] >> shifts[0],
                            srcOpers[0].value.u32v[1] >> shifts[1],
                            srcOpers[0].value.u32v[2] >> shifts[2],
                            srcOpers[0].value.u32v[3] >> shifts[3]));
      break;
    }
    case OPCODE_ISHR:
    {
      uint32_t shifts[] = {
          srcOpers[1].value.u32v[0] & 0x1f,
          srcOpers[1].value.u32v[1] & 0x1f,
          srcOpers[1].value.u32v[2] & 0x1f,
          srcOpers[1].value.u32v[3] & 0x1f,
      };

      // if we were only given a single component, it's the form that shifts all components
      // by the same amount
      if(op.operands[2].numComponents == NUMCOMPS_1 ||
         (op.operands[2].comps[2] < 4 && op.operands[2].comps[2] == 0xff))
        shifts[3] = shifts[2] = shifts[1] = shifts[0];

      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), srcOpers[0].value.s32v[0] >> shifts[0],
                            srcOpers[0].value.s32v[1] >> shifts[1],
                            srcOpers[0].value.s32v[2] >> shifts[2],
                            srcOpers[0].value.s32v[3] >> shifts[3]));
      break;
    }
    case OPCODE_AND:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), srcOpers[0].value.s32v[0] & srcOpers[1].value.s32v[0],
                            srcOpers[0].value.s32v[1] & srcOpers[1].value.s32v[1],
                            srcOpers[0].value.s32v[2] & srcOpers[1].value.s32v[2],
                            srcOpers[0].value.s32v[3] & srcOpers[1].value.s32v[3]));
      break;
    case OPCODE_OR:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), srcOpers[0].value.s32v[0] | srcOpers[1].value.s32v[0],
                            srcOpers[0].value.s32v[1] | srcOpers[1].value.s32v[1],
                            srcOpers[0].value.s32v[2] | srcOpers[1].value.s32v[2],
                            srcOpers[0].value.s32v[3] | srcOpers[1].value.s32v[3]));
      break;
    case OPCODE_XOR:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), srcOpers[0].value.u32v[0] ^ srcOpers[1].value.u32v[0],
                            srcOpers[0].value.u32v[1] ^ srcOpers[1].value.u32v[1],
                            srcOpers[0].value.u32v[2] ^ srcOpers[1].value.u32v[2],
                            srcOpers[0].value.u32v[3] ^ srcOpers[1].value.u32v[3]));
      break;
    case OPCODE_NOT:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), ~srcOpers[0].value.u32v[0], ~srcOpers[0].value.u32v[1],
                            ~srcOpers[0].value.u32v[2], ~srcOpers[0].value.u32v[3]));
      break;

      /////////////////////////////////////////////////////////////////////////////////////////////////////
      // transcendental functions with loose ULP requirements, so we pass them to the GPU to get
      // more accurate (well, LESS accurate but more representative) answers.

    case OPCODE_RCP:
    case OPCODE_RSQ:
    case OPCODE_EXP:
    case OPCODE_LOG:
    {
      ShaderVariable calcResultA("calcA", 0.0f, 0.0f, 0.0f, 0.0f);
      ShaderVariable calcResultB("calcB", 0.0f, 0.0f, 0.0f, 0.0f);
      if(apiWrapper->CalculateMathIntrinsic(op.operation, srcOpers[0], calcResultA, calcResultB))
      {
        SetDst(state, op.operands[0], op, calcResultA);
      }
      else
      {
        return;
      }
      break;
    }
    case OPCODE_SINCOS:
    {
      ShaderVariable calcResultA("calcA", 0.0f, 0.0f, 0.0f, 0.0f);
      ShaderVariable calcResultB("calcB", 0.0f, 0.0f, 0.0f, 0.0f);
      if(apiWrapper->CalculateMathIntrinsic(OPCODE_SINCOS, srcOpers[1], calcResultA, calcResultB))
      {
        if(op.operands[0].type != TYPE_NULL)
          SetDst(state, op.operands[0], op, calcResultA);
        if(op.operands[1].type != TYPE_NULL)
          SetDst(state, op.operands[1], op, calcResultB);
      }
      else
      {
        return;
      }
      break;
    }

      /////////////////////////////////////////////////////////////////////////////////////////////////////
      // Misc

    case OPCODE_NOP:
    case OPCODE_CUSTOMDATA:
    case OPCODE_OPAQUE_CUSTOMDATA:
    case OPCODE_SHADER_MESSAGE:
    case OPCODE_DCL_IMMEDIATE_CONSTANT_BUFFER: break;
    case OPCODE_SYNC:    // might never need to implement this. Who knows!
      break;
    case OPCODE_DMOV:
    case OPCODE_MOV: SetDst(state, op.operands[0], op, srcOpers[0]); break;
    case OPCODE_DMOVC:
      SetDst(
          state, op.operands[0], op,
          ShaderVariable(
              "", srcOpers[0].value.u32v[0] ? srcOpers[1].value.u32v[0] : srcOpers[2].value.u32v[0],
              srcOpers[0].value.u32v[0] ? srcOpers[1].value.u32v[1] : srcOpers[2].value.u32v[1],
              srcOpers[0].value.u32v[1] ? srcOpers[1].value.u32v[2] : srcOpers[2].value.u32v[2],
              srcOpers[0].value.u32v[1] ? srcOpers[1].value.u32v[3] : srcOpers[2].value.u32v[3]));
      break;
    case OPCODE_MOVC:
      SetDst(
          state, op.operands[0], op,
          ShaderVariable(
              "", srcOpers[0].value.s32v[0] ? srcOpers[1].value.s32v[0] : srcOpers[2].value.s32v[0],
              srcOpers[0].value.s32v[1] ? srcOpers[1].value.s32v[1] : srcOpers[2].value.s32v[1],
              srcOpers[0].value.s32v[2] ? srcOpers[1].value.s32v[2] : srcOpers[2].value.s32v[2],
              srcOpers[0].value.s32v[3] ? srcOpers[1].value.s32v[3] : srcOpers[2].value.s32v[3]));
      break;
    case OPCODE_SWAPC:
      SetDst(
          state, op.operands[0], op,
          ShaderVariable(
              "", srcOpers[1].value.s32v[0] ? srcOpers[3].value.s32v[0] : srcOpers[2].value.s32v[0],
              srcOpers[1].value.s32v[1] ? srcOpers[3].value.s32v[1] : srcOpers[2].value.s32v[1],
              srcOpers[1].value.s32v[2] ? srcOpers[3].value.s32v[2] : srcOpers[2].value.s32v[2],
              srcOpers[1].value.s32v[3] ? srcOpers[3].value.s32v[3] : srcOpers[2].value.s32v[3]));

      SetDst(
          state, op.operands[1], op,
          ShaderVariable(
              "", srcOpers[1].value.s32v[0] ? srcOpers[2].value.s32v[0] : srcOpers[3].value.s32v[0],
              srcOpers[1].value.s32v[1] ? srcOpers[2].value.s32v[1] : srcOpers[3].value.s32v[1],
              srcOpers[1].value.s32v[2] ? srcOpers[2].value.s32v[2] : srcOpers[3].value.s32v[2],
              srcOpers[1].value.s32v[3] ? srcOpers[2].value.s32v[3] : srcOpers[3].value.s32v[3]));
      break;
    case OPCODE_ITOF:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), (float)srcOpers[0].value.s32v[0],
                            (float)srcOpers[0].value.s32v[1], (float)srcOpers[0].value.s32v[2],
                            (float)srcOpers[0].value.s32v[3]));
      break;
    case OPCODE_UTOF:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), (float)srcOpers[0].value.u32v[0],
                            (float)srcOpers[0].value.u32v[1], (float)srcOpers[0].value.u32v[2],
                            (float)srcOpers[0].value.u32v[3]));
      break;
    case OPCODE_FTOI:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), (int)srcOpers[0].value.f32v[0], (int)srcOpers[0].value.f32v[1],
                            (int)srcOpers[0].value.f32v[2], (int)srcOpers[0].value.f32v[3]));
      break;
    case OPCODE_FTOU:
      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), (uint32_t)srcOpers[0].value.f32v[0],
                            (uint32_t)srcOpers[0].value.f32v[1], (uint32_t)srcOpers[0].value.f32v[2],
                            (uint32_t)srcOpers[0].value.f32v[3]));
      break;
    case OPCODE_ITOD:
    case OPCODE_UTOD:
    case OPCODE_FTOD:
    {
      double res[2];

      if(op.operation == OPCODE_ITOD)
      {
        res[0] = (double)srcOpers[0].value.s32v[0];
        res[1] = (double)srcOpers[0].value.s32v[1];
      }
      else if(op.operation == OPCODE_UTOD)
      {
        res[0] = (double)srcOpers[0].value.u32v[0];
        res[1] = (double)srcOpers[0].value.u32v[1];
      }
      else if(op.operation == OPCODE_FTOD)
      {
        res[0] = (double)srcOpers[0].value.f32v[0];
        res[1] = (double)srcOpers[0].value.f32v[1];
      }

      // if we only did a 1-wide double op, copy .xy into .zw so we can then
      // swizzle into .xy or .zw freely on the destination operand.
      // e.g. ftod r0.zw, r0.z - if we didn't do this, there'd be nothing valid in .zw
      if(op.operands[1].comps[2] == 0xff)
        res[1] = res[0];

      ShaderVariable r("", 0U, 0U, 0U, 0U);
      DoubleSet(r, res);

      SetDst(state, op.operands[0], op, r);
      break;
    }
    case OPCODE_DTOI:
    case OPCODE_DTOU:
    case OPCODE_DTOF:
    {
      // Narrow up to two doubles from the source into 32-bit values (signed int, unsigned int or
      // float depending on the opcode).
      double wide[2];
      DoubleGet(srcOpers[0], wide);

      // The destination mask has special behaviour: the first result goes into the first masked
      // component and the second result into the second masked component. So for .xz the first
      // goes into .x and the second into .z, while for .y the first goes into .y and the second
      // goes nowhere. The results are placed directly into the masked components here.
      const Operand &dst = op.operands[0];

      // with only one component in the mask there is only one result to place
      const int numResults = (dst.comps[1] == 0xff) ? 1 : 2;

      ShaderVariable narrowed("", 0U, 0U, 0U, 0U);

      for(int lane = 0; lane < numResults; lane++)
      {
        const uint8_t comp = dst.comps[lane];

        switch(op.operation)
        {
          case OPCODE_DTOU: narrowed.value.u32v[comp] = uint32_t(wide[lane]); break;
          case OPCODE_DTOI: narrowed.value.s32v[comp] = int32_t(wide[lane]); break;
          case OPCODE_DTOF: narrowed.value.f32v[comp] = float(wide[lane]); break;
          default: break;
        }
      }

      SetDst(state, dst, op, narrowed);
      break;
    }

      /////////////////////////////////////////////////////////////////////////////////////////////////////
      // Comparison

    case OPCODE_EQ:
    case OPCODE_NE:
    case OPCODE_LT:
    case OPCODE_GE:
    {
      // Per-component float comparison of the two sources. Each component of the result is
      // all-ones when the comparison holds and zero otherwise.
      ShaderVariable mask(rdcstr(), 0U, 0U, 0U, 0U);

      for(size_t c = 0; c < 4; c++)
      {
        const float lhs = srcOpers[0].value.f32v[c];
        const float rhs = srcOpers[1].value.f32v[c];

        bool pass = false;
        switch(op.operation)
        {
          case OPCODE_EQ: pass = (lhs == rhs); break;
          case OPCODE_NE: pass = (lhs != rhs); break;
          case OPCODE_LT: pass = (lhs < rhs); break;
          case OPCODE_GE: pass = (lhs >= rhs); break;
          default: break;
        }

        mask.value.u32v[c] = pass ? ~0u : 0u;
      }

      SetDst(state, op.operands[0], op, mask);
      break;
    }
    case OPCODE_DEQ:
    case OPCODE_DNE:
    case OPCODE_DGE:
    case OPCODE_DLT:
    {
      // Double-precision comparison: up to two doubles per source are compared, each producing a
      // 32-bit all-ones (true) or zero (false) result.
      double lhs[2], rhs[2];
      DoubleGet(srcOpers[0], lhs);
      DoubleGet(srcOpers[1], rhs);

      uint32_t results[2] = {0, 0};

      for(int i = 0; i < 2; i++)
      {
        bool pass = false;
        switch(op.operation)
        {
          case OPCODE_DEQ: pass = (lhs[i] == rhs[i]); break;
          case OPCODE_DNE: pass = (lhs[i] != rhs[i]); break;
          case OPCODE_DGE: pass = (lhs[i] >= rhs[i]); break;
          case OPCODE_DLT: pass = (lhs[i] < rhs[i]); break;
          default: break;
        }

        results[i] = pass ? ~0U : 0U;
      }

      // The destination mask has special behaviour: the first comparison goes into the first
      // masked component and the second comparison into the second masked component. So for .xz
      // the first goes into .x and the second into .z, while for .y the first goes into .y and
      // the second goes nowhere. The results are placed directly into the masked components.
      const Operand &dst = op.operands[0];

      ShaderVariable placed("", 0U, 0U, 0U, 0U);

      placed.value.u32v[dst.comps[0]] = results[0];

      // only write the second result if the mask has a second component
      if(dst.comps[1] != 0xff)
        placed.value.u32v[dst.comps[1]] = results[1];

      SetDst(state, dst, op, placed);
      break;
    }
    case OPCODE_IEQ:
    case OPCODE_INE:
    case OPCODE_IGE:
    case OPCODE_ILT:
    case OPCODE_ULT:
    case OPCODE_UGE:
    {
      // Per-component integer comparison of the two sources. The I* opcodes compare the
      // components as signed values, the U* opcodes as unsigned. Each component of the result is
      // all-ones when the comparison holds and zero otherwise.
      const ShaderVariable &lhs = srcOpers[0];
      const ShaderVariable &rhs = srcOpers[1];

      ShaderVariable mask(rdcstr(), 0U, 0U, 0U, 0U);

      for(size_t c = 0; c < 4; c++)
      {
        bool pass = false;
        switch(op.operation)
        {
          case OPCODE_IEQ: pass = (lhs.value.s32v[c] == rhs.value.s32v[c]); break;
          case OPCODE_INE: pass = (lhs.value.s32v[c] != rhs.value.s32v[c]); break;
          case OPCODE_IGE: pass = (lhs.value.s32v[c] >= rhs.value.s32v[c]); break;
          case OPCODE_ILT: pass = (lhs.value.s32v[c] < rhs.value.s32v[c]); break;
          case OPCODE_ULT: pass = (lhs.value.u32v[c] < rhs.value.u32v[c]); break;
          case OPCODE_UGE: pass = (lhs.value.u32v[c] >= rhs.value.u32v[c]); break;
          default: break;
        }

        mask.value.u32v[c] = pass ? ~0u : 0u;
      }

      SetDst(state, op.operands[0], op, mask);
      break;
    }

      /////////////////////////////////////////////////////////////////////////////////////////////////////
      // Atomic instructions

    case OPCODE_IMM_ATOMIC_ALLOC:
    {
      // Returns the current value of the UAV's hidden counter in every destination component,
      // then increments the counter.
      BindingSlot uavSlot = GetBindingSlotForIdentifier(*program, TYPE_UNORDERED_ACCESS_VIEW,
                                                        srcOpers[0].value.u32v[0]);

      // fetch the UAV on demand if it hasn't been looked up before
      GlobalState::UAVIterator found = global.uavs.find(uavSlot);
      if(found == global.uavs.end())
      {
        apiWrapper->FetchUAV(uavSlot);
        found = global.uavs.find(uavSlot);
      }

      MarkResourceAccess(state, TYPE_UNORDERED_ACCESS_VIEW, uavSlot);

      // if it's not a buffer or the buffer is empty this UAV is NULL/invalid, so the counter is
      // left untouched and 0 is returned
      uint32_t counter = 0;
      if(!found->second.data.empty())
        counter = found->second.hiddenCounter++;

      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), counter, counter, counter, counter));
      break;
    }

    case OPCODE_IMM_ATOMIC_CONSUME:
    {
      // Decrements the UAV's hidden counter and returns the decremented value in every
      // destination component.
      BindingSlot uavSlot = GetBindingSlotForIdentifier(*program, TYPE_UNORDERED_ACCESS_VIEW,
                                                        srcOpers[0].value.u32v[0]);

      // fetch the UAV on demand if it hasn't been looked up before
      GlobalState::UAVIterator found = global.uavs.find(uavSlot);
      if(found == global.uavs.end())
      {
        apiWrapper->FetchUAV(uavSlot);
        found = global.uavs.find(uavSlot);
      }

      MarkResourceAccess(state, TYPE_UNORDERED_ACCESS_VIEW, uavSlot);

      // if it's not a buffer or the buffer is empty this UAV is NULL/invalid, so the counter is
      // left untouched and 0 is returned
      uint32_t counter = 0;
      if(!found->second.data.empty())
        counter = --found->second.hiddenCounter;

      SetDst(state, op.operands[0], op,
             ShaderVariable(rdcstr(), counter, counter, counter, counter));
      break;
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////
    // Derivative instructions

    // don't differentiate, coarse, fine, whatever. The spec lets us implement it all as fine.
    case OPCODE_DERIV_RTX:
    case OPCODE_DERIV_RTX_COARSE:
    case OPCODE_DERIV_RTX_FINE:
    {
      // X derivative. Only evaluated for pixel shaders when prevWorkgroup holds exactly 4
      // threads; otherwise an error is logged and the destination is not written.
      const bool haveQuad =
          (program->GetShaderType() == DXBC::ShaderType::Pixel && prevWorkgroup.size() == 4);

      if(haveQuad)
      {
        const bool fine = (op.operation == OPCODE_DERIV_RTX_FINE);
        SetDst(state, op.operands[0], op, DDX(fine, prevWorkgroup, op.operands[1], op));
      }
      else
      {
        RDCERR(
            "Attempt to use derivative instruction not in pixel shader. Undefined results will "
            "occur!");
      }
      break;
    }
    case OPCODE_DERIV_RTY:
    case OPCODE_DERIV_RTY_COARSE:
    case OPCODE_DERIV_RTY_FINE:
    {
      // Y derivative. Only evaluated for pixel shaders when prevWorkgroup holds exactly 4
      // threads; otherwise an error is logged and the destination is not written.
      const bool haveQuad =
          (program->GetShaderType() == DXBC::ShaderType::Pixel && prevWorkgroup.size() == 4);

      if(haveQuad)
      {
        const bool fine = (op.operation == OPCODE_DERIV_RTY_FINE);
        SetDst(state, op.operands[0], op, DDY(fine, prevWorkgroup, op.operands[1], op));
      }
      else
      {
        RDCERR(
            "Attempt to use derivative instruction not in pixel shader. Undefined results will "
            "occur!");
      }
      break;
    }

    /////////////////////////////////////////////////////////////////////////////////////////////////////
    // Buffer/Texture load and store

    // handle atomic operations all together
    case OPCODE_ATOMIC_IADD:
    case OPCODE_ATOMIC_IMAX:
    case OPCODE_ATOMIC_IMIN:
    case OPCODE_ATOMIC_AND:
    case OPCODE_ATOMIC_OR:
    case OPCODE_ATOMIC_XOR:
    case OPCODE_ATOMIC_CMP_STORE:
    case OPCODE_ATOMIC_UMAX:
    case OPCODE_ATOMIC_UMIN:
    case OPCODE_IMM_ATOMIC_IADD:
    case OPCODE_IMM_ATOMIC_IMAX:
    case OPCODE_IMM_ATOMIC_IMIN:
    case OPCODE_IMM_ATOMIC_AND:
    case OPCODE_IMM_ATOMIC_OR:
    case OPCODE_IMM_ATOMIC_XOR:
    case OPCODE_IMM_ATOMIC_EXCH:
    case OPCODE_IMM_ATOMIC_CMP_EXCH:
    case OPCODE_IMM_ATOMIC_UMAX:
    case OPCODE_IMM_ATOMIC_UMIN:
    {
      // Atomic read-modify-write of a single 32-bit value in a UAV (u#) or in thread group shared
      // memory (g#).
      //
      // The imm_atomic_* forms have an extra leading destination operand which receives the
      // value that was in memory before the operation, so all their other operands are shifted
      // along by one compared to the atomic_* forms.
      Operand beforeResult;
      uint32_t resIndex = 0;
      ShaderVariable *dstAddress = NULL;
      ShaderVariable *src0 = NULL;
      // src1 is only present for the compare-and-store/exchange operations
      ShaderVariable *src1 = NULL;
      bool gsm = false;

      if(op.operation == OPCODE_IMM_ATOMIC_IADD || op.operation == OPCODE_IMM_ATOMIC_IMAX ||
         op.operation == OPCODE_IMM_ATOMIC_IMIN || op.operation == OPCODE_IMM_ATOMIC_AND ||
         op.operation == OPCODE_IMM_ATOMIC_OR || op.operation == OPCODE_IMM_ATOMIC_XOR ||
         op.operation == OPCODE_IMM_ATOMIC_EXCH || op.operation == OPCODE_IMM_ATOMIC_CMP_EXCH ||
         op.operation == OPCODE_IMM_ATOMIC_UMAX || op.operation == OPCODE_IMM_ATOMIC_UMIN)
      {
        // operands: dst0 (previous value), dst1 (resource), dstAddress, src0[, src1]
        beforeResult = op.operands[0];
        resIndex = (uint32_t)op.operands[1].indices[0].index;
        gsm = (op.operands[1].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
        dstAddress = &srcOpers[1];
        src0 = &srcOpers[2];
        if(srcOpers.size() > 3)
          src1 = &srcOpers[3];
      }
      else
      {
        // operands: dst (resource), dstAddress, src0[, src1]
        beforeResult.type = TYPE_NULL;
        resIndex = (uint32_t)op.operands[0].indices[0].index;
        gsm = (op.operands[0].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
        dstAddress = &srcOpers[0];
        src0 = &srcOpers[1];
        if(srcOpers.size() > 2)
          src1 = &srcOpers[2];
      }

      uint32_t stride = 4;
      uint32_t offset = 0;
      uint32_t numElems = 0;
      bool structured = false;

      byte *data = NULL;

      if(gsm)
      {
        offset = 0;
        // resIndex == size() is already out of range, so this must be >= and not >
        if(resIndex >= global.groupshared.size())
        {
          numElems = 0;
          stride = 4;
          data = NULL;
        }
        else
        {
          numElems = global.groupshared[resIndex].count;
          stride = global.groupshared[resIndex].bytestride;
          data = global.groupshared[resIndex].data.data();
          structured = global.groupshared[resIndex].structured;
        }
      }
      else
      {
        BindingSlot slot =
            GetBindingSlotForIdentifier(*program, TYPE_UNORDERED_ACCESS_VIEW, resIndex);

        // fetch the UAV on demand if it hasn't been looked up before
        GlobalState::UAVIterator uav = global.uavs.find(slot);
        if(uav == global.uavs.end())
        {
          apiWrapper->FetchUAV(slot);
          uav = global.uavs.find(slot);
        }

        MarkResourceAccess(state, TYPE_UNORDERED_ACCESS_VIEW, slot);

        offset = uav->second.firstElement;
        numElems = uav->second.numElements;
        data = uav->second.data.data();

        // the stride and whether the byte offset in dstAddress.y applies come from how the UAV
        // was declared in the shader
        const DXBCBytecode::Declaration *pDecl =
            program->FindDeclaration(TYPE_UNORDERED_ACCESS_VIEW, resIndex);
        if(pDecl)
        {
          if(pDecl->declaration == OPCODE_DCL_UNORDERED_ACCESS_VIEW_RAW)
          {
            stride = 4;
            structured = false;
          }
          else if(pDecl->declaration == OPCODE_DCL_UNORDERED_ACCESS_VIEW_STRUCTURED)
          {
            stride = pDecl->structured.stride;
            structured = true;
          }
        }
      }

      RDCASSERT(data);

      // seems like .x is element index, and .y is byte address, in the dstAddress operand
      //
      // "Out of bounds addressing on u# causes nothing to be written to memory, except if the
      //  u# is structured, and byte offset into the struct (second component of the address) is
      //  causing the out of bounds access, then the entire contents of the UAV become undefined."
      //
      // "The number of components taken from the address is determined by the dimensionality of dst
      // u# or g#."

      if(data)
      {
        data += (offset + dstAddress->value.u32v[0]) * stride;
        if(structured)
          data += dstAddress->value.u32v[1];
      }

      // if out of bounds, undefined result is returned to dst0 for immediate operands,
      // so we only need to care about the in-bounds case.
      // Also helper/inactive pixels are not allowed to modify UAVs
      //
      // NOTE(review): this compares (firstElement + index) against numElements, whereas the
      // load/store path compares just the index against numElements - confirm which is intended.
      if(data && offset + dstAddress->value.u32v[0] < numElems && !Finished())
      {
        uint32_t *udst = (uint32_t *)data;
        int32_t *idst = (int32_t *)data;

        // imm_atomic_* return the value from before the operation in every component
        if(beforeResult.type != TYPE_NULL)
        {
          SetDst(state, beforeResult, op, ShaderVariable(rdcstr(), *udst, *udst, *udst, *udst));
        }

        uint32_t *usrc0 = src0->value.u32v.data();
        // src1 is NULL for everything except the compare operations, so don't go through it
        // unconditionally
        uint32_t *usrc1 = src1 ? src1->value.u32v.data() : NULL;

        int32_t *isrc0 = src0->value.s32v.data();

        switch(op.operation)
        {
          case OPCODE_IMM_ATOMIC_IADD:
          case OPCODE_ATOMIC_IADD: *udst = *udst + *usrc0; break;
          case OPCODE_IMM_ATOMIC_IMAX:
          case OPCODE_ATOMIC_IMAX: *idst = RDCMAX(*idst, *isrc0); break;
          case OPCODE_IMM_ATOMIC_IMIN:
          case OPCODE_ATOMIC_IMIN: *idst = RDCMIN(*idst, *isrc0); break;
          case OPCODE_IMM_ATOMIC_AND:
          case OPCODE_ATOMIC_AND: *udst = *udst & *usrc0; break;
          case OPCODE_IMM_ATOMIC_OR:
          case OPCODE_ATOMIC_OR: *udst = *udst | *usrc0; break;
          case OPCODE_IMM_ATOMIC_XOR:
          case OPCODE_ATOMIC_XOR: *udst = *udst ^ *usrc0; break;
          case OPCODE_IMM_ATOMIC_EXCH: *udst = *usrc0; break;
          case OPCODE_IMM_ATOMIC_CMP_EXCH:
          case OPCODE_ATOMIC_CMP_STORE:
            // src0 is the value to compare against memory, src1 is the value that gets written
            // if the comparison succeeds
            if(usrc1 && *udst == *usrc0)
              *udst = *usrc1;
            break;
          case OPCODE_IMM_ATOMIC_UMAX:
          case OPCODE_ATOMIC_UMAX: *udst = RDCMAX(*udst, *usrc0); break;
          case OPCODE_IMM_ATOMIC_UMIN:
          case OPCODE_ATOMIC_UMIN: *udst = RDCMIN(*udst, *usrc0); break;
          default: break;
        }
      }

      break;
    }

    // store and load paths are mostly identical
    case OPCODE_STORE_UAV_TYPED:
    case OPCODE_STORE_RAW:
    case OPCODE_STORE_STRUCTURED:

    case OPCODE_LD_RAW:
    case OPCODE_LD_UAV_TYPED:
    case OPCODE_LD_STRUCTURED:
    {
      // Shared implementation of typed/raw/structured loads and stores against SRVs, UAVs and
      // thread group shared memory. The steps are:
      //  1. decode which operand is the resource and what the element index/byte offset is
      //  2. locate the backing data (fetching the SRV/UAV on demand)
      //  3. bounds check, then either load into the destination or store from the source
      uint32_t resIndex = 0;
      uint32_t structOffset = 0;
      uint32_t elemIdx = 0;

      uint32_t texCoords[3] = {0, 0, 0};

      uint32_t stride = 0;

      bool srv = true;
      bool gsm = false;

      bool load = true;

      // swizzle on the resource operand, applied to the loaded value. Identity by default
      uint8_t resComps[4] = {0, 1, 2, 3};

      if(op.operation == OPCODE_STORE_UAV_TYPED || op.operation == OPCODE_STORE_RAW ||
         op.operation == OPCODE_STORE_STRUCTURED)
      {
        load = false;
      }

      if(load && state)
        state->flags |= ShaderEvents::SampleLoadGather;

      if(op.operation == OPCODE_LD_STRUCTURED || op.operation == OPCODE_STORE_STRUCTURED)
      {
        if(load)
        {
          // ld_structured: dst, element index, byte offset, resource
          resIndex = (uint32_t)op.operands[3].indices[0].index;
          srv = (op.operands[3].type == TYPE_RESOURCE);
          gsm = (op.operands[3].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
          memcpy(resComps, op.operands[3].comps, sizeof(resComps));

          stride = op.stride;
        }
        else
        {
          // store_structured: the resource is the destination operand
          resIndex = (uint32_t)op.operands[0].indices[0].index;
          srv = false;
          gsm = (op.operands[0].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
        }

        // if the operation itself didn't carry a stride, take it from the groupshared
        // declaration or the structured resource declaration
        if(stride == 0)
        {
          if(gsm && resIndex < global.groupshared.size())
          {
            stride = global.groupshared[resIndex].bytestride;
          }
          else if(!gsm)
          {
            OperandType declType = srv ? TYPE_RESOURCE : TYPE_UNORDERED_ACCESS_VIEW;
            OpcodeType declOpcode =
                srv ? OPCODE_DCL_RESOURCE_STRUCTURED : OPCODE_DCL_UNORDERED_ACCESS_VIEW_STRUCTURED;
            const DXBCBytecode::Declaration *pDecl = program->FindDeclaration(declType, resIndex);
            if(pDecl && pDecl->declaration == declOpcode)
              stride = pDecl->structured.stride;
          }
        }

        structOffset = srcOpers[1].value.u32v[0];
        elemIdx = srcOpers[0].value.u32v[0];
      }
      else if(op.operation == OPCODE_LD_UAV_TYPED || op.operation == OPCODE_STORE_UAV_TYPED)
      {
        if(load)
        {
          resIndex = (uint32_t)op.operands[2].indices[0].index;
          gsm = (op.operands[2].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
          memcpy(resComps, op.operands[2].comps, sizeof(resComps));
        }
        else
        {
          resIndex = (uint32_t)op.operands[0].indices[0].index;
          gsm = (op.operands[0].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
        }

        elemIdx = srcOpers[0].value.u32v[0];

        // could be a tex load
        texCoords[0] = srcOpers[0].value.u32v[0];
        texCoords[1] = srcOpers[0].value.u32v[1];
        texCoords[2] = srcOpers[0].value.u32v[2];

        // placeholder, replaced by the view format's stride once the UAV has been looked up
        stride = 4;
        srv = false;
      }
      else if(op.operation == OPCODE_LD_RAW || op.operation == OPCODE_STORE_RAW)
      {
        if(load)
        {
          resIndex = (uint32_t)op.operands[2].indices[0].index;
          srv = (op.operands[2].type == TYPE_RESOURCE);
          gsm = (op.operands[2].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
          memcpy(resComps, op.operands[2].comps, sizeof(resComps));
        }
        else
        {
          resIndex = (uint32_t)op.operands[0].indices[0].index;
          srv = false;
          gsm = (op.operands[0].type == TYPE_THREAD_GROUP_SHARED_MEMORY);
        }

        // the index is supposed to be a multiple of 4 but the behaviour seems to be to round down
        elemIdx = (srcOpers[0].value.u32v[0] & ~0x3);
        // raw addressing is in bytes
        stride = 1;
      }

      RDCASSERT(stride != 0);

      byte *data = NULL;
      size_t dataSize = 0;
      bool texData = false;
      uint32_t rowPitch = 0;
      uint32_t depthPitch = 0;
      uint32_t firstElem = 0;
      uint32_t numElems = 0;
      GlobalState::ViewFmt fmt;

      if(gsm)
      {
        firstElem = 0;
        // resIndex == size() is already out of range, so this must be >= and not >
        if(resIndex >= global.groupshared.size())
        {
          numElems = 0;
          stride = 4;
          data = NULL;
        }
        else
        {
          numElems = global.groupshared[resIndex].count;
          stride = global.groupshared[resIndex].bytestride;
          data = global.groupshared[resIndex].data.data();
          dataSize = global.groupshared[resIndex].data.size();
          // groupshared memory is treated as an array of 32-bit uints
          fmt.fmt = CompType::UInt;
          fmt.byteWidth = 4;
          fmt.numComps = global.groupshared[resIndex].bytestride / 4;
          fmt.stride = 0;
        }
        texData = false;
      }
      else
      {
        BindingSlot slot = GetBindingSlotForIdentifier(
            *program, srv ? TYPE_RESOURCE : TYPE_UNORDERED_ACCESS_VIEW, resIndex);

        if(srv)
        {
          // fetch the SRV on demand if it hasn't been looked up before
          GlobalState::SRVIterator srvIter = global.srvs.find(slot);
          if(srvIter == global.srvs.end())
          {
            apiWrapper->FetchSRV(slot);
            srvIter = global.srvs.find(slot);
          }

          MarkResourceAccess(state, TYPE_RESOURCE, slot);

          data = srvIter->second.data.data();
          dataSize = srvIter->second.data.size();
          firstElem = srvIter->second.firstElement;
          numElems = srvIter->second.numElements;
          fmt = srvIter->second.format;
        }
        else
        {
          // fetch the UAV on demand if it hasn't been looked up before
          GlobalState::UAVIterator uavIter = global.uavs.find(slot);
          if(uavIter == global.uavs.end())
          {
            apiWrapper->FetchUAV(slot);
            uavIter = global.uavs.find(slot);
          }

          MarkResourceAccess(state, TYPE_UNORDERED_ACCESS_VIEW, slot);

          data = uavIter->second.data.data();
          dataSize = uavIter->second.data.size();
          texData = uavIter->second.tex;
          rowPitch = uavIter->second.rowPitch;
          depthPitch = uavIter->second.depthPitch;
          firstElem = uavIter->second.firstElement;
          numElems = uavIter->second.numElements;
          fmt = uavIter->second.format;
        }

        if(op.operation == OPCODE_LD_UAV_TYPED || op.operation == OPCODE_STORE_UAV_TYPED)
          stride = fmt.Stride();
      }

      // indexing for raw views is in bytes, but firstElement/numElements is in format-sized
      // units. Multiply up by stride
      if(op.operation == OPCODE_LD_RAW || op.operation == OPCODE_STORE_RAW)
      {
        firstElem *= RDCMIN(4, fmt.byteWidth);
        numElems *= RDCMIN(4, fmt.byteWidth);
      }

      RDCASSERT(data);

      // byte offset of the addressed element from the start of the resource data
      size_t dataOffset = 0;

      if(texData)
      {
        dataOffset += texCoords[0] * fmt.Stride();
        dataOffset += texCoords[1] * rowPitch;
        dataOffset += texCoords[2] * depthPitch;
      }
      else
      {
        dataOffset += (firstElem + elemIdx) * stride;
        dataOffset += structOffset;
      }

      // missing data or out of bounds access: loads return 0, stores are dropped
      if(!data || (!texData && elemIdx >= numElems) || (texData && dataOffset >= dataSize))
      {
        if(load)
          SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), 0U, 0U, 0U, 0U));
      }
      else
      {
        data += dataOffset;

        // number of components that can be stored, used to cut off the store mask below
        int maxIndex = fmt.numComps;

        // index in srcOpers of the value to store
        uint32_t srcIdx = 1;
        if(op.operation == OPCODE_STORE_STRUCTURED || op.operation == OPCODE_LD_STRUCTURED)
        {
          // structured accesses have an extra byte offset operand before the value
          srcIdx = 2;
          // only as many dwords as remain in the struct after the offset
          maxIndex = (stride - structOffset) / sizeof(uint32_t);
          fmt.byteWidth = 4;
          fmt.numComps = 4;
          if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
             op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
            fmt.numComps = 1;
          fmt.fmt = CompType::UInt;
        }
        // raw loads/stores can come from any component (as long as it's within range of the data!)
        if(op.operation == OPCODE_LD_RAW || op.operation == OPCODE_STORE_RAW)
        {
          fmt.byteWidth = 4;

          // normally we can read 4 elements
          fmt.numComps = 4;
          // clamp to out of bounds based on numElems
          fmt.numComps = RDCMIN(fmt.numComps, int(numElems - elemIdx) / 4);
          maxIndex = fmt.numComps;

          if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
             op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
            fmt.numComps = 1;
          fmt.fmt = CompType::UInt;
        }

        if(load)
        {
          ShaderVariable result = TypedUAVLoad(fmt, data);

          // apply the swizzle on the resource operand
          ShaderVariable fetch("", 0U, 0U, 0U, 0U);

          for(int c = 0; c < 4; c++)
          {
            uint8_t comp = resComps[c];
            if(comp == 0xff)
              comp = 0;

            fetch.value.u32v[c] = result.value.u32v[comp];
          }

          if(op.operation != OPCODE_LD_RAW && op.operation != OPCODE_LD_STRUCTURED)
          {
            // if we are assigning into a scalar, SetDst expects the result to be in .x (as normally
            // we are assigning FROM a scalar also).
            // to match this expectation, propagate the component across.
            if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
               op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
              fetch.value.u32v[0] = fetch.value.u32v[op.operands[0].comps[0]];
          }

          SetDst(state, op.operands[0], op, fetch);
        }
        else if(!Finished())    // helper/inactive pixels can't modify UAVs
        {
          // NOTE(review): every iteration issues the same store call with the same arguments, so
          // this loop effectively just gates the store on the mask components being in range -
          // confirm whether a per-component store was intended.
          for(int i = 0; i < 4; i++)
          {
            uint8_t comp = op.operands[0].comps[i];
            // masks must be contiguous from x, if we reach the 'end' we're done
            if(comp == 0xff || comp >= maxIndex)
              break;

            TypedUAVStore(fmt, data, srcOpers[srcIdx]);
          }
        }
      }

      break;
    }

    case OPCODE_EVAL_CENTROID:
    case OPCODE_EVAL_SAMPLE_INDEX:
    case OPCODE_EVAL_SNAPPED:
    {
      // Evaluate an input at a centroid, sample index or snapped offset. The value is looked up
      // in the global sample-eval cache; when no matching entry exists the plain source value
      // is returned instead.

      // opcodes only seem to be supported for regular inputs
      RDCASSERT(op.operands[1].type == TYPE_INPUT);

      RDCASSERT(program->GetShaderType() == DXBC::ShaderType::Pixel);

      const Operand &dstOper = op.operands[0];
      const Operand &inputOper = op.operands[1];

      GlobalState::SampleEvalCacheKey cacheKey;

      cacheKey.quadIndex = workgroupIndex;

      // since this is TYPE_INPUT the register index can be looked up directly
      cacheKey.inputRegisterIndex = (int32_t)inputOper.indices[0].index;

      // count the components in the destination mask up to the first unused one
      for(int c = 0; c < 4 && dstOper.comps[c] != 0xff; c++)
        cacheKey.numComponents = c + 1;

      cacheKey.firstComponent = inputOper.comps[dstOper.comps[0]];

      switch(op.operation)
      {
        case OPCODE_EVAL_SAMPLE_INDEX: cacheKey.sample = srcOpers[1].value.s32v[0]; break;
        case OPCODE_EVAL_SNAPPED:
          cacheKey.offsetx = RDCCLAMP(srcOpers[1].value.s32v[0], -8, 7);
          cacheKey.offsety = RDCCLAMP(srcOpers[1].value.s32v[1], -8, 7);
          break;
        // OPCODE_EVAL_CENTROID leaves the key's sample and offset untouched
        default: break;
      }

      // look up this combination in the cache
      auto cached = global.sampleEvalCache.find(cacheKey);

      if(cached == global.sampleEvalCache.end())
      {
        // either the cache is empty (we're not rendering MSAA at all) so the interpolant should
        // just be returned, or something went wrong and the wanted item isn't cached, in which
        // case returning the interpolant is the best that can be done.

        if(!global.sampleEvalCache.empty())
        {
          apiWrapper->AddDebugMessage(
              MessageCategory::Shaders, MessageSeverity::Medium, MessageSource::RuntimeWarning,
              StringFormat::Fmt(
                  "Shader debugging %d: %s\n"
                  "No sample evaluate found in cache. Possible out-of-bounds sample index",
                  nextInstruction - 1, op.str.c_str()));
        }

        SetDst(state, dstOper, op, srcOpers[0]);
        break;
      }

      // cache hit: apply the source operand swizzle to the cached value
      ShaderVariable swizzled = cached->second;

      for(int i = 0; i < 4; i++)
      {
        const uint8_t comp = inputOper.comps[i];
        if(comp < 4)
          swizzled.value.u32v[i] = cached->second.value.u32v[comp];
      }

      SetDst(state, dstOper, op, swizzled);
      break;
    }

    case OPCODE_SAMPLE_INFO:
    case OPCODE_SAMPLE_POS:
    {
      size_t numIndices = program->IsShaderModel51() ? 2 : 1;
      bool isAbsoluteResource =
          (op.operands[1].indices.size() == numIndices && op.operands[1].indices[0].absolute &&
           !op.operands[1].indices[0].relative);
      BindingSlot slot;
      if(op.operands[1].type != TYPE_RASTERIZER)
      {
        UINT identifier = (UINT)(op.operands[1].indices[0].index & 0xffffffff);
        slot = GetBindingSlotForIdentifier(*program, op.operands[1].type, identifier);

        MarkResourceAccess(state, op.operands[1].type, slot);
      }
      ShaderVariable result =
          apiWrapper->GetSampleInfo(op.operands[1].type, isAbsoluteResource, slot, op.str.c_str());

      // "If there is no resource bound to the specified slot, 0 is returned."

      // lookup sample pos if we got a count from above
      if(op.operation == OPCODE_SAMPLE_POS && result.value.u32v[0] > 0 &&
         (op.operands[2].type == TYPE_IMMEDIATE32 || op.operands[2].type == TYPE_TEMP))
      {
        // assume standard sample pattern - this might not hold in all cases
        // http://msdn.microsoft.com/en-us/library/windows/desktop/ff476218(v=vs.85).aspx

        uint32_t sampleIndex = srcOpers[1].value.u32v[0];
        uint32_t sampleCount = result.value.u32v[0];

        if(sampleIndex >= sampleCount)
        {
          // Per HLSL docs, if sampleIndex is out of bounds a zero vector is returned
          RDCWARN("sample index %u is out of bounds on resource bound to sample_pos (%u samples)",
                  sampleIndex, sampleCount);
          result.value.f32v[0] = 0.0f;
          result.value.f32v[1] = 0.0f;
          result.value.f32v[2] = 0.0f;
          result.value.f32v[3] = 0.0f;
        }
        else
        {
          const float *sample_pattern = NULL;

// co-ordinates are given as (i,j) in 16ths of a pixel
#define _SMP(c) ((c) / 16.0f)

          if(sampleCount == 1)
          {
            RDCWARN("Non-multisampled texture being passed to sample_pos");

            apiWrapper->AddDebugMessage(
                MessageCategory::Shaders, MessageSeverity::Medium, MessageSource::RuntimeWarning,
                StringFormat::Fmt(
                    "Shader debugging %d: %s\nNon-multisampled texture being passed to sample_pos",
                    nextInstruction - 1, op.str.c_str()));

            sample_pattern = NULL;
          }
          else if(sampleCount == 2)
          {
            static const float pattern_2x[] = {
                _SMP(4.0f),
                _SMP(4.0f),
                _SMP(-4.0f),
                _SMP(-4.0f),
            };

            sample_pattern = &pattern_2x[0];
          }
          else if(sampleCount == 4)
          {
            static const float pattern_4x[] = {
                _SMP(-2.0f), _SMP(-6.0f), _SMP(6.0f), _SMP(-2.0f),
                _SMP(-6.0f), _SMP(2.0f),  _SMP(2.0f), _SMP(6.0f),
            };

            sample_pattern = &pattern_4x[0];
          }
          else if(sampleCount == 8)
          {
            static const float pattern_8x[] = {
                _SMP(1.0f),  _SMP(-3.0f), _SMP(-1.0f), _SMP(3.0f),  _SMP(5.0f),  _SMP(1.0f),
                _SMP(-3.0f), _SMP(-5.0f), _SMP(-5.0f), _SMP(5.0f),  _SMP(-7.0f), _SMP(-1.0f),
                _SMP(3.0f),  _SMP(7.0f),  _SMP(7.0f),  _SMP(-7.0f),
            };

            sample_pattern = &pattern_8x[0];
          }
          else if(sampleCount == 16)
          {
            static const float pattern_16x[] = {
                _SMP(1.0f),  _SMP(1.0f),  _SMP(-1.0f), _SMP(-3.0f), _SMP(-3.0f), _SMP(2.0f),
                _SMP(4.0f),  _SMP(-1.0f), _SMP(-5.0f), _SMP(-2.0f), _SMP(2.0f),  _SMP(5.0f),
                _SMP(5.0f),  _SMP(3.0f),  _SMP(3.0f),  _SMP(-5.0f), _SMP(-2.0f), _SMP(6.0f),
                _SMP(0.0f),  _SMP(-7.0f), _SMP(-4.0f), _SMP(-6.0f), _SMP(-6.0f), _SMP(4.0f),
                _SMP(-8.0f), _SMP(0.0f),  _SMP(7.0f),  _SMP(-4.0f), _SMP(6.0f),  _SMP(7.0f),
                _SMP(-7.0f), _SMP(-8.0f),
            };

            sample_pattern = &pattern_16x[0];
          }
          else    // unsupported sample count
          {
            RDCERR("Unsupported sample count on resource for sample_pos: %u", result.value.u32v[0]);

            sample_pattern = NULL;
          }

          if(sample_pattern == NULL)
          {
            result.value.f32v[0] = 0.0f;
            result.value.f32v[1] = 0.0f;
          }
          else
          {
            result.value.f32v[0] = sample_pattern[sampleIndex * 2 + 0];
            result.value.f32v[1] = sample_pattern[sampleIndex * 2 + 1];
          }
        }

#undef _SMP
      }

      // apply swizzle
      ShaderVariable swizzled("", 0.0f, 0.0f, 0.0f, 0.0f);

      for(int i = 0; i < 4; i++)
      {
        if(op.operands[1].comps[i] == 0xff)
          swizzled.value.u32v[i] = result.value.u32v[0];
        else
          swizzled.value.u32v[i] = result.value.u32v[op.operands[1].comps[i]];
      }

      // apply ret type
      if(op.operation == OPCODE_SAMPLE_POS)
      {
        result = swizzled;
        result.type = VarType::Float;
      }
      else if(op.infoRetType == RETTYPE_FLOAT)
      {
        result.value.f32v[0] = (float)swizzled.value.u32v[0];
        result.value.f32v[1] = (float)swizzled.value.u32v[1];
        result.value.f32v[2] = (float)swizzled.value.u32v[2];
        result.value.f32v[3] = (float)swizzled.value.u32v[3];
        result.type = VarType::Float;
      }
      else
      {
        result = swizzled;
        result.type = VarType::UInt;
      }

      // if we are assigning into a scalar, SetDst expects the result to be in .x (as normally we
      // are assigning FROM a scalar also).
      // to match this expectation, propagate the component across.
      if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
         op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
        result.value.u32v[0] = result.value.u32v[op.operands[0].comps[0]];

      SetDst(state, op.operands[0], op, result);

      break;
    }

    case OPCODE_BUFINFO:
    {
      size_t numIndices = program->IsShaderModel51() ? 2 : 1;
      if(op.operands[1].indices.size() == numIndices && op.operands[1].indices[0].absolute &&
         !op.operands[1].indices[0].relative)
      {
        UINT identifier = (UINT)(op.operands[1].indices[0].index & 0xffffffff);
        BindingSlot slot = GetBindingSlotForIdentifier(*program, op.operands[1].type, identifier);
        ShaderVariable result = apiWrapper->GetBufferInfo(op.operands[1].type, slot, op.str.c_str());

        MarkResourceAccess(state, op.operands[1].type, slot);

        // apply swizzle
        ShaderVariable swizzled("", 0.0f, 0.0f, 0.0f, 0.0f);

        for(int i = 0; i < 4; i++)
        {
          if(op.operands[1].comps[i] == 0xff)
            swizzled.value.u32v[i] = result.value.u32v[0];
          else
            swizzled.value.u32v[i] = result.value.u32v[op.operands[1].comps[i]];
        }

        result = swizzled;
        result.type = VarType::UInt;

        // if we are assigning into a scalar, SetDst expects the result to be in .x (as normally we
        // are assigning FROM a scalar also).
        // to match this expectation, propagate the component across.
        if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
           op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
          result.value.u32v[0] = result.value.u32v[op.operands[0].comps[0]];

        SetDst(state, op.operands[0], op, result);
      }
      else
      {
        RDCERR("Unexpected relative addressing");
        SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), 0.0f, 0.0f, 0.0f, 0.0f));
      }

      break;
    }

    case OPCODE_RESINFO:
    {
      // spec says "srcMipLevel is read as an unsigned integer scalar"
      uint32_t mipLevel = srcOpers[0].value.u32v[0];

      size_t numIndices = program->IsShaderModel51() ? 2 : 1;
      if(op.operands[2].indices.size() == numIndices && op.operands[2].indices[0].absolute &&
         !op.operands[2].indices[0].relative)
      {
        int dim = 0;
        UINT identifier = (UINT)(op.operands[2].indices[0].index & 0xffffffff);
        BindingSlot slot = GetBindingSlotForIdentifier(*program, op.operands[2].type, identifier);
        ShaderVariable result = apiWrapper->GetResourceInfo(op.operands[2].type, slot, mipLevel, dim);

        MarkResourceAccess(state, op.operands[2].type, slot);

        // need a valid dimension even if the resource was unbound, so
        // search for the declaration
        if(dim == 0)
        {
          const Declaration *pDecl =
              program->FindDeclaration(TYPE_RESOURCE, (uint32_t)op.operands[2].indices[0].index);
          if(pDecl && pDecl->declaration == OPCODE_DCL_RESOURCE)
          {
            switch(pDecl->resource.dim)
            {
              default:
              case RESOURCE_DIMENSION_UNKNOWN:
              case NUM_DIMENSIONS:
              case RESOURCE_DIMENSION_BUFFER:
              case RESOURCE_DIMENSION_RAW_BUFFER:
              case RESOURCE_DIMENSION_STRUCTURED_BUFFER:
              case RESOURCE_DIMENSION_TEXTURE1D:
              case RESOURCE_DIMENSION_TEXTURE1DARRAY: dim = 1; break;
              case RESOURCE_DIMENSION_TEXTURE2D:
              case RESOURCE_DIMENSION_TEXTURE2DMS:
              case RESOURCE_DIMENSION_TEXTURE2DARRAY:
              case RESOURCE_DIMENSION_TEXTURE2DMSARRAY:
              case RESOURCE_DIMENSION_TEXTURECUBE:
              case RESOURCE_DIMENSION_TEXTURECUBEARRAY: dim = 2; break;
              case RESOURCE_DIMENSION_TEXTURE3D: dim = 3; break;
            }
          }
        }

        // apply swizzle
        ShaderVariable swizzled("", 0.0f, 0.0f, 0.0f, 0.0f);

        for(int i = 0; i < 4; i++)
        {
          if(op.operands[2].comps[i] == 0xff)
            swizzled.value.u32v[i] = result.value.u32v[0];
          else
            swizzled.value.u32v[i] = result.value.u32v[op.operands[2].comps[i]];
        }

        // apply ret type
        if(op.infoRetType == RETTYPE_FLOAT)
        {
          result.value.f32v[0] = (float)swizzled.value.u32v[0];
          result.value.f32v[1] = (float)swizzled.value.u32v[1];
          result.value.f32v[2] = (float)swizzled.value.u32v[2];
          result.value.f32v[3] = (float)swizzled.value.u32v[3];
          result.type = VarType::Float;
        }
        else if(op.infoRetType == RETTYPE_RCPFLOAT)
        {
          // only width/height/depth values we set are reciprocated, other values
          // are just left as is
          if(dim <= 1)
            result.value.f32v[0] = 1.0f / (float)swizzled.value.u32v[0];
          else
            result.value.f32v[0] = (float)swizzled.value.u32v[0];

          if(dim <= 2)
            result.value.f32v[1] = 1.0f / (float)swizzled.value.u32v[1];
          else
            result.value.f32v[1] = (float)swizzled.value.u32v[1];

          if(dim <= 3)
            result.value.f32v[2] = 1.0f / (float)swizzled.value.u32v[2];
          else
            result.value.f32v[2] = (float)swizzled.value.u32v[2];

          result.value.f32v[3] = (float)swizzled.value.u32v[3];
          result.type = VarType::Float;
        }
        else if(op.infoRetType == RETTYPE_UINT)
        {
          result = swizzled;
          result.type = VarType::UInt;
        }

        // if we are assigning into a scalar, SetDst expects the result to be in .x (as normally we
        // are assigning FROM a scalar also).
        // to match this expectation, propagate the component across.
        if(op.operands[0].comps[0] != 0xff && op.operands[0].comps[1] == 0xff &&
           op.operands[0].comps[2] == 0xff && op.operands[0].comps[3] == 0xff)
          result.value.u32v[0] = result.value.u32v[op.operands[0].comps[0]];

        SetDst(state, op.operands[0], op, result);
      }
      else
      {
        RDCERR("Unexpected relative addressing");
        SetDst(state, op.operands[0], op, ShaderVariable(rdcstr(), 0.0f, 0.0f, 0.0f, 0.0f));
      }

      break;
    }
    case OPCODE_SAMPLE:
    case OPCODE_SAMPLE_L:
    case OPCODE_SAMPLE_B:
    case OPCODE_SAMPLE_D:
    case OPCODE_SAMPLE_C:
    case OPCODE_SAMPLE_C_LZ:
    case OPCODE_LD:
    case OPCODE_LD_MS:
    case OPCODE_GATHER4:
    case OPCODE_GATHER4_C:
    case OPCODE_GATHER4_PO:
    case OPCODE_GATHER4_PO_C:
    case OPCODE_LOD:
    {
      if(op.operation != OPCODE_LOD && state)
        state->flags |= ShaderEvents::SampleLoadGather;

      SamplerMode samplerMode = NUM_SAMPLERS;
      ResourceDimension resourceDim = RESOURCE_DIMENSION_UNKNOWN;
      DXBC::ResourceRetType resourceRetType = DXBC::RETURN_TYPE_UNKNOWN;
      int sampleCount = 0;

      // Default assumptions for bindings
      Operand destOperand = op.operands[0];
      Operand resourceOperand = op.operands[2];
      Operand samplerOperand;
      if(op.operands.size() > 3)
        samplerOperand = op.operands[3];
      if(op.operation == OPCODE_GATHER4_PO || op.operation == OPCODE_GATHER4_PO_C)
      {
        resourceOperand = op.operands[3];
        samplerOperand = op.operands[4];
      }

      BindingSlot resourceBinding((uint32_t)resourceOperand.indices[0].index, 0);
      BindingSlot samplerBinding(0, 0);

      for(size_t i = 0; i < program->GetNumDeclarations(); i++)
      {
        const Declaration &decl = program->GetDeclaration(i);

        if(decl.declaration == OPCODE_DCL_SAMPLER && decl.operand.sameResource(samplerOperand))
        {
          samplerMode = decl.samplerMode;
          samplerBinding = GetBindingSlotForDeclaration(*program, decl);
        }
        if(op.operation == OPCODE_LD && decl.declaration == OPCODE_DCL_RESOURCE &&
           decl.resource.dim == RESOURCE_DIMENSION_BUFFER &&
           decl.operand.sameResource(resourceOperand))
        {
          resourceDim = decl.resource.dim;

          resourceBinding = GetBindingSlotForDeclaration(*program, decl);
          GlobalState::SRVIterator srv = global.srvs.find(resourceBinding);
          if(srv == global.srvs.end())
          {
            apiWrapper->FetchSRV(resourceBinding);
            srv = global.srvs.find(resourceBinding);
          }

          const byte *data = &srv->second.data[0];
          uint32_t offset = srv->second.firstElement;
          uint32_t numElems = srv->second.numElements;

          GlobalState::ViewFmt fmt = srv->second.format;

          data += fmt.Stride() * offset;

          ShaderVariable result;

          {
            result = ShaderVariable(rdcstr(), 0.0f, 0.0f, 0.0f, 0.0f);

            if(srcOpers[0].value.u32v[0] < numElems &&
               data + srcOpers[0].value.u32v[0] * fmt.Stride() <= srv->second.data.end())
              result = TypedUAVLoad(fmt, data + srcOpers[0].value.u32v[0] * fmt.Stride());
          }

          ShaderVariable fetch("", 0U, 0U, 0U, 0U);

          for(int c = 0; c < 4; c++)
          {
            uint8_t comp = resourceOperand.comps[c];
            if(resourceOperand.comps[c] == 0xff)
              comp = 0;

            fetch.value.u32v[c] = result.value.u32v[comp];
          }

          // if we are assigning into a scalar, SetDst expects the result to be in .x (as normally
          // we are assigning FROM a scalar also).
          // to match this expectation, propagate the component across.
          if(destOperand.comps[0] != 0xff && destOperand.comps[1] == 0xff &&
             destOperand.comps[2] == 0xff && destOperand.comps[3] == 0xff)
            fetch.value.u32v[0] = fetch.value.u32v[destOperand.comps[0]];

          SetDst(state, destOperand, op, fetch);

          MarkResourceAccess(state, TYPE_RESOURCE, resourceBinding);

          return;
        }
        if(decl.declaration == OPCODE_DCL_RESOURCE && decl.operand.sameResource(resourceOperand))
        {
          resourceDim = decl.resource.dim;
          resourceRetType = decl.resource.resType[0];
          sampleCount = decl.resource.sampleCount;

          resourceBinding = GetBindingSlotForDeclaration(*program, decl);

          // With SM5.1, resource arrays need to offset the shader register by the array index
          if(program->IsShaderModel51())
            resourceBinding.shaderRegister = srcOpers[1].value.u32v[1];

          // doesn't seem like these are ever less than four components, even if the texture is
          // declared <float3> for example.
          // shouldn't matter though, as it just comes out in the wash.
          RDCASSERT(decl.resource.resType[0] == decl.resource.resType[1] &&
                    decl.resource.resType[1] == decl.resource.resType[2] &&
                    decl.resource.resType[2] == decl.resource.resType[3]);
          RDCASSERT(decl.resource.resType[0] != DXBC::RETURN_TYPE_CONTINUED &&
                    decl.resource.resType[0] != DXBC::RETURN_TYPE_UNUSED &&
                    decl.resource.resType[0] != DXBC::RETURN_TYPE_MIXED &&
                    decl.resource.resType[0] >= 0 &&
                    decl.resource.resType[0] < DXBC::NUM_RETURN_TYPES);
        }
      }

      // for lod operation, it's only defined for certain resources - otherwise just returns 0
      if(op.operation == OPCODE_LOD && resourceDim != RESOURCE_DIMENSION_TEXTURE1D &&
         resourceDim != RESOURCE_DIMENSION_TEXTURE1DARRAY &&
         resourceDim != RESOURCE_DIMENSION_TEXTURE2D &&
         resourceDim != RESOURCE_DIMENSION_TEXTURE2DARRAY &&
         resourceDim != RESOURCE_DIMENSION_TEXTURE3D && resourceDim != RESOURCE_DIMENSION_TEXTURECUBE)
      {
        ShaderVariable invalidResult("tex", 0.0f, 0.0f, 0.0f, 0.0f);

        SetDst(state, destOperand, op, invalidResult);
        break;
      }

      ShaderVariable uv = srcOpers[0];
      ShaderVariable ddxCalc;
      ShaderVariable ddyCalc;

      // these ops need DDX/DDY
      if(op.operation == OPCODE_SAMPLE || op.operation == OPCODE_SAMPLE_B ||
         op.operation == OPCODE_SAMPLE_C || op.operation == OPCODE_LOD)
      {
        if(program->GetShaderType() != DXBC::ShaderType::Pixel || prevWorkgroup.size() != 4)
        {
          RDCERR(
              "Attempt to use derivative instruction not in pixel shader. Undefined results will "
              "occur!");
        }
        else
        {
          // texture samples use coarse derivatives
          ddxCalc = DDX(false, prevWorkgroup, op.operands[1], op);
          ddyCalc = DDY(false, prevWorkgroup, op.operands[1], op);
        }
      }
      else if(op.operation == OPCODE_SAMPLE_D)
      {
        ddxCalc = srcOpers[3];
        ddyCalc = srcOpers[4];
      }

      int multisampleIndex = 0;
      if(srcOpers.size() >= 3)
        multisampleIndex = srcOpers[2].value.s32v[0];
      float lodOrCompareValue = 0.0f;
      if(srcOpers.size() >= 4)
        lodOrCompareValue = srcOpers[3].value.f32v[0];
      if(op.operation == OPCODE_GATHER4_PO_C)
        lodOrCompareValue = srcOpers[5].value.f32v[0];

      uint8_t swizzle[4] = {0};
      for(int i = 0; i < 4; i++)
      {
        if(resourceOperand.comps[i] == 0xff)
          swizzle[i] = 0;
        else
          swizzle[i] = resourceOperand.comps[i];
      }

      GatherChannel gatherChannel = GatherChannel::Red;
      if(op.operation == OPCODE_GATHER4 || op.operation == OPCODE_GATHER4_C ||
         op.operation == OPCODE_GATHER4_PO || op.operation == OPCODE_GATHER4_PO_C)
      {
        gatherChannel = (GatherChannel)samplerOperand.comps[0];
      }

      // for bias instruction we can't do a SampleGradBias, so add the bias into the sampler state.
      float samplerBias = 0.0f;
      if(op.operation == OPCODE_SAMPLE_B)
        samplerBias = srcOpers[3].value.f32v[0];

      SampleGatherResourceData resourceData;
      resourceData.dim = resourceDim;
      resourceData.retType = resourceRetType;
      resourceData.sampleCount = sampleCount;
      resourceData.binding = resourceBinding;

      SampleGatherSamplerData samplerData;
      samplerData.mode = samplerMode;
      samplerData.binding = samplerBinding;
      samplerData.bias = samplerBias;

      MarkResourceAccess(state, TYPE_RESOURCE, resourceBinding);

      ShaderVariable lookupResult("tex", 0.0f, 0.0f, 0.0f, 0.0f);
      if(apiWrapper->CalculateSampleGather(op.operation, resourceData, samplerData, uv, ddxCalc,
                                           ddyCalc, op.texelOffset, multisampleIndex,
                                           lodOrCompareValue, swizzle, gatherChannel,
                                           op.str.c_str(), lookupResult))
      {
        // should be a better way of doing this
        if(destOperand.comps[1] == 0xff)
          lookupResult.value.s32v[0] = lookupResult.value.s32v[destOperand.comps[0]];

        SetDst(state, destOperand, op, lookupResult);
      }
      else
      {
        return;
      }
      break;
    }

      /////////////////////////////////////////////////////////////////////////////////////////////////////
      // Flow control

    case OPCODE_SWITCH:
    {
      uint32_t switchValue = GetSrc(op.operands[0], op).value.u32v[0];

      int depth = 0;

      uint32_t jumpLocation = 0;

      uint32_t search = nextInstruction;

      for(; search < (uint32_t)program->GetNumInstructions(); search++)
      {
        const Operation &nextOp = program->GetInstruction((size_t)search);

        // track nested switch statements to ensure we don't accidentally pick the case from a
        // different switch
        if(nextOp.operation == OPCODE_SWITCH)
        {
          depth++;
        }
        else if(depth > 0 && nextOp.operation == OPCODE_ENDSWITCH)
        {
          depth--;
        }
        else if(depth == 0)
        {
          // note the default: location as jumpLocation if we haven't found a matching
          // case yet. If we find one later, this will be overridden
          if(nextOp.operation == OPCODE_DEFAULT)
            jumpLocation = search;

          // reached end of our switch statement
          if(nextOp.operation == OPCODE_ENDSWITCH)
            break;

          if(nextOp.operation == OPCODE_CASE)
          {
            uint32_t caseValue = GetSrc(nextOp.operands[0], nextOp).value.u32v[0];

            // comparison is defined to be bitwise
            if(caseValue == switchValue)
            {
              // we've found our case, break out
              jumpLocation = search;
              break;
            }
          }
        }
      }

      // jumpLocation points to the case we're taking, either a matching case or default

      if(jumpLocation == 0)
      {
        RDCERR("Didn't find matching case or default: for switch(%u)!", switchValue);
      }
      else
      {
        // skip straight past any case or default labels as we don't want to step to them, we want
        // next instruction to point at the next executable instruction (which might be a break if
        // we're doing nothing)
        for(; jumpLocation < (uint32_t)program->GetNumInstructions(); jumpLocation++)
        {
          const Operation &nextOp = program->GetInstruction(jumpLocation);

          if(nextOp.operation != OPCODE_CASE && nextOp.operation != OPCODE_DEFAULT)
            break;
        }

        nextInstruction = jumpLocation;
      }

      break;
    }
    case OPCODE_CASE:
    case OPCODE_DEFAULT:
    case OPCODE_LOOP:
    case OPCODE_ENDSWITCH:
    case OPCODE_ENDIF:
      // do nothing. Basically just an anonymous label that is used elsewhere
      // (IF/ELSE/SWITCH/ENDLOOP/BREAK)
      break;
    case OPCODE_CONTINUE:
    case OPCODE_CONTINUEC:
    case OPCODE_ENDLOOP:
    {
      int depth = 0;

      int32_t test = op.operation == OPCODE_CONTINUEC ? GetSrc(op.operands[0], op).value.s32v[0] : 0;

      if(op.operation == OPCODE_CONTINUE || op.operation == OPCODE_CONTINUEC)
        depth = 1;

      if((test == 0 && !op.nonzero()) || (test != 0 && op.nonzero()) ||
         op.operation == OPCODE_CONTINUE || op.operation == OPCODE_ENDLOOP)
      {
        // skip back one to the endloop that we're processing
        nextInstruction--;

        for(; nextInstruction >= 0; nextInstruction--)
        {
          if(program->GetInstruction(nextInstruction).operation == OPCODE_ENDLOOP)
            depth++;
          if(program->GetInstruction(nextInstruction).operation == OPCODE_LOOP)
            depth--;

          if(depth == 0)
          {
            break;
          }
        }

        RDCASSERT(nextInstruction >= 0);
      }

      break;
    }
    case OPCODE_BREAK:
    case OPCODE_BREAKC:
    {
      int32_t test = op.operation == OPCODE_BREAKC ? GetSrc(op.operands[0], op).value.s32v[0] : 0;

      if((test == 0 && !op.nonzero()) || (test != 0 && op.nonzero()) || op.operation == OPCODE_BREAK)
      {
        // break out (jump to next endloop/endswitch)
        int depth = 1;

        for(; nextInstruction < (int)program->GetNumInstructions(); nextInstruction++)
        {
          if(program->GetInstruction(nextInstruction).operation == OPCODE_LOOP ||
             program->GetInstruction(nextInstruction).operation == OPCODE_SWITCH)
            depth++;
          if(program->GetInstruction(nextInstruction).operation == OPCODE_ENDLOOP ||
             program->GetInstruction(nextInstruction).operation == OPCODE_ENDSWITCH)
            depth--;

          if(depth == 0)
          {
            break;
          }
        }

        RDCASSERT(program->GetInstruction(nextInstruction).operation == OPCODE_ENDLOOP ||
                  program->GetInstruction(nextInstruction).operation == OPCODE_ENDSWITCH);

        // don't want to process the endloop and jump again!
        nextInstruction++;
      }

      break;
    }
    case OPCODE_IF:
    {
      int32_t test = GetSrc(op.operands[0], op).value.s32v[0];

      if((test == 0 && !op.nonzero()) || (test != 0 && op.nonzero()))
      {
        // nothing, we go into the if.
      }
      else
      {
        // jump to after the next matching else/endif
        int depth = 0;

        // skip back one to the if that we're processing
        nextInstruction--;

        for(; nextInstruction < (int)program->GetNumInstructions(); nextInstruction++)
        {
          if(program->GetInstruction(nextInstruction).operation == OPCODE_IF)
            depth++;
          // only step out on an else if it's the matching depth to our starting if (depth == 1)
          if(depth == 1 && program->GetInstruction(nextInstruction).operation == OPCODE_ELSE)
            depth--;
          if(program->GetInstruction(nextInstruction).operation == OPCODE_ENDIF)
            depth--;

          if(depth == 0)
          {
            break;
          }
        }

        RDCASSERT(program->GetInstruction(nextInstruction).operation == OPCODE_ELSE ||
                  program->GetInstruction(nextInstruction).operation == OPCODE_ENDIF);

        // step to next instruction after the else/endif (processing an else would skip that block)
        nextInstruction++;
      }

      break;
    }
    case OPCODE_ELSE:
    {
      // if we hit an else then we've just processed the if() bracket and need to break out (jump to
      // next endif)
      int depth = 1;

      for(; nextInstruction < (int)program->GetNumInstructions(); nextInstruction++)
      {
        if(program->GetInstruction(nextInstruction).operation == OPCODE_IF)
          depth++;
        if(program->GetInstruction(nextInstruction).operation == OPCODE_ENDIF)
          depth--;

        if(depth == 0)
        {
          break;
        }
      }

      RDCASSERT(program->GetInstruction(nextInstruction).operation == OPCODE_ENDIF);

      // step to next instruction after the else/endif (for consistency with handling in the if
      // block)
      nextInstruction++;

      break;
    }
    case OPCODE_DISCARD:
    {
      int32_t test = GetSrc(op.operands[0], op).value.s32v[0];

      if((test != 0 && !op.nonzero()) || (test == 0 && op.nonzero()))
      {
        // don't discard
        break;
      }

      // discarding.
      done = true;
      break;
    }
    case OPCODE_RET:
    case OPCODE_RETC:
    {
      int32_t test = op.operation == OPCODE_RETC ? GetSrc(op.operands[0], op).value.s32v[0] : 0;

      if((test == 0 && !op.nonzero()) || (test != 0 && op.nonzero()) || op.operation == OPCODE_RET)
      {
        // assumes not in a function call
        done = true;
      }
      break;
    }

    //////////////////////////////////////////////////////////////////////////
    // Vendor extensions
    //////////////////////////////////////////////////////////////////////////
    case OPCODE_AMD_U64_ATOMIC:
    case OPCODE_NV_U64_ATOMIC:
    {
      VendorAtomicOp atomicOp = (VendorAtomicOp)op.preciseValues;

      uint32_t resIndex = (uint32_t)op.operands[2].indices[0].index;
      ShaderVariable dstAddress, compare, value;

      int param = 2;

      if(op.texelOffset[0] == 1)
      {
        // single operand for address - simple
        dstAddress = srcOpers[param++];
      }
      else if(op.texelOffset[0] == 2)
      {
        dstAddress = srcOpers[param++];
        dstAddress.value.u32v[1] = srcOpers[param++].value.u32v[0];
        dstAddress.value.u32v[2] = srcOpers[param++].value.u32v[2];
      }
      else
      {
        RDCERR("Unexpected parameter compression value %d ", op.texelOffset[0]);
        break;
      }

      if(atomicOp == ATOMIC_OP_CAS)
      {
        if(op.texelOffset[1] == 1)
        {
          compare = srcOpers[param++];
        }
        else if(op.texelOffset[1] == 2)
        {
          compare = srcOpers[param++];
          compare.value.u32v[1] = srcOpers[param++].value.u32v[0];
          compare.value.u32v[2] = srcOpers[param++].value.u32v[2];
        }
        else
        {
          RDCERR("Unexpected parameter compression value %d ", op.texelOffset[1]);
          break;
        }
      }

      if(op.texelOffset[2] == 1)
      {
        value = srcOpers[param++];
      }
      else if(op.texelOffset[2] == 2)
      {
        value = srcOpers[param++];
        value.value.u32v[1] = srcOpers[param++].value.u32v[0];
        value.value.u32v[2] = srcOpers[param++].value.u32v[2];
      }
      else
      {
        RDCERR("Unexpected parameter compression value %d ", op.texelOffset[2]);
        break;
      }

      BindingSlot slot = GetBindingSlotForIdentifier(*program, TYPE_UNORDERED_ACCESS_VIEW, resIndex);
      GlobalState::UAVIterator uav = global.uavs.find(slot);
      if(uav == global.uavs.end())
      {
        apiWrapper->FetchUAV(slot);
        uav = global.uavs.find(slot);
      }

      MarkResourceAccess(state, TYPE_UNORDERED_ACCESS_VIEW, slot);

      const uint32_t stride = sizeof(uint64_t);
      byte *data = &uav->second.data[0];

      RDCASSERT(data);

      if(data)
      {
        if(uav->second.tex)
        {
          data += dstAddress.value.u32v[0] * stride;
          data += dstAddress.value.u32v[1] * uav->second.rowPitch;
          data += dstAddress.value.u32v[2] * uav->second.depthPitch;
        }
        else
        {
          data += uav->second.firstElement * stride + dstAddress.value.u32v[0];
        }
      }

      if(data && data < uav->second.data.end() && !Finished())
      {
        ShaderVariable result(rdcstr(), 0U, 0U, 0U, 0U);

        uint64_t *data64 = (uint64_t *)data;

        result.value.u32v[0] = uint32_t(*data64);
        SetDst(state, op.operands[0], op, result);
        result.value.u32v[0] = uint32_t((*data64) >> 32U);
        SetDst(state, op.operands[1], op, result);

        uint64_t compare64 = compare.value.u64v[0];
        uint64_t value64 = value.value.u64v[0];

        switch(atomicOp)
        {
          case ATOMIC_OP_NONE: break;
          case ATOMIC_OP_AND: *data64 = *data64 & value64; break;
          case ATOMIC_OP_OR: *data64 = *data64 | value64; break;
          case ATOMIC_OP_XOR: *data64 = *data64 ^ value64; break;
          case ATOMIC_OP_ADD: *data64 = *data64 + value64; break;
          case ATOMIC_OP_MAX: *data64 = RDCMAX(*data64, value64); break;
          case ATOMIC_OP_MIN: *data64 = RDCMIN(*data64, value64); break;
          case ATOMIC_OP_SWAP: *data64 = value64; break;
          case ATOMIC_OP_CAS:
            if(*data64 == compare64)
              *data64 = value64;
            break;
        }
      }
      break;
    }

    //////////////////////////////////////////////////////////////////////////
    //
    //////////////////////////////////////////////////////////////////////////
    default:
    {
      RDCERR("Unsupported operation %d in assembly debugging", op.operation);
      break;
    }
  }
}