void AMDGPURegisterBankInfo::applyMappingImpl()

in llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp [2027:3146]


void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_PHI: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(1))
      break;

    const LLT S32 = LLT::scalar(32);
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VCCRegBank) {
      applyDefaultMapping(OpdMapper);
      // The standard handling only considers the result register bank for
      // phis. For VCC, blindly inserting a copy when the phi is lowered will
      // produce an invalid copy. We can only copy with some kind of compare to
      // get a vector boolean result. Insert a register bank copy that will be
      // correctly lowered to a compare.
      MachineIRBuilder B(*MI.getParent()->getParent());

      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        Register SrcReg = MI.getOperand(I).getReg();
        const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

        if (SrcBank != &AMDGPU::VCCRegBank) {
          MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
          B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());

          auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
          MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
          MI.getOperand(I).setReg(Copy.getReg(0));
        }
      }

      return;
    }

    // Phi handling is strange and only considers the bank of the destination.
    substituteSimpleCopyRegs(OpdMapper, 0);

    // Promote SGPR/VGPR booleans to s32
    MachineFunction *MF = MI.getParent()->getParent();
    ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
    MachineIRBuilder B(MI, ApplyBank);
    LegalizerHelper Helper(*MF, ApplyBank, B);

    if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");

    return;
  }
  case AMDGPU::G_ICMP:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE: {
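    // G_ICMP defines its boolean result in operand 0; the add/sub with
    // carry/borrow opcodes define it in operand 1.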
    unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
    Register DstReg = MI.getOperand(BoolDstOp).getReg();

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank != &AMDGPU::SGPRRegBank)
      break;

    const bool HasCarryIn = MI.getNumOperands() == 5;

    // If this is a scalar boolean result (compare or carry/borrow out), promote
    // it to s32, as the selection will end up using a copy to a 32-bit vreg.
    const LLT S32 = LLT::scalar(32);
    Register NewDstReg = MRI.createGenericVirtualRegister(S32);
    MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
    MI.getOperand(BoolDstOp).setReg(NewDstReg);
    MachineIRBuilder B(MI);

    if (HasCarryIn) {
      Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
      B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
      MI.getOperand(4).setReg(NewSrcReg);
    }

    MachineBasicBlock *MBB = MI.getParent();
    B.setInsertPt(*MBB, std::next(MI.getIterator()));

    // If we had a constrained VCC result register, a copy was inserted to VCC
    // from SGPR.
    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
    if (DefRegs.empty())
      DefRegs.push_back(DstReg);
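    // Truncate the promoted s32 result back to an s1 value for the original
    // users.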
    B.buildTrunc(DefRegs[0], NewDstReg);
    return;
  }
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
    if (CondRegs.empty())
      CondRegs.push_back(MI.getOperand(1).getReg());
    else {
      assert(CondRegs.size() == 1);
    }

    const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
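    // Promote an SGPR boolean condition to s32; a VCC (wave mask) condition is
    // used as-is.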
    if (CondBank == &AMDGPU::SGPRRegBank) {
      MachineIRBuilder B(MI);
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(1).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondRegs[0]);
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    MachineIRBuilder B(MI);
    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_BRCOND: {
    Register CondReg = MI.getOperand(0).getReg();
    // FIXME: Should use legalizer helper, but should change bool ext type.
    const RegisterBank *CondBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    if (CondBank == &AMDGPU::SGPRRegBank) {
      MachineIRBuilder B(MI);
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(0).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondReg);
      return;
    }

    break;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and/or/xor is only available on the SALU, so split into 2 32-bit
    // ops if there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    if (DstTy.getSizeInBits() == 1) {
      const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
      if (DstBank == &AMDGPU::VCCRegBank)
        break;

      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
      MachineIRBuilder B(MI, ApplyBank);
      LegalizerHelper Helper(*MF, ApplyBank, B);

      if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
          LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");
      return;
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
    B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ABS: {
    Register SrcReg = MI.getOperand(1).getReg();
    const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);

    // There is no VALU abs instruction so we need to replace it with a sub and
    // max combination.
    if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
      MachineIRBuilder B(MI, Apply);
      LegalizerHelper Helper(*MF, Apply, B);

      if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
        llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
      return;
    }
    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    // Packed 16-bit operations need to be scalarized and promoted.
    if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
      break;

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    const LLT S32 = LLT::scalar(32);
    MachineBasicBlock *MBB = MI.getParent();
    MachineFunction *MF = MBB->getParent();
    ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, ApplySALU);

    if (DstTy.isVector()) {
      Register WideSrc0Lo, WideSrc0Hi;
      Register WideSrc1Lo, WideSrc1Hi;

      unsigned ExtendOp = getExtendOp(MI.getOpcode());
      std::tie(WideSrc0Lo, WideSrc0Hi)
        = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
      std::tie(WideSrc1Lo, WideSrc1Hi)
        = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
      auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
      auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
      MI.eraseFromParent();
    } else {
      LegalizerHelper Helper(*MF, ApplySALU, B);

      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");

      // FIXME: s16 shift amounts should be legal.
      if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
          Opc == AMDGPU::G_ASHR) {
        B.setInsertPt(*MBB, MI.getIterator());
        if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
    }

    return;
  }
  case AMDGPU::G_SEXT_INREG: {
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    if (SrcRegs.empty())
      break; // Nothing to repair

    const LLT S32 = LLT::scalar(32);
    MachineIRBuilder B(MI);
    ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
    GISelObserverWrapper Observer(&O);
    B.setChangeObserver(Observer);

    // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
    // we would need to further expand, and doesn't let us directly set the
    // result registers.
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    int Amt = MI.getOperand(2).getImm();
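    // The immediate is the width in bits of the value being sign-extended.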
    if (Amt <= 32) {
      if (Amt == 32) {
        // The low bits are unchanged.
        B.buildCopy(DstRegs[0], SrcRegs[0]);
      } else {
        // Extend in the low bits and propagate the sign bit to the high half.
        B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
      }

      B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
    } else {
      // The low bits are unchanged, and extend in the high bits.
      B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
    }

    Register DstReg = MI.getOperand(0).getReg();
    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BITREVERSE: {
    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);

    MachineFunction &MF = B.getMF();
    LegalizerHelper Helper(MF, ApplyVALU, B);

    if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("narrowScalar should have succeeded");
    return;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    // We can narrow this more efficiently than Helper can by using ffbh/ffbl
    // which return -1 when the input is zero:
    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
    // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
    // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
                          ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
                          : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
                                ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
                                : Opc;
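    // Search the high half first for ffbh (find first high bit) and the low
    // half first for ffbl (find first low bit).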
    unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
    auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
    auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
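    // The second half's result is biased by 32. The *_ZERO_UNDEF forms can use
    // a plain add; the FFBH/FFBL forms must keep a -1 (no bits set) result as
    // -1, so they use a saturating add.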
    unsigned AddOpc =
        Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
            ? AMDGPU::G_ADD
            : AMDGPU::G_UADDSAT;
    Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
    Register DstReg = MI.getOperand(0).getReg();
    B.buildUMin(DstReg, X, Y);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_ANYEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    const bool Signed = Opc == AMDGPU::G_SEXT;

    assert(empty(OpdMapper.getVRegs(1)));

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);
      } else if (Opc == AMDGPU::G_ZEXT) {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
      } else {
        B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
      }

      extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    if (SrcTy != LLT::scalar(1))
      return;

    // It is not legal to have a legalization artifact with a VCC source. Rather
    // than introducing a copy, directly emit the select that the copy would
    // have been selected to.
    if (SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SGPRRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32) {
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
      } else if (DstSize < 32) {
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::fixed_vector(2, 16))
      break;

    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
    substituteSimpleCopyRegs(OpdMapper, 1);
    substituteSimpleCopyRegs(OpdMapper, 2);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break; // Can use S_PACK_* instructions.
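    // For a VGPR result, combine the two 16-bit halves manually with a shift
    // and or, then bitcast back to <2 x s16>.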

    MachineIRBuilder B(MI);

    Register Lo = MI.getOperand(1).getReg();
    Register Hi = MI.getOperand(2).getReg();
    const LLT S32 = LLT::scalar(32);

    const RegisterBank *BankLo =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *BankHi =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register ZextLo;
    Register ShiftHi;

    if (Opc == AMDGPU::G_BUILD_VECTOR) {
      ZextLo = B.buildZExt(S32, Lo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);

      Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
      MRI.setRegBank(ZextHi, *BankHi);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);
    } else {
      Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
      MRI.setRegBank(MaskLo, *BankLo);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);

      ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);
    }

    auto Or = B.buildOr(S32, ZextLo, ShiftHi);
    MRI.setRegBank(Or.getReg(0), *DstBank);

    B.buildBitcast(DstReg, Or);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());

    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();

    const LLT S32 = LLT::scalar(32);
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    MachineIRBuilder B(MI);

    const ValueMapping &DstMapping
      = OpdMapper.getInstrMapping().getOperandMapping(0);
    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *IdxBank =
        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());

    // See if the index is an add of a constant. If so, only the base register
    // needs to go through the waterfall loop and the constant add can be
    // re-inserted inside it, essentially reassociating the add of the constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < SrcTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(2).setReg(BaseIdxReg);

    // If this is a VGPR result only because the index was a VGPR result, the
    // actual indexing will be done on the SGPR source vector, which will
    // produce a scalar result. We need to copy to the VGPR result inside the
    // waterfall loop.
    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
                                SrcBank == &AMDGPU::SGPRRegBank;
    if (DstRegs.empty()) {
      applyDefaultMapping(OpdMapper);

      executeInWaterfallLoop(MI, MRI, { 2 });

      if (NeedCopyToVGPR) {
        // We don't want a phi for this temporary reg.
        Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
        MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
        MI.getOperand(0).setReg(TmpReg);
        B.setInsertPt(*MI.getParent(), ++MI.getIterator());

        // Use a v_mov_b32 here to make the exec dependency explicit.
        buildVCopy(B, DstReg, TmpReg);
      }

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop)
        reinsertVectorIndexAdd(B, MI, 2, ConstOffset);

      return;
    }

    assert(DstTy.getSizeInBits() == 64);
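    // A 64-bit element is extracted by bitcasting the source to a vector with
    // twice as many 32-bit elements and extracting both 32-bit halves.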

    LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);

    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    MachineBasicBlock::iterator MII = MI.getIterator();

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MII, &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
    auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);

    MRI.setRegBank(DstReg, *DstBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
      MI.eraseFromParent();
      return;
    }

    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    if (NeedCopyToVGPR) {
      MachineBasicBlock *LoopBB = Extract1->getParent();
      Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
      Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
      MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);

      Extract0->getOperand(0).setReg(TmpReg0);
      Extract1->getOperand(0).setReg(TmpReg1);

      B.setInsertPt(*LoopBB, ++Extract1->getIterator());

      buildVCopy(B, DstRegs[0], TmpReg0);
      buildVCopy(B, DstRegs[1], TmpReg1);
    }

    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));

    Register DstReg = MI.getOperand(0).getReg();
    LLT VecTy = MRI.getType(DstReg);

    assert(OpdMapper.getVRegs(0).empty());
    assert(OpdMapper.getVRegs(3).empty());

    if (substituteSimpleCopyRegs(OpdMapper, 1))
      MRI.setType(MI.getOperand(1).getReg(), VecTy);

    if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    const RegisterBank *IdxBank =
      OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

    Register SrcReg = MI.getOperand(1).getReg();
    Register InsReg = MI.getOperand(2).getReg();
    LLT InsTy = MRI.getType(InsReg);
    (void)InsTy;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());

    // See if the index is an add of a constant. If so, only the base register
    // needs to go through the waterfall loop and the constant add can be
    // re-inserted inside it, essentially reassociating the add of the constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
      ConstOffset > 0 &&
      ConstOffset < VecTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(3).setReg(BaseIdxReg);


    if (InsRegs.empty()) {
      executeInWaterfallLoop(MI, MRI, { 3 });

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop) {
        MachineIRBuilder B(MI);
        reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
      }

      return;
    }


    assert(InsTy.getSizeInBits() == 64);
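    // A 64-bit element is inserted by bitcasting to a vector with twice as
    // many 32-bit elements and inserting the two 32-bit halves.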

    const LLT S32 = LLT::scalar(32);
    LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);

    MachineIRBuilder B(MI);
    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);

    const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *InsSrcBank =
      OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    MRI.setRegBank(InsReg, *InsSrcBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(InsLo.getReg(0), *DstBank);
    MRI.setRegBank(InsHi.getReg(0), *DstBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);


    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
      B.setInsertPt(B.getMBB(), MI);
      B.buildBitcast(DstReg, InsHi);
      MI.eraseFromParent();
      return;
    }

    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    // Figure out the point after the waterfall loop before mangling the control
    // flow.
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    // The insertion point is now right after the original instruction.
    //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saves an extra phi we don't need inside the loop.
    B.buildBitcast(DstReg, InsHi);

    // Re-insert the constant offset add inside the waterfall loop.
    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
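    // Operand 1 is the resource descriptor and operand 4 is the scalar offset;
    // both must be uniform, so legalize divergent values with a waterfall loop.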
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {1, 4});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
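    // For the buffer atomics, operand 2 is the resource descriptor and operand
    // 5 is the scalar offset.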
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
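    // The extra compare operand shifts the resource descriptor to operand 3
    // and the scalar offset to operand 6.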
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {3, 6});
    return;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    applyMappingSBufferLoad(OpdMapper);
    return;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(3).empty());

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(2).empty());
      assert(OpdMapper.getVRegs(3).empty());

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      applyDefaultMapping(OpdMapper);

      // Readlane for m0 value, which is always the last operand.
      // FIXME: Should this be a waterfall loop instead?
      constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Doing a waterfall loop over these wouldn't make any sense.
      substituteSimpleCopyRegs(OpdMapper, 2);
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 4);
      constrainOpWithReadfirstlane(MI, MRI, 5);
      return;
    }
    case Intrinsic::amdgcn_sbfe:
      applyMappingBFE(OpdMapper, true);
      return;
    case Intrinsic::amdgcn_ubfe:
      applyMappingBFE(OpdMapper, false);
      return;
    case Intrinsic::amdgcn_ballot:
      // Use default handling and insert copy to vcc source.
      break;
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin
      = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
    return;
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { N });
    return;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(OpdMapper.getVRegs(0).empty());
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
      substituteSimpleCopyRegs(OpdMapper, 1);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
      constrainOpWithReadfirstlane(MI, MRI, 1); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_setreg: {
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage) {
          applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
          return;
        }
      }

      break;
    }
    }
    break;
  }
  case AMDGPU::G_SI_CALL: {
    // Use a set to avoid extra readfirstlanes in the case where multiple
    // operands are the same register.
    SmallSet<Register, 4> SGPROperandRegs;
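    // Operand 1 is the callee; only a divergent (VGPR) callee needs the
    // waterfall loop.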

    if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
      break;

    // Move all copies to physical SGPRs that are used by the call instruction
    // into the loop block. Search backwards from the call until the
    // ADJCALLSTACKUP to find these copies.
    unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
    unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;

    // Move all non-copies before the copies, so that a complete range can be
    // moved into the waterfall loop.
    SmallVector<MachineInstr *, 4> NonCopyInstrs;
    // Count of NonCopyInstrs found until the current LastCopy.
    unsigned NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator Start(&MI);
    MachineBasicBlock::iterator LastCopy = Start;
    MachineBasicBlock *MBB = MI.getParent();
    const SIMachineFunctionInfo *Info =
        MBB->getParent()->getInfo<SIMachineFunctionInfo>();
    while (Start->getOpcode() != FrameSetupOpcode) {
      --Start;
      bool IsCopy = false;
      if (Start->getOpcode() == AMDGPU::COPY) {
        auto &Dst = Start->getOperand(0);
        if (Dst.isReg()) {
          Register Reg = Dst.getReg();
          if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
            IsCopy = true;
          } else {
            // Also move the copy from the scratch rsrc descriptor into the loop
            // to allow it to be optimized away.
            auto &Src = Start->getOperand(1);
            if (Src.isReg()) {
              Reg = Src.getReg();
              IsCopy = Info->getScratchRSrcReg() == Reg;
            }
          }
        }
      }

      if (IsCopy) {
        LastCopy = Start;
        NonCopyInstrsLen = NonCopyInstrs.size();
      } else {
        NonCopyInstrs.push_back(&*Start);
      }
    }
    NonCopyInstrs.resize(NonCopyInstrsLen);

    for (auto *NonCopy : reverse(NonCopyInstrs)) {
      MBB->splice(LastCopy, MBB, NonCopy->getIterator());
    }
    Start = LastCopy;

    // Do the same for copies after the loop
    NonCopyInstrs.clear();
    NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator End(&MI);
    LastCopy = End;
    while (End->getOpcode() != FrameDestroyOpcode) {
      ++End;
      bool IsCopy = false;
      if (End->getOpcode() == AMDGPU::COPY) {
        auto &Src = End->getOperand(1);
        if (Src.isReg()) {
          Register Reg = Src.getReg();
          IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
        }
      }

      if (IsCopy) {
        LastCopy = End;
        NonCopyInstrsLen = NonCopyInstrs.size();
      } else {
        NonCopyInstrs.push_back(&*End);
      }
    }
    NonCopyInstrs.resize(NonCopyInstrsLen);

    End = LastCopy;
    ++LastCopy;
    for (auto *NonCopy : reverse(NonCopyInstrs)) {
      MBB->splice(LastCopy, MBB, NonCopy->getIterator());
    }
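    // Step End past the last copy so the range handed to the waterfall loop
    // includes it.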

    ++End;
    MachineIRBuilder B(*Start);
    executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD: {
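    // applyMappingLoad handles the loads that need splitting or widening;
    // otherwise fall through to the default mapping.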
    if (applyMappingLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC:
    applyMappingDynStackAlloc(MI, OpdMapper, MRI);
    return;
  case AMDGPU::G_SBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ true);
    return;
  case AMDGPU::G_UBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ false);
    return;
  default:
    break;
  }

  return applyDefaultMapping(OpdMapper);
}