in llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp [2027:3146]
/// Apply the register bank mapping selected for the instruction wrapped by
/// \p OpdMapper.
///
/// Opcodes that need custom repair code (boolean promotion, splitting 64-bit
/// operations into 32-bit halves, waterfall loops for divergent operands that
/// must be uniform, readfirstlane constraints, ...) are handled by the switch
/// below. Every case either finishes the rewrite itself and returns, or
/// breaks out of the switch so the instruction is handled by
/// applyDefaultMapping().
void AMDGPURegisterBankInfo::applyMappingImpl(
    const OperandsMapper &OpdMapper) const {
  MachineInstr &MI = OpdMapper.getMI();
  unsigned Opc = MI.getOpcode();
  MachineRegisterInfo &MRI = OpdMapper.getMRI();
  switch (Opc) {
  case AMDGPU::G_PHI: {
    // Only s1 phis need special treatment.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::scalar(1))
      break;

    const LLT S32 = LLT::scalar(32);
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VCCRegBank) {
      applyDefaultMapping(OpdMapper);
      // The standard handling only considers the result register bank for
      // phis. For VCC, blindly inserting a copy when the phi is lowered will
      // produce an invalid copy. We can only copy with some kind of compare to
      // get a vector boolean result. Insert a register bank copy that will be
      // correctly lowered to a compare.
      MachineIRBuilder B(*MI.getParent()->getParent());

      // Phi operands come in (value, predecessor block) pairs starting at
      // operand 1.
      for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
        Register SrcReg = MI.getOperand(I).getReg();
        const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);

        if (SrcBank != &AMDGPU::VCCRegBank) {
          // Place the copy at the end of the incoming block, before its
          // terminators.
          MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
          B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());

          auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
          MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
          MI.getOperand(I).setReg(Copy.getReg(0));
        }
      }

      return;
    }

    // Phi handling is strange and only considers the bank of the destination.
    substituteSimpleCopyRegs(OpdMapper, 0);

    // Promote SGPR/VGPR booleans to s32
    MachineFunction *MF = MI.getParent()->getParent();
    ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
    MachineIRBuilder B(MI, ApplyBank);
    LegalizerHelper Helper(*MF, ApplyBank, B);

    if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("widen scalar should have succeeded");

    return;
  }
  case AMDGPU::G_ICMP:
  case AMDGPU::G_UADDO:
  case AMDGPU::G_USUBO:
  case AMDGPU::G_UADDE:
  case AMDGPU::G_SADDE:
  case AMDGPU::G_USUBE:
  case AMDGPU::G_SSUBE: {
    // The boolean result is operand 0 for a compare and operand 1 (the carry
    // out) for the add/sub-with-overflow family.
    unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
    Register DstReg = MI.getOperand(BoolDstOp).getReg();

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank != &AMDGPU::SGPRRegBank)
      break;

    // The five-operand forms carry a boolean carry-in as operand 4.
    const bool HasCarryIn = MI.getNumOperands() == 5;

    // If this is a scalar compare, promote the result to s32, as the selection
    // will end up using a copy to a 32-bit vreg.
    const LLT S32 = LLT::scalar(32);
    Register NewDstReg = MRI.createGenericVirtualRegister(S32);
    MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
    MI.getOperand(BoolDstOp).setReg(NewDstReg);
    MachineIRBuilder B(MI);

    if (HasCarryIn) {
      // Widen the carry-in to s32 as well, right before the instruction.
      Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
      B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
      MI.getOperand(4).setReg(NewSrcReg);
    }

    // The truncate back to the original boolean goes after the instruction.
    MachineBasicBlock *MBB = MI.getParent();
    B.setInsertPt(*MBB, std::next(MI.getIterator()));

    // If we had a constrained VCC result register, a copy was inserted to VCC
    // from SGPR.
    SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
    if (DefRegs.empty())
      DefRegs.push_back(DstReg);
    B.buildTrunc(DefRegs[0], NewDstReg);
    return;
  }
  case AMDGPU::G_SELECT: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
    if (CondRegs.empty())
      CondRegs.push_back(MI.getOperand(1).getReg());
    else {
      assert(CondRegs.size() == 1);
    }

    // An SGPR condition is zero-extended to s32 in front of the select.
    const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
    if (CondBank == &AMDGPU::SGPRRegBank) {
      MachineIRBuilder B(MI);
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(1).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondRegs[0]);
    }

    // Only 64-bit selects are split below.
    if (DstTy.getSizeInBits() != 64)
      break;

    MachineIRBuilder B(MI);
    LLT HalfTy = getHalfSizedType(DstTy);

    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
    SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src1Regs.empty() && Src2Regs.empty());
      break;
    }

    // Sources the generic code did not split are split here; pre-split
    // registers only need their type set to the half type.
    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy,
                                MI.getOperand(2).getReg());
    else {
      setRegsToType(MRI, Src1Regs, HalfTy);
    }

    if (Src2Regs.empty())
      split64BitValueForMapping(B, Src2Regs, HalfTy,
                                MI.getOperand(3).getReg());
    else
      setRegsToType(MRI, Src2Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    // One select per half, both using the same condition.
    B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
    B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_BRCOND: {
    Register CondReg = MI.getOperand(0).getReg();
    // FIXME: Should use legalizer helper, but should change bool ext type.
    const RegisterBank *CondBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;

    // An SGPR branch condition is zero-extended to s32.
    if (CondBank == &AMDGPU::SGPRRegBank) {
      MachineIRBuilder B(MI);
      const LLT S32 = LLT::scalar(32);
      Register NewCondReg = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);

      MI.getOperand(0).setReg(NewCondReg);
      B.buildZExt(NewCondReg, CondReg);
      return;
    }

    break;
  }
  case AMDGPU::G_AND:
  case AMDGPU::G_OR:
  case AMDGPU::G_XOR: {
    // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
    // there is a VGPR input.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    if (DstTy.getSizeInBits() == 1) {
      const RegisterBank *DstBank =
          OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
      if (DstBank == &AMDGPU::VCCRegBank)
        break;

      // Non-VCC s1 logic ops are widened to s32.
      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
      MachineIRBuilder B(MI, ApplyBank);
      LegalizerHelper Helper(*MF, ApplyBank, B);

      if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
          LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");
      return;
    }

    if (DstTy.getSizeInBits() != 64)
      break;

    LLT HalfTy = getHalfSizedType(DstTy);
    SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
    SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
    SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));

    // All inputs are SGPRs, nothing special to do.
    if (DefRegs.empty()) {
      assert(Src0Regs.empty() && Src1Regs.empty());
      break;
    }

    assert(DefRegs.size() == 2);
    assert(Src0Regs.size() == Src1Regs.size() &&
           (Src0Regs.empty() || Src0Regs.size() == 2));

    // Depending on where the source registers came from, the generic code may
    // have decided to split the inputs already or not. If not, we still need to
    // extract the values.
    MachineIRBuilder B(MI);

    if (Src0Regs.empty())
      split64BitValueForMapping(B, Src0Regs, HalfTy,
                                MI.getOperand(1).getReg());
    else
      setRegsToType(MRI, Src0Regs, HalfTy);

    if (Src1Regs.empty())
      split64BitValueForMapping(B, Src1Regs, HalfTy,
                                MI.getOperand(2).getReg());
    else
      setRegsToType(MRI, Src1Regs, HalfTy);

    setRegsToType(MRI, DefRegs, HalfTy);

    // Emit the same opcode once per 32-bit half.
    B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
    B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});

    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_ABS: {
    Register SrcReg = MI.getOperand(1).getReg();
    const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);

    // There is no VALU abs instruction so we need to replace it with a sub and
    // max combination.
    if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
      MachineFunction *MF = MI.getParent()->getParent();
      ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
      MachineIRBuilder B(MI, Apply);
      LegalizerHelper Helper(*MF, Apply, B);

      if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
        llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
      return;
    }
    // Non-VGPR abs shares the 16-bit promotion handling below.
    LLVM_FALLTHROUGH;
  }
  case AMDGPU::G_ADD:
  case AMDGPU::G_SUB:
  case AMDGPU::G_MUL:
  case AMDGPU::G_SHL:
  case AMDGPU::G_LSHR:
  case AMDGPU::G_ASHR:
  case AMDGPU::G_SMIN:
  case AMDGPU::G_SMAX:
  case AMDGPU::G_UMIN:
  case AMDGPU::G_UMAX: {
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);

    // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
    // Packed 16-bit operations need to be scalarized and promoted.
    if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
      break;

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::VGPRRegBank)
      break;

    const LLT S32 = LLT::scalar(32);
    MachineBasicBlock *MBB = MI.getParent();
    MachineFunction *MF = MBB->getParent();
    ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, ApplySALU);

    if (DstTy.isVector()) {
      // v2s16: unpack both sources into s32 lo/hi pieces, apply the operation
      // to each piece at 32 bits, and repack the truncated results.
      Register WideSrc0Lo, WideSrc0Hi;
      Register WideSrc1Lo, WideSrc1Hi;

      unsigned ExtendOp = getExtendOp(MI.getOpcode());
      std::tie(WideSrc0Lo, WideSrc0Hi)
          = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
      std::tie(WideSrc1Lo, WideSrc1Hi)
          = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
      auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
      auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
      B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
      MI.eraseFromParent();
    } else {
      LegalizerHelper Helper(*MF, ApplySALU, B);

      if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
        llvm_unreachable("widen scalar should have succeeded");

      // FIXME: s16 shift amounts should be legal.
      if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
          Opc == AMDGPU::G_ASHR) {
        B.setInsertPt(*MBB, MI.getIterator());
        if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
          llvm_unreachable("widen scalar should have succeeded");
      }
    }

    return;
  }
  case AMDGPU::G_SEXT_INREG: {
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    if (SrcRegs.empty())
      break; // Nothing to repair

    const LLT S32 = LLT::scalar(32);
    MachineIRBuilder B(MI);
    ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
    GISelObserverWrapper Observer(&O);
    B.setChangeObserver(Observer);

    // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
    // we would need to further expand, and doesn't let us directly set the
    // result registers.
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    int Amt = MI.getOperand(2).getImm();
    if (Amt <= 32) {
      if (Amt == 32) {
        // The low bits are unchanged.
        B.buildCopy(DstRegs[0], SrcRegs[0]);
      } else {
        // Extend in the low bits and propagate the sign bit to the high half.
        B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
      }

      B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
    } else {
      // The low bits are unchanged, and extend in the high bits. The sign bit
      // (bit Amt - 1 of the 64-bit value) lives in the high source half, so
      // the extension must read SrcRegs[1], not the low half.
      B.buildCopy(DstRegs[0], SrcRegs[0]);
      B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
    }

    Register DstReg = MI.getOperand(0).getReg();
    MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_CTPOP:
  case AMDGPU::G_BITREVERSE: {
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    // Non-SGPR, non-s32 source: narrow the source operand to s32 pieces.
    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);

    MachineFunction &MF = B.getMF();
    LegalizerHelper Helper(MF, ApplyVALU, B);

    if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
      llvm_unreachable("narrowScalar should have succeeded");
    return;
  }
  case AMDGPU::G_AMDGPU_FFBH_U32:
  case AMDGPU::G_AMDGPU_FFBL_B32:
  case AMDGPU::G_CTLZ_ZERO_UNDEF:
  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break;

    Register SrcReg = MI.getOperand(1).getReg();
    const LLT S32 = LLT::scalar(32);
    LLT Ty = MRI.getType(SrcReg);
    if (Ty == S32)
      break;

    // We can narrow this more efficiently than Helper can by using ffbh/ffbl
    // which return -1 when the input is zero:
    // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
    // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
    // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
    // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
    ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
    MachineIRBuilder B(MI, ApplyVALU);
    SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
    unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
                          ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
                          : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
                                ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
                                : Opc;
    // X is taken from the half whose count needs no +32 adjustment: the high
    // half (index 1) for ffbh, the low half (index 0) for ffbl.
    unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
    auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
    auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
    // The *_ZERO_UNDEF opcodes use a plain add; the ffbh/ffbl opcodes use a
    // saturating add.
    unsigned AddOpc =
        Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
            ? AMDGPU::G_ADD
            : AMDGPU::G_UADDSAT;
    Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
    Register DstReg = MI.getOperand(0).getReg();
    B.buildUMin(DstReg, X, Y);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_SEXT:
  case AMDGPU::G_ZEXT:
  case AMDGPU::G_ANYEXT: {
    Register SrcReg = MI.getOperand(1).getReg();
    LLT SrcTy = MRI.getType(SrcReg);
    const bool Signed = Opc == AMDGPU::G_SEXT;

    assert(empty(OpdMapper.getVRegs(1)));

    MachineIRBuilder B(MI);
    const RegisterBank *SrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;

    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy.isScalar() &&
        SrcBank != &AMDGPU::SGPRRegBank &&
        SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that round to s64 when irregular
        // breakdowns supported.
        DstTy.getSizeInBits() == 64 &&
        SrcTy.getSizeInBits() <= 32) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      // Extend to 32-bit, and then extend the low half.
      if (Signed) {
        // TODO: Should really be buildSExtOrCopy
        B.buildSExtOrTrunc(DefRegs[0], SrcReg);
      } else if (Opc == AMDGPU::G_ZEXT) {
        B.buildZExtOrTrunc(DefRegs[0], SrcReg);
      } else {
        B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
      }

      extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
      MRI.setRegBank(DstReg, *SrcBank);
      MI.eraseFromParent();
      return;
    }

    // Everything below only concerns s1 sources.
    if (SrcTy != LLT::scalar(1))
      return;

    // It is not legal to have a legalization artifact with a VCC source. Rather
    // than introducing a copy, insert the select we would have to select the
    // copy to.
    if (SrcBank == &AMDGPU::VCCRegBank) {
      SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));

      const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;

      unsigned DstSize = DstTy.getSizeInBits();
      // 64-bit select is SGPR only
      const bool UseSel64 = DstSize > 32 &&
        SrcBank->getID() == AMDGPU::SGPRRegBankID;

      // TODO: Should s16 select be legal?
      LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
      // The true value is all-ones for a sign extend and 1 otherwise.
      auto True = B.buildConstant(SelType, Signed ? -1 : 1);
      auto False = B.buildConstant(SelType, 0);

      MRI.setRegBank(True.getReg(0), *DstBank);
      MRI.setRegBank(False.getReg(0), *DstBank);
      MRI.setRegBank(DstReg, *DstBank);

      if (DstSize > 32) {
        // Select the low half, then derive the high half from it.
        B.buildSelect(DefRegs[0], SrcReg, True, False);
        extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
      } else if (DstSize < 32) {
        // Select at the select type and truncate to the narrower result.
        auto Sel = B.buildSelect(SelType, SrcReg, True, False);
        MRI.setRegBank(Sel.getReg(0), *DstBank);
        B.buildTrunc(DstReg, Sel);
      } else {
        B.buildSelect(DstReg, SrcReg, True, False);
      }

      MI.eraseFromParent();
      return;
    }

    break;
  }
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
    // Only v2s16 results are handled specially.
    Register DstReg = MI.getOperand(0).getReg();
    LLT DstTy = MRI.getType(DstReg);
    if (DstTy != LLT::fixed_vector(2, 16))
      break;

    assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
    substituteSimpleCopyRegs(OpdMapper, 1);
    substituteSimpleCopyRegs(OpdMapper, 2);

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    if (DstBank == &AMDGPU::SGPRRegBank)
      break; // Can use S_PACK_* instructions.

    MachineIRBuilder B(MI);

    Register Lo = MI.getOperand(1).getReg();
    Register Hi = MI.getOperand(2).getReg();
    const LLT S32 = LLT::scalar(32);

    const RegisterBank *BankLo =
        OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *BankHi =
        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    // Build the packed value as (zext-or-mask Lo) | (Hi << 16) in an s32 and
    // bitcast it to the vector result.
    Register ZextLo;
    Register ShiftHi;

    if (Opc == AMDGPU::G_BUILD_VECTOR) {
      ZextLo = B.buildZExt(S32, Lo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);

      Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
      MRI.setRegBank(ZextHi, *BankHi);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);
    } else {
      // G_BUILD_VECTOR_TRUNC: the low source is masked to its low 16 bits
      // instead of being extended.
      Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
      MRI.setRegBank(MaskLo, *BankLo);

      auto ShiftAmt = B.buildConstant(S32, 16);
      MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);

      ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
      MRI.setRegBank(ShiftHi, *BankHi);

      ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
      MRI.setRegBank(ZextLo, *BankLo);
    }

    auto Or = B.buildOr(S32, ZextLo, ShiftHi);
    MRI.setRegBank(Or.getReg(0), *DstBank);

    B.buildBitcast(DstReg, Or);
    MI.eraseFromParent();
    return;
  }
  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
    SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));

    assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());

    Register DstReg = MI.getOperand(0).getReg();
    Register SrcReg = MI.getOperand(1).getReg();

    const LLT S32 = LLT::scalar(32);
    LLT DstTy = MRI.getType(DstReg);
    LLT SrcTy = MRI.getType(SrcReg);

    if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    MachineIRBuilder B(MI);

    const ValueMapping &DstMapping
        = OpdMapper.getInstrMapping().getOperandMapping(0);
    const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *IdxBank =
        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < SrcTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(2).setReg(BaseIdxReg);

    // If this is a VGPR result only because the index was a VGPR result, the
    // actual indexing will be done on the SGPR source vector, which will
    // produce a scalar result. We need to copy to the VGPR result inside the
    // waterfall loop.
    const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
                                SrcBank == &AMDGPU::SGPRRegBank;
    if (DstRegs.empty()) {
      // The result was not split: keep the instruction and waterfall the
      // index operand.
      applyDefaultMapping(OpdMapper);

      executeInWaterfallLoop(MI, MRI, { 2 });

      if (NeedCopyToVGPR) {
        // We don't want a phi for this temporary reg.
        Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
        MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
        MI.getOperand(0).setReg(TmpReg);
        B.setInsertPt(*MI.getParent(), ++MI.getIterator());

        // Use a v_mov_b32 here to make the exec dependency explicit.
        buildVCopy(B, DstReg, TmpReg);
      }

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop)
        reinsertVectorIndexAdd(B, MI, 2, ConstOffset);

      return;
    }

    assert(DstTy.getSizeInBits() == 64);

    // View the source as a vector of twice as many 32-bit elements.
    LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);

    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    MachineBasicBlock::iterator MII = MI.getIterator();

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MII, &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
    auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);

    MRI.setRegBank(DstReg, *DstBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    // If no operand has to be waterfalled, the new instructions are final.
    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
      MI.eraseFromParent();
      return;
    }

    // Remove the original instruction to avoid potentially confusing the
    // waterfall loop logic.
    B.setInstr(*Span.begin());
    MI.eraseFromParent();
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    if (NeedCopyToVGPR) {
      // Redirect both extracts to SGPR temporaries and copy them into the
      // VGPR result halves right after the second extract.
      MachineBasicBlock *LoopBB = Extract1->getParent();
      Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
      Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
      MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
      MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);

      Extract0->getOperand(0).setReg(TmpReg0);
      Extract1->getOperand(0).setReg(TmpReg1);

      B.setInsertPt(*LoopBB, ++Extract1->getIterator());

      buildVCopy(B, DstRegs[0], TmpReg0);
      buildVCopy(B, DstRegs[1], TmpReg1);
    }

    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  case AMDGPU::G_INSERT_VECTOR_ELT: {
    SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));

    Register DstReg = MI.getOperand(0).getReg();
    LLT VecTy = MRI.getType(DstReg);

    assert(OpdMapper.getVRegs(0).empty());
    assert(OpdMapper.getVRegs(3).empty());

    if (substituteSimpleCopyRegs(OpdMapper, 1))
      MRI.setType(MI.getOperand(1).getReg(), VecTy);

    if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
      return;

    const RegisterBank *IdxBank =
        OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;

    Register SrcReg = MI.getOperand(1).getReg();
    Register InsReg = MI.getOperand(2).getReg();
    LLT InsTy = MRI.getType(InsReg);
    (void)InsTy;

    Register BaseIdxReg;
    unsigned ConstOffset;
    std::tie(BaseIdxReg, ConstOffset) =
        AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());

    // See if the index is an add of a constant which will be foldable by moving
    // the base register of the index later if this is going to be executed in a
    // waterfall loop. This is essentially to reassociate the add of a constant
    // with the readfirstlane.
    bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
                                   ConstOffset > 0 &&
                                   ConstOffset < VecTy.getNumElements();

    // Move the base register. We'll re-insert the add later.
    if (ShouldMoveIndexIntoLoop)
      MI.getOperand(3).setReg(BaseIdxReg);

    if (InsRegs.empty()) {
      // The inserted value was not split: keep the instruction and waterfall
      // the index operand.
      executeInWaterfallLoop(MI, MRI, { 3 });

      // Re-insert the constant offset add inside the waterfall loop.
      if (ShouldMoveIndexIntoLoop) {
        MachineIRBuilder B(MI);
        reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
      }

      return;
    }

    assert(InsTy.getSizeInBits() == 64);

    // View the vector as twice as many 32-bit elements and insert the two
    // halves of the value separately.
    const LLT S32 = LLT::scalar(32);
    LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);

    MachineIRBuilder B(MI);
    auto CastSrc = B.buildBitcast(Vec32, SrcReg);
    auto One = B.buildConstant(S32, 1);

    // Split the vector index into 32-bit pieces. Prepare to move all of the
    // new instructions into a waterfall loop if necessary.
    //
    // Don't put the bitcast or constant in the loop.
    MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());

    // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
    auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
    auto IdxHi = B.buildAdd(S32, IdxLo, One);

    auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
    auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);

    const RegisterBank *DstBank =
        OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
    const RegisterBank *SrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
    const RegisterBank *InsSrcBank =
        OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;

    MRI.setRegBank(InsReg, *InsSrcBank);
    MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
    MRI.setRegBank(InsLo.getReg(0), *DstBank);
    MRI.setRegBank(InsHi.getReg(0), *DstBank);
    MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
    MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);

    // If no operand has to be waterfalled, finish with the bitcast back to
    // the original vector type.
    SmallSet<Register, 4> OpsToWaterfall;
    if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
      B.setInsertPt(B.getMBB(), MI);
      B.buildBitcast(DstReg, InsHi);
      MI.eraseFromParent();
      return;
    }

    B.setInstr(*Span.begin());
    MI.eraseFromParent();

    // Figure out the point after the waterfall loop before mangling the control
    // flow.
    executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
                           OpsToWaterfall, MRI);

    // The insertion point is now right after the original instruction.
    //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saved an extra phi we don't need inside the loop.
    B.buildBitcast(DstReg, InsHi);

    // Re-insert the constant offset add inside the waterfall loop.
    if (ShouldMoveIndexIntoLoop)
      reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);

    return;
  }
  // Buffer loads and stores: default mapping, then waterfall operands 1 and 4.
  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
  case AMDGPU::G_AMDGPU_BUFFER_STORE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {1, 4});
    return;
  }
  // Buffer atomics: default mapping, then waterfall operands 2 and 5.
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {2, 5});
    return;
  }
  // Compare-and-swap has one more leading operand, so operands 3 and 6 are
  // waterfalled instead.
  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, {3, 6});
    return;
  }
  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
    applyMappingSBufferLoad(OpdMapper);
    return;
  }
  case AMDGPU::G_INTRINSIC: {
    switch (MI.getIntrinsicID()) {
    case Intrinsic::amdgcn_readlane: {
      substituteSimpleCopyRegs(OpdMapper, 2);

      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(3).empty());

      // Make sure the index is an SGPR. It doesn't make sense to run this in a
      // waterfall loop, so assume it's a uniform value.
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_writelane: {
      assert(OpdMapper.getVRegs(0).empty());
      assert(OpdMapper.getVRegs(2).empty());
      assert(OpdMapper.getVRegs(3).empty());

      substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
      constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
      constrainOpWithReadfirstlane(MI, MRI, 3); // Index
      return;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16: {
      applyDefaultMapping(OpdMapper);

      // Readlane for m0 value, which is always the last operand.
      // FIXME: Should this be a waterfall loop instead?
      constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
      return;
    }
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      // Doing a waterfall loop over these wouldn't make any sense.
      substituteSimpleCopyRegs(OpdMapper, 2);
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 4);
      constrainOpWithReadfirstlane(MI, MRI, 5);
      return;
    }
    case Intrinsic::amdgcn_sbfe:
      applyMappingBFE(OpdMapper, true);
      return;
    case Intrinsic::amdgcn_ubfe:
      applyMappingBFE(OpdMapper, false);
      return;
    case Intrinsic::amdgcn_ballot:
      // Use default handling and insert copy to vcc source.
      break;
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
    const AMDGPU::RsrcIntrinsic *RSrcIntrin
        = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
    assert(RSrcIntrin && RSrcIntrin->IsImage);
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
    return;
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    // Waterfall the operand at index (number of explicit operands - 2).
    unsigned N = MI.getNumExplicitOperands() - 2;
    applyDefaultMapping(OpdMapper);
    executeInWaterfallLoop(MI, MRI, { N });
    return;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      // This is only allowed to execute with 1 lane, so readfirstlane is safe.
      assert(OpdMapper.getVRegs(0).empty());
      substituteSimpleCopyRegs(OpdMapper, 3);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
      substituteSimpleCopyRegs(OpdMapper, 1);
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
      constrainOpWithReadfirstlane(MI, MRI, 1); // M0
      return;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // FIXME: Should this use a waterfall loop?
      constrainOpWithReadfirstlane(MI, MRI, 2); // M0
      return;
    }
    case Intrinsic::amdgcn_s_setreg: {
      constrainOpWithReadfirstlane(MI, MRI, 2);
      return;
    }
    default: {
      if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
              AMDGPU::lookupRsrcIntrinsic(IntrID)) {
        // Non-images can have complications from operands that allow both SGPR
        // and VGPR. For now it's too complicated to figure out the final opcode
        // to derive the register bank from the MCInstrDesc.
        if (RSrcIntrin->IsImage) {
          applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
          return;
        }
      }

      break;
    }
    }
    break;
  }
  case AMDGPU::G_SI_CALL: {
    // Use a set to avoid extra readfirstlanes in the case where multiple
    // operands are the same register.
    SmallSet<Register, 4> SGPROperandRegs;

    // Nothing to waterfall: fall back to the default mapping.
    if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
      break;

    // Move all copies to physical SGPRs that are used by the call instruction
    // into the loop block. Start searching for these copies until the
    // ADJCALLSTACKUP.
    unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
    unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;

    // Move all non-copies before the copies, so that a complete range can be
    // moved into the waterfall loop.
    SmallVector<MachineInstr *, 4> NonCopyInstrs;
    // Count of NonCopyInstrs found until the current LastCopy.
    unsigned NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator Start(&MI);
    MachineBasicBlock::iterator LastCopy = Start;
    MachineBasicBlock *MBB = MI.getParent();
    const SIMachineFunctionInfo *Info =
        MBB->getParent()->getInfo<SIMachineFunctionInfo>();

    // Walk backwards from the call until the frame setup instruction.
    while (Start->getOpcode() != FrameSetupOpcode) {
      --Start;
      bool IsCopy = false;
      if (Start->getOpcode() == AMDGPU::COPY) {
        auto &Dst = Start->getOperand(0);
        if (Dst.isReg()) {
          Register Reg = Dst.getReg();
          if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
            IsCopy = true;
          } else {
            // Also move the copy from the scratch rsrc descriptor into the loop
            // to allow it to be optimized away.
            auto &Src = Start->getOperand(1);
            if (Src.isReg()) {
              Reg = Src.getReg();
              IsCopy = Info->getScratchRSrcReg() == Reg;
            }
          }
        }
      }

      if (IsCopy) {
        LastCopy = Start;
        NonCopyInstrsLen = NonCopyInstrs.size();
      } else {
        NonCopyInstrs.push_back(&*Start);
      }
    }
    // Drop the non-copies recorded after the last copy was found; only the
    // ones between that copy and the call are moved in front of it.
    NonCopyInstrs.resize(NonCopyInstrsLen);

    for (auto *NonCopy : reverse(NonCopyInstrs)) {
      MBB->splice(LastCopy, MBB, NonCopy->getIterator());
    }
    Start = LastCopy;

    // Do the same for copies after the loop
    NonCopyInstrs.clear();
    NonCopyInstrsLen = 0;
    MachineBasicBlock::iterator End(&MI);
    LastCopy = End;

    // Walk forwards from the call until the frame destroy instruction.
    while (End->getOpcode() != FrameDestroyOpcode) {
      ++End;
      bool IsCopy = false;
      if (End->getOpcode() == AMDGPU::COPY) {
        auto &Src = End->getOperand(1);
        if (Src.isReg()) {
          Register Reg = Src.getReg();
          IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
        }
      }

      if (IsCopy) {
        LastCopy = End;
        NonCopyInstrsLen = NonCopyInstrs.size();
      } else {
        NonCopyInstrs.push_back(&*End);
      }
    }
    NonCopyInstrs.resize(NonCopyInstrsLen);

    End = LastCopy;
    ++LastCopy;
    for (auto *NonCopy : reverse(NonCopyInstrs)) {
      MBB->splice(LastCopy, MBB, NonCopy->getIterator());
    }

    // Make End a past-the-end iterator for the range [Start, End).
    ++End;
    MachineIRBuilder B(*Start);
    executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD: {
    if (applyMappingLoad(MI, OpdMapper, MRI))
      return;
    break;
  }
  case AMDGPU::G_DYN_STACKALLOC:
    applyMappingDynStackAlloc(MI, OpdMapper, MRI);
    return;
  case AMDGPU::G_SBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ true);
    return;
  case AMDGPU::G_UBFX:
    applyMappingBFE(OpdMapper, /*Signed*/ false);
    return;
  default:
    break;
  }

  // Anything that reached here without returning gets the default mapping.
  return applyDefaultMapping(OpdMapper);
}