in src/coreclr/jit/lsraxarch.cpp [2094:2882]
//------------------------------------------------------------------------
// BuildHWIntrinsic: Build the source uses, internal registers and definitions
//                   for a hardware intrinsic node.
//
// Arguments:
//    intrinsicTree - The hardware intrinsic node being processed
//    pDstCount     - OUT parameter (must not be null): receives the number of
//                    registers defined by the node (0, 1, or the multi-reg count)
//
// Return Value:
//    The number of source uses built for this node (srcCount).
//
// Notes:
//    Most intrinsics go through the generic use-building code after the switch
//    (guarded by 'buildUses'). Cases that need fixed registers, target preferencing
//    or custom delay-free handling build their own uses and clear 'buildUses'.
//
int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount)
{
assert(pDstCount != nullptr);
NamedIntrinsic intrinsicId = intrinsicTree->GetHWIntrinsicId();
var_types baseType = intrinsicTree->GetSimdBaseType();
size_t numArgs = intrinsicTree->GetOperandCount();
HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId);
// Set the AVX Flags if this instruction may use VEX encoding for SIMD operations.
// Note that this may be true even if the ISA is not AVX (e.g. for platform-agnostic intrinsics
// or non-AVX intrinsics that will use VEX encoding if it is available on the target).
if (intrinsicTree->isSIMD())
{
SetContainsAVXFlags(intrinsicTree->GetSimdSize());
}
// srcCount accumulates the source uses built below and is the return value.
int srcCount = 0;
// dstCount is 0 when the node is not a value, the multi-reg count for multi-reg
// intrinsics, and 1 otherwise.
int dstCount;
if (intrinsicTree->IsValue())
{
if (HWIntrinsicInfo::IsMultiReg(intrinsicId))
{
dstCount = HWIntrinsicInfo::GetMultiRegCount(intrinsicId);
}
else
{
dstCount = 1;
}
}
else
{
dstCount = 0;
}
// Candidate registers for the single definition built at the end of this method;
// this stays RBM_NONE unless one of the cases or checks below narrows it.
SingleTypeRegSet dstCandidates = RBM_NONE;
if (intrinsicTree->GetOperandCount() == 0)
{
// No operands: there are no uses to build, only the definition (if any) below.
assert(numArgs == 0);
}
else
{
// A contained CreateScalarUnsafe is special in that we're not containing it to load from
// memory and it isn't a constant. Instead, it's essentially a "transparent" node we're ignoring
// to simplify the overall IR handling. As such, we need to "skip" such nodes when present and
// get the underlying op1 so that delayFreeUse and other preferencing remains correct.
GenTree* op1 = nullptr;
GenTree* op2 = nullptr;
GenTree* op3 = nullptr;
GenTree* op4 = nullptr;
GenTree* op5 = nullptr;
// lastOp is the final operand; it is the one inspected for the immediate and
// embedded-rounding checks below.
GenTree* lastOp = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(numArgs));
// Populate op1..opN by falling through from the highest operand number; only
// 1 to 5 operands are expected here, anything else hits unreached().
switch (numArgs)
{
case 5:
{
op5 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(5));
FALLTHROUGH;
}
case 4:
{
op4 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(4));
FALLTHROUGH;
}
case 3:
{
op3 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(3));
FALLTHROUGH;
}
case 2:
{
op2 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(2));
FALLTHROUGH;
}
case 1:
{
op1 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(1));
break;
}
default:
{
unreached();
}
}
// buildUses stays true when the generic use-building code after the switch should run;
// cases that build their own uses set it to false.
bool buildUses = true;
// IMM-category intrinsics, excluding those flagged NoJmpTableImm.
if ((category == HW_Category_IMM) && !HWIntrinsicInfo::NoJmpTableImm(intrinsicId))
{
if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && !lastOp->isContainedIntOrIImmed())
{
assert(!lastOp->IsCnsIntOrI());
// We need two extra registers when lastOp isn't a constant so
// the offset into the jump table for the fallback path
// can be computed.
buildInternalIntRegisterDefForNode(intrinsicTree);
buildInternalIntRegisterDefForNode(intrinsicTree);
}
}
// Embedded rounding with a non-constant last operand likewise reserves two
// internal integer registers.
if (intrinsicTree->OperIsEmbRoundingEnabled() && !lastOp->IsCnsIntOrI())
{
buildInternalIntRegisterDefForNode(intrinsicTree);
buildInternalIntRegisterDefForNode(intrinsicTree);
}
// Determine whether this is an RMW operation where op2+ must be marked delayFree so that it
// is not allocated the same register as the target.
bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);
#if defined(TARGET_AMD64)
bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic();
#endif // TARGET_AMD64
// Create internal temps, and handle any other special requirements.
// Note that the default case for building uses will handle the RMW flag, but if the uses
// are built in the individual cases, buildUses is set to false, and any RMW handling (delayFree)
// must be handled within the case.
switch (intrinsicId)
{
case NI_Vector128_CreateScalarUnsafe:
case NI_Vector128_ToScalar:
case NI_Vector256_CreateScalarUnsafe:
case NI_Vector256_ToScalar:
case NI_Vector512_CreateScalarUnsafe:
case NI_Vector512_ToScalar:
{
assert(numArgs == 1);
// Only floating-point base types are handled here; other base types leave
// buildUses set and go through the generic path.
if (varTypeIsFloating(baseType))
{
if (op1->isContained())
{
srcCount += BuildOperandUses(op1);
}
else
{
// We will either be in memory and need to be moved
// into a register of the appropriate size or we
// are already in an XMM/YMM/ZMM register and can stay
// where we are.
tgtPrefUse = BuildUse(op1);
srcCount += 1;
}
buildUses = false;
}
break;
}
case NI_Vector128_GetElement:
case NI_Vector256_GetElement:
case NI_Vector512_GetElement:
{
assert(numArgs == 2);
if (!op2->OperIsConst() && !op1->isContained())
{
// If the index is not a constant and op1 is not contained,
// we will use the SIMD temp location to store the vector.
// The uses themselves are still built by the generic path below.
var_types requiredSimdTempType = Compiler::getSIMDTypeForSize(intrinsicTree->GetSimdSize());
compiler->getSIMDInitTempVarNum(requiredSimdTempType);
}
break;
}
case NI_Vector128_AsVector128Unsafe:
case NI_Vector128_AsVector2:
case NI_Vector128_AsVector3:
case NI_Vector128_ToVector256:
case NI_Vector128_ToVector512:
case NI_Vector256_ToVector512:
case NI_Vector128_ToVector256Unsafe:
case NI_Vector256_ToVector512Unsafe:
case NI_Vector256_GetLower:
case NI_Vector512_GetLower:
case NI_Vector512_GetLower128:
{
// Single-operand cases: a non-contained op1 is recorded as tgtPrefUse.
assert(numArgs == 1);
if (op1->isContained())
{
srcCount += BuildOperandUses(op1);
}
else
{
// We will either be in memory and need to be moved
// into a register of the appropriate size or we
// are already in an XMM/YMM register and can stay
// where we are.
tgtPrefUse = BuildUse(op1);
srcCount += 1;
}
buildUses = false;
break;
}
case NI_SSE2_MaskMove:
{
assert(numArgs == 3);
assert(!isRMW);
// MaskMove hardcodes the destination (op3) in DI/EDI/RDI
srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1));
srcCount += BuildOperandUses(op2, BuildEvexIncompatibleMask(op2));
srcCount += BuildOperandUses(op3, SRBM_EDI);
buildUses = false;
break;
}
case NI_SSE41_BlendVariable:
{
assert(numArgs == 3);
// Only the non-VEX encoding needs special handling here; when VEX encoding
// can be used the generic path below builds the uses.
if (!compiler->canUseVexEncoding())
{
assert(isRMW);
// SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
tgtPrefUse = BuildUse(op1, BuildEvexIncompatibleMask(op1));
srcCount += 1;
srcCount += op2->isContained() ? BuildOperandUses(op2, BuildEvexIncompatibleMask(op2))
: BuildDelayFreeUses(op2, op1, BuildEvexIncompatibleMask(op2));
srcCount += BuildDelayFreeUses(op3, op1, SRBM_XMM0);
buildUses = false;
}
break;
}
case NI_SSE41_Extract:
{
assert(!varTypeIsFloating(baseType));
#ifdef TARGET_X86
// On x86, restrict the destination to allByteRegs() for byte base types.
if (varTypeIsByte(baseType))
{
dstCandidates = allByteRegs();
}
#endif
break;
}
#ifdef TARGET_X86
case NI_SSE42_Crc32:
case NI_SSE42_X64_Crc32:
{
// TODO-XArch-Cleanup: Currently we use the BaseType to bring the type of the second argument
// to the code generator. We may want to encode the overload info in another way.
assert(numArgs == 2);
assert(isRMW);
// CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers.
tgtPrefUse = BuildUse(op1);
srcCount += 1;
srcCount += BuildDelayFreeUses(op2, op1, varTypeIsByte(baseType) ? allByteRegs() : RBM_NONE);
buildUses = false;
break;
}
#endif // TARGET_X86
case NI_X86Base_DivRem:
case NI_X86Base_X64_DivRem:
{
assert(numArgs == 3);
assert(dstCount == 2);
assert(isRMW);
// DIV implicitly puts op1 (lower) in EAX and op2 (upper) in EDX
srcCount += BuildOperandUses(op1, SRBM_EAX);
srcCount += BuildOperandUses(op2, SRBM_EDX);
if (!op3->isContained())
{
// For non-contained nodes, we want to make sure we delay free the register for
// op3 with respect to both op1 and op2. In other words, op3 shouldn't get the same
// register that is assigned to either op1 or op2.
RefPosition* op3RefPosition;
srcCount += BuildDelayFreeUses(op3, op1, RBM_NONE, &op3RefPosition);
if ((op3RefPosition != nullptr) && !op3RefPosition->delayRegFree)
{
// If op3 was not marked as delay-free for op1, mark it as delay-free
// if needed for op2.
AddDelayFreeUses(op3RefPosition, op2);
}
}
else
{
srcCount += BuildOperandUses(op3);
}
// The results are put in EAX (def index 0) and EDX (def index 1). Both defs are built
// here, which is why the dstCount == 2 path at the end of the method builds nothing.
BuildDef(intrinsicTree, SRBM_EAX, 0);
BuildDef(intrinsicTree, SRBM_EDX, 1);
buildUses = false;
break;
}
case NI_BMI2_MultiplyNoFlags:
case NI_BMI2_X64_MultiplyNoFlags:
{
assert(numArgs == 2 || numArgs == 3);
// op1 is fixed to EDX; op2 has no register constraint.
srcCount += BuildOperandUses(op1, SRBM_EDX);
srcCount += BuildOperandUses(op2);
if (numArgs == 3)
{
// op3 reg should be different from target reg to
// store the lower half result after executing the instruction
srcCount += BuildDelayFreeUses(op3, op1);
// Need an internal register different from the dst to take the lower half result
buildInternalIntRegisterDefForNode(intrinsicTree);
setInternalRegsDelayFree = true;
}
buildUses = false;
break;
}
case NI_FMA_MultiplyAdd:
case NI_FMA_MultiplyAddNegated:
case NI_FMA_MultiplyAddNegatedScalar:
case NI_FMA_MultiplyAddScalar:
case NI_FMA_MultiplyAddSubtract:
case NI_FMA_MultiplySubtract:
case NI_FMA_MultiplySubtractAdd:
case NI_FMA_MultiplySubtractNegated:
case NI_FMA_MultiplySubtractNegatedScalar:
case NI_FMA_MultiplySubtractScalar:
case NI_AVX512F_FusedMultiplyAdd:
case NI_AVX512F_FusedMultiplyAddScalar:
case NI_AVX512F_FusedMultiplyAddNegated:
case NI_AVX512F_FusedMultiplyAddNegatedScalar:
case NI_AVX512F_FusedMultiplyAddSubtract:
case NI_AVX512F_FusedMultiplySubtract:
case NI_AVX512F_FusedMultiplySubtractScalar:
case NI_AVX512F_FusedMultiplySubtractAdd:
case NI_AVX512F_FusedMultiplySubtractNegated:
case NI_AVX512F_FusedMultiplySubtractNegatedScalar:
case NI_AVX10v1_FusedMultiplyAddNegatedScalar:
case NI_AVX10v1_FusedMultiplyAddScalar:
case NI_AVX10v1_FusedMultiplySubtractNegatedScalar:
case NI_AVX10v1_FusedMultiplySubtractScalar:
{
assert((numArgs == 3) || (intrinsicTree->OperIsEmbRoundingEnabled()));
assert(isRMW);
assert(HWIntrinsicInfo::IsFmaIntrinsic(intrinsicId));
const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);
// Look up the user of this node in the current block's LIR range (if there is one);
// it is passed to GetResultOpNumForRmwIntrinsic along with the three operands.
LIR::Use use;
GenTree* user = nullptr;
if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(intrinsicTree, &use))
{
user = use.User();
}
// resultOpNum is compared against 1, 2 and 3 (for op1..op3) below; the comments
// below also treat 0 as a possible value.
unsigned resultOpNum = intrinsicTree->GetResultOpNumForRmwIntrinsic(user, op1, op2, op3);
unsigned containedOpNum = 0;
// containedOpNum remains 0 when no operand is contained or regOptional
if (op1->isContained() || op1->IsRegOptional())
{
containedOpNum = 1;
}
else if (op2->isContained() || op2->IsRegOptional())
{
containedOpNum = 2;
}
else if (op3->isContained() || op3->IsRegOptional())
{
containedOpNum = 3;
}
// emitOp1..emitOp3 track which original operand ends up in each emit position
// after the swaps below; emitOp1 is the one preferenced to the target.
GenTree* emitOp1 = op1;
GenTree* emitOp2 = op2;
GenTree* emitOp3 = op3;
// Intrinsics with CopyUpperBits semantics must have op1 as target
assert(containedOpNum != 1 || !copiesUpperBits);
// We need to keep this in sync with hwintrinsiccodegenxarch.cpp
// Ideally we'd actually swap the operands here and simplify codegen
// but it's a bit more complicated to do so for many operands as well
// as being complicated to tell codegen how to pick the right instruction
if (containedOpNum == 1)
{
// https://github.com/dotnet/runtime/issues/62215
// resultOpNum might change between lowering and lsra, comment out assertion for now.
// assert(containedOpNum != resultOpNum);
// resultOpNum is 3 or 0: op3/? = ([op1] * op2) + op3
std::swap(emitOp1, emitOp3);
if (resultOpNum == 2)
{
// op2 = ([op1] * op2) + op3
std::swap(emitOp1, emitOp2);
}
}
else if (containedOpNum == 3)
{
// assert(containedOpNum != resultOpNum);
if (resultOpNum == 2 && !copiesUpperBits)
{
// op2 = (op1 * op2) + [op3]
std::swap(emitOp1, emitOp2);
}
// else: op1/? = (op1 * op2) + [op3]
}
else if (containedOpNum == 2)
{
// assert(containedOpNum != resultOpNum);
// op1/? = (op1 * [op2]) + op3
std::swap(emitOp2, emitOp3);
if (resultOpNum == 3 && !copiesUpperBits)
{
// op3 = (op1 * [op2]) + op3
std::swap(emitOp1, emitOp2);
}
}
else
{
// containedOpNum == 0
// no extra work when resultOpNum is 0 or 1
if (resultOpNum == 2)
{
std::swap(emitOp1, emitOp2);
}
else if (resultOpNum == 3)
{
std::swap(emitOp1, emitOp3);
}
}
// Build the uses in original operand order (op1, op2, op3) while assigning roles by
// emit position: emitOp1 is the target-preferenced use and the others are delay-free
// relative to it, except that a contained emitOp3 builds plain operand uses.
GenTree* ops[] = {op1, op2, op3};
for (GenTree* op : ops)
{
if (op == emitOp1)
{
tgtPrefUse = BuildUse(op);
srcCount++;
}
else if (op == emitOp2)
{
srcCount += BuildDelayFreeUses(op, emitOp1);
}
else if (op == emitOp3)
{
srcCount += op->isContained() ? BuildOperandUses(op) : BuildDelayFreeUses(op, emitOp1);
}
}
// With embedded rounding enabled, a non-constant Op(4) is an additional source.
if (intrinsicTree->OperIsEmbRoundingEnabled() && !intrinsicTree->Op(4)->IsCnsIntOrI())
{
srcCount += BuildOperandUses(intrinsicTree->Op(4));
}
buildUses = false;
break;
}
case NI_EVEX_BlendVariableMask:
{
assert(numArgs == 3);
// Special handling only applies when op2 is an embedded-mask operation;
// otherwise buildUses stays true and the generic path is used.
if (op2->IsEmbMaskOp())
{
// TODO-AVX512-CQ: Ensure we can support embedded operations on RMW intrinsics
assert(!op2->isRMWHWIntrinsic(compiler));
if (isRMW)
{
assert(!op1->isContained());
tgtPrefUse = BuildUse(op1);
srcCount += 1;
assert(op2->isContained());
// op2 is contained, so its operands are built as uses here, delay-free
// relative to op1.
for (GenTree* operand : op2->AsHWIntrinsic()->Operands())
{
assert(varTypeIsSIMD(operand) || varTypeIsInt(operand));
srcCount += BuildDelayFreeUses(operand, op1);
}
}
else
{
assert(op1->isContained() && op1->IsVectorZero());
srcCount += BuildOperandUses(op1);
assert(op2->isContained());
// op2 is contained, so its operands are built as uses here.
for (GenTree* operand : op2->AsHWIntrinsic()->Operands())
{
assert(varTypeIsSIMD(operand) || varTypeIsInt(operand));
srcCount += BuildOperandUses(operand);
}
}
assert(!op3->isContained());
srcCount += BuildOperandUses(op3);
buildUses = false;
}
break;
}
case NI_AVX512F_PermuteVar8x64x2:
case NI_AVX512F_PermuteVar16x32x2:
case NI_AVX512F_VL_PermuteVar2x64x2:
case NI_AVX512F_VL_PermuteVar4x32x2:
case NI_AVX512F_VL_PermuteVar4x64x2:
case NI_AVX512F_VL_PermuteVar8x32x2:
case NI_AVX512BW_PermuteVar32x16x2:
case NI_AVX512BW_VL_PermuteVar8x16x2:
case NI_AVX512BW_VL_PermuteVar16x16x2:
case NI_AVX512VBMI_PermuteVar64x8x2:
case NI_AVX512VBMI_VL_PermuteVar16x8x2:
case NI_AVX512VBMI_VL_PermuteVar32x8x2:
case NI_AVX10v1_PermuteVar16x8x2:
case NI_AVX10v1_PermuteVar2x64x2:
case NI_AVX10v1_PermuteVar4x32x2:
case NI_AVX10v1_PermuteVar8x16x2:
case NI_AVX10v1_PermuteVar32x8x2:
case NI_AVX10v1_PermuteVar4x64x2:
case NI_AVX10v1_PermuteVar8x32x2:
case NI_AVX10v1_PermuteVar16x16x2:
case NI_AVX10v1_V512_PermuteVar64x8x2:
{
assert(numArgs == 3);
assert(isRMW);
assert(HWIntrinsicInfo::IsPermuteVar2x(intrinsicId));
// Look up the user of this node (if any) so the result operand number can be computed.
LIR::Use use;
GenTree* user = nullptr;
if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(intrinsicTree, &use))
{
user = use.User();
}
unsigned resultOpNum = intrinsicTree->GetResultOpNumForRmwIntrinsic(user, op1, op2, op3);
assert(!op1->isContained());
assert(!op2->isContained());
GenTree* emitOp1 = op1;
GenTree* emitOp2 = op2;
GenTree* emitOp3 = op3;
// Only op1 and op2 are ever swapped here; emitOp3 always remains op3.
if (resultOpNum == 2)
{
std::swap(emitOp1, emitOp2);
}
// Build the uses in original operand order while assigning roles by emit position:
// emitOp1 is the target-preferenced use and the others are delay-free relative to it,
// except that a contained emitOp3 builds plain operand uses.
GenTree* ops[] = {op1, op2, op3};
for (GenTree* op : ops)
{
if (op == emitOp1)
{
tgtPrefUse = BuildUse(op);
srcCount++;
}
else if (op == emitOp2)
{
srcCount += BuildDelayFreeUses(op, emitOp1);
}
else if (op == emitOp3)
{
srcCount += op->isContained() ? BuildOperandUses(op) : BuildDelayFreeUses(op, emitOp1);
}
}
buildUses = false;
break;
}
case NI_AVXVNNI_MultiplyWideningAndAdd:
case NI_AVXVNNI_MultiplyWideningAndAddSaturate:
{
assert(numArgs == 3);
// op1 is the target-preferenced use; op2 and a non-contained op3 are delay-free
// relative to it.
tgtPrefUse = BuildUse(op1);
srcCount += 1;
srcCount += BuildDelayFreeUses(op2, op1);
srcCount += op3->isContained() ? BuildOperandUses(op3) : BuildDelayFreeUses(op3, op1);
buildUses = false;
break;
}
case NI_AVX2_GatherVector128:
case NI_AVX2_GatherVector256:
{
assert(numArgs == 3);
assert(!isRMW);
// Any pair of the index, mask, or destination registers should be different
srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1));
srcCount += BuildDelayFreeUses(op2, nullptr, BuildEvexIncompatibleMask(op2));
// op3 should always be contained
assert(op3->isContained());
// get a tmp register for mask that will be cleared by gather instructions
buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs());
setInternalRegsDelayFree = true;
buildUses = false;
break;
}
case NI_AVX2_GatherMaskVector128:
case NI_AVX2_GatherMaskVector256:
{
// NOTE(review): unlike the sibling cases there is no assert on numArgs here; op4 and
// op5 being non-null relies on numArgs == 5 - confirm and consider asserting it.
assert(!isRMW);
// Any pair of the index, mask, or destination registers should be different
srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1));
srcCount += BuildDelayFreeUses(op2, nullptr, BuildEvexIncompatibleMask(op2));
srcCount += BuildDelayFreeUses(op3, nullptr, BuildEvexIncompatibleMask(op3));
srcCount += BuildDelayFreeUses(op4, nullptr, BuildEvexIncompatibleMask(op4));
// op5 should always be contained
assert(op5->isContained());
// get a tmp register for mask that will be cleared by gather instructions
buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs());
setInternalRegsDelayFree = true;
buildUses = false;
break;
}
default:
{
// Everything else uses the generic path. FMA and PermuteVar2x intrinsics must have
// been handled by their dedicated cases above.
assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END));
assert(!HWIntrinsicInfo::IsFmaIntrinsic(intrinsicId));
assert(!HWIntrinsicInfo::IsPermuteVar2x(intrinsicId));
break;
}
}
// Generic use building for op1..op4, used by every case that left buildUses set.
if (buildUses)
{
// On AMD64, operands of intrinsics that are not EVEX-compatible get their candidates
// from BuildEvexIncompatibleMask; otherwise the candidates stay RBM_NONE.
SingleTypeRegSet op1RegCandidates = RBM_NONE;
#if defined(TARGET_AMD64)
if (!isEvexCompatible)
{
op1RegCandidates = BuildEvexIncompatibleMask(op1);
}
#endif // TARGET_AMD64
if (intrinsicTree->OperIsMemoryLoadOrStore())
{
// Memory loads/stores: op1 is built as an address.
srcCount += BuildAddrUses(op1, op1RegCandidates);
}
else if (isRMW && !op1->isContained())
{
// RMW with op1 not contained: op1 is the target-preferenced use.
tgtPrefUse = BuildUse(op1, op1RegCandidates);
srcCount += 1;
}
else
{
srcCount += BuildOperandUses(op1, op1RegCandidates);
}
if (op2 != nullptr)
{
SingleTypeRegSet op2RegCandidates = RBM_NONE;
#if defined(TARGET_AMD64)
if (!isEvexCompatible)
{
op2RegCandidates = BuildEvexIncompatibleMask(op2);
}
#endif // TARGET_AMD64
if (op2->OperIs(GT_HWINTRINSIC) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained())
{
// op2 is a contained memory-load intrinsic: build address uses for its first operand.
srcCount += BuildAddrUses(op2->AsHWIntrinsic()->Op(1), op2RegCandidates);
}
else if (isRMW)
{
if (!op2->isContained() && intrinsicTree->isCommutativeHWIntrinsic())
{
// When op2 is not contained and we are commutative, we can set op2
// to also be a tgtPrefUse. Codegen will then swap the operands.
tgtPrefUse2 = BuildUse(op2, op2RegCandidates);
srcCount += 1;
}
else if (!op2->isContained() || varTypeIsArithmetic(intrinsicTree->TypeGet()))
{
// When op2 is not contained or if we are producing a scalar value
// we need to mark it as delay free because the operand and target
// exist in the same register set.
srcCount += BuildDelayFreeUses(op2, op1, op2RegCandidates);
}
else
{
// When op2 is contained and we are not producing a scalar value we
// have no concerns of overwriting op2 because they exist in different
// register sets.
srcCount += BuildOperandUses(op2, op2RegCandidates);
}
}
else
{
srcCount += BuildOperandUses(op2, op2RegCandidates);
}
if (op3 != nullptr)
{
SingleTypeRegSet op3RegCandidates = RBM_NONE;
#if defined(TARGET_AMD64)
if (!isEvexCompatible)
{
op3RegCandidates = BuildEvexIncompatibleMask(op3);
}
#endif // TARGET_AMD64
// For RMW intrinsics, op3 is delay-free relative to op1.
srcCount += isRMW ? BuildDelayFreeUses(op3, op1, op3RegCandidates)
: BuildOperandUses(op3, op3RegCandidates);
if (op4 != nullptr)
{
// On AMD64 a fourth operand is only expected for EVEX-compatible intrinsics
// (asserted below), so no EVEX-incompatible mask is applied to op4.
SingleTypeRegSet op4RegCandidates = RBM_NONE;
#if defined(TARGET_AMD64)
assert(isEvexCompatible);
#endif // TARGET_AMD64
srcCount += isRMW ? BuildDelayFreeUses(op4, op1, op4RegCandidates)
: BuildOperandUses(op4, op4RegCandidates);
}
}
}
}
// Build the uses for any internal registers requested above.
buildInternalRegisterUses();
}
if (dstCount == 1)
{
// Single definition: on AMD64, intrinsics that are not EVEX-compatible take their
// destination candidates from BuildEvexIncompatibleMask.
#if defined(TARGET_AMD64)
bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic();
if (!isEvexCompatible)
{
dstCandidates = BuildEvexIncompatibleMask(intrinsicTree);
}
#endif
BuildDef(intrinsicTree, dstCandidates);
}
else
{
// Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above
assert((dstCount == 0) ||
((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem))));
}
*pDstCount = dstCount;
return srcCount;
}