in src/coreclr/jit/lsraarm64.cpp [1337:2109]
//------------------------------------------------------------------------
// BuildHWIntrinsic: Build the register uses, internal registers and definitions
//                   for a hardware intrinsic node.
//
// Arguments:
//    intrinsicTree - The hardware intrinsic node being processed
//    pDstCount     - [out] Receives the number of registers defined by the node
//
// Return Value:
//    The number of source uses that were built for the node.
//
// Notes:
//    Intrinsics that need consecutive registers build their own definitions and
//    return early; every other intrinsic shares the common definition-building
//    tail at the end of the function.
//
int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount)
{
    assert(pDstCount != nullptr);

    const HWIntrinsic intrin(intrinsicTree);

    int srcCount = 0;
    int dstCount = 0;

    if (HWIntrinsicInfo::IsMultiReg(intrin.id))
    {
        dstCount = intrinsicTree->GetMultiRegCount(compiler);
    }
    else if (intrinsicTree->IsValue())
    {
        dstCount = 1;
    }

    const bool hasImmediateOperand = HWIntrinsicInfo::HasImmediateOperand(intrin.id);

    if (hasImmediateOperand && !HWIntrinsicInfo::NoJmpTableImm(intrin.id))
    {
        // We may need to allocate an additional general-purpose register when an intrinsic has a non-const immediate
        // operand and the intrinsic does not have an alternative non-const fallback form.
        // However, for a case when the operand can take only two possible values - zero and one
        // the codegen can use cbnz to do conditional branch, so such register is not needed.
        bool needBranchTargetReg = false;

        int immLowerBound = 0;
        int immUpperBound = 0;

        if (intrin.category == HW_Category_SIMDByIndexedElement)
        {
            // The immediate bounds depend on the size of the vector that holds the indexed element,
            // and which operand that is depends on the operand count.
            var_types indexedElementOpType;

            if (intrin.numOperands == 2)
            {
                indexedElementOpType = intrin.op1->TypeGet();
            }
            else if (intrin.numOperands == 3)
            {
                indexedElementOpType = intrin.op2->TypeGet();
            }
            else
            {
                assert(intrin.numOperands == 4);
                indexedElementOpType = intrin.op3->TypeGet();
            }

            assert(varTypeIsSIMD(indexedElementOpType));

            const unsigned int indexedElementSimdSize = genTypeSize(indexedElementOpType);
            HWIntrinsicInfo::lookupImmBounds(intrin.id, indexedElementSimdSize, intrin.baseType, 1, &immLowerBound,
                                             &immUpperBound);
        }
        else
        {
            HWIntrinsicInfo::lookupImmBounds(intrin.id, intrinsicTree->GetSimdSize(), intrin.baseType, 1,
                                             &immLowerBound, &immUpperBound);
        }

        if ((immLowerBound != 0) || (immUpperBound != 1))
        {
            if ((intrin.category == HW_Category_SIMDByIndexedElement) ||
                (intrin.category == HW_Category_ShiftLeftByImmediate) ||
                (intrin.category == HW_Category_ShiftRightByImmediate))
            {
                // For these categories the immediate is always the last operand.
                switch (intrin.numOperands)
                {
                    case 4:
                        needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed();
                        break;

                    case 3:
                        needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed();
                        break;

                    case 2:
                        needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed();
                        break;

                    default:
                        unreached();
                }
            }
            else
            {
                // Otherwise the position of the immediate operand is specific to the intrinsic.
                switch (intrin.id)
                {
                    case NI_AdvSimd_DuplicateSelectedScalarToVector64:
                    case NI_AdvSimd_DuplicateSelectedScalarToVector128:
                    case NI_AdvSimd_Extract:
                    case NI_AdvSimd_Insert:
                    case NI_AdvSimd_InsertScalar:
                    case NI_AdvSimd_LoadAndInsertScalar:
                    case NI_AdvSimd_LoadAndInsertScalarVector64x2:
                    case NI_AdvSimd_LoadAndInsertScalarVector64x3:
                    case NI_AdvSimd_LoadAndInsertScalarVector64x4:
                    case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2:
                    case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
                    case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
                    case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128:
                        needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed();
                        break;

                    case NI_AdvSimd_ExtractVector64:
                    case NI_AdvSimd_ExtractVector128:
                    case NI_AdvSimd_StoreSelectedScalar:
                    case NI_AdvSimd_Arm64_StoreSelectedScalar:
                    case NI_Sve_PrefetchBytes:
                    case NI_Sve_PrefetchInt16:
                    case NI_Sve_PrefetchInt32:
                    case NI_Sve_PrefetchInt64:
                    case NI_Sve_ExtractVector:
                        needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed();
                        break;

                    case NI_AdvSimd_Arm64_InsertSelectedScalar:
                        assert(intrin.op2->isContainedIntOrIImmed());
                        assert(intrin.op4->isContainedIntOrIImmed());
                        break;

                    case NI_Sve_CreateTrueMaskByte:
                    case NI_Sve_CreateTrueMaskDouble:
                    case NI_Sve_CreateTrueMaskInt16:
                    case NI_Sve_CreateTrueMaskInt32:
                    case NI_Sve_CreateTrueMaskInt64:
                    case NI_Sve_CreateTrueMaskSByte:
                    case NI_Sve_CreateTrueMaskSingle:
                    case NI_Sve_CreateTrueMaskUInt16:
                    case NI_Sve_CreateTrueMaskUInt32:
                    case NI_Sve_CreateTrueMaskUInt64:
                    case NI_Sve_Count16BitElements:
                    case NI_Sve_Count32BitElements:
                    case NI_Sve_Count64BitElements:
                    case NI_Sve_Count8BitElements:
                        needBranchTargetReg = !intrin.op1->isContainedIntOrIImmed();
                        break;

                    case NI_Sve_GatherPrefetch8Bit:
                    case NI_Sve_GatherPrefetch16Bit:
                    case NI_Sve_GatherPrefetch32Bit:
                    case NI_Sve_GatherPrefetch64Bit:
                        // The immediate is op4 when op2 is not a SIMD value, and op3 otherwise.
                        if (!varTypeIsSIMD(intrin.op2->gtType))
                        {
                            needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed();
                        }
                        else
                        {
                            needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed();
                        }
                        break;

                    case NI_Sve_SaturatingDecrementBy16BitElementCount:
                    case NI_Sve_SaturatingDecrementBy32BitElementCount:
                    case NI_Sve_SaturatingDecrementBy64BitElementCount:
                    case NI_Sve_SaturatingDecrementBy8BitElementCount:
                    case NI_Sve_SaturatingIncrementBy16BitElementCount:
                    case NI_Sve_SaturatingIncrementBy32BitElementCount:
                    case NI_Sve_SaturatingIncrementBy64BitElementCount:
                    case NI_Sve_SaturatingIncrementBy8BitElementCount:
                    case NI_Sve_SaturatingDecrementBy16BitElementCountScalar:
                    case NI_Sve_SaturatingDecrementBy32BitElementCountScalar:
                    case NI_Sve_SaturatingDecrementBy64BitElementCountScalar:
                    case NI_Sve_SaturatingIncrementBy16BitElementCountScalar:
                    case NI_Sve_SaturatingIncrementBy32BitElementCountScalar:
                    case NI_Sve_SaturatingIncrementBy64BitElementCountScalar:
                        // Can only avoid generating a table if both immediates are constant.
                        assert(intrin.op2->isContainedIntOrIImmed() == intrin.op3->isContainedIntOrIImmed());
                        needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed();
                        // Ensure that internal does not collide with destination.
                        setInternalRegsDelayFree = true;
                        break;

                    default:
                        unreached();
                }
            }
        }

        if (needBranchTargetReg)
        {
            buildInternalIntRegisterDefForNode(intrinsicTree);
        }
    }

    // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it
    // is not allocated the same register as the target.
    const bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);

    bool tgtPrefOp1        = false;
    bool tgtPrefOp2        = false;
    bool delayFreeMultiple = false;

    if (intrin.op1 != nullptr)
    {
        bool simdRegToSimdRegMove = false;

        switch (intrin.id)
        {
            case NI_Vector64_CreateScalarUnsafe:
            case NI_Vector128_CreateScalarUnsafe:
            {
                simdRegToSimdRegMove = varTypeIsFloating(intrin.op1);
                break;
            }

            case NI_AdvSimd_Arm64_DuplicateToVector64:
            {
                simdRegToSimdRegMove = (intrin.op1->TypeGet() == TYP_DOUBLE);
                break;
            }

            case NI_Vector64_ToScalar:
            case NI_Vector128_ToScalar:
            {
                simdRegToSimdRegMove = varTypeIsFloating(intrinsicTree);
                break;
            }

            case NI_Vector64_ToVector128Unsafe:
            case NI_Vector128_AsVector128Unsafe:
            case NI_Vector128_AsVector3:
            case NI_Vector128_GetLower:
            {
                simdRegToSimdRegMove = true;
                break;
            }

            case NI_AdvSimd_LoadAndInsertScalarVector64x2:
            case NI_AdvSimd_LoadAndInsertScalarVector64x3:
            case NI_AdvSimd_LoadAndInsertScalarVector64x4:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
            {
                delayFreeMultiple = true;
                break;
            }

            default:
            {
                break;
            }
        }

        // If we have an RMW intrinsic or an intrinsic with simple move semantic between two SIMD registers,
        // we want to preference op1Reg to the target if op1 is not contained.
        if (isRMW || simdRegToSimdRegMove)
        {
            if (HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id))
            {
                assert(!simdRegToSimdRegMove);
                // Prefer op2Reg for the masked operation as mask would be the op1Reg
                tgtPrefOp2 = !intrin.op1->isContained();
            }
            else
            {
                tgtPrefOp1 = !intrin.op1->isContained();
            }
        }

        if (delayFreeMultiple)
        {
            // op1 is a field list; every field is built as a delay-free use.
            assert(isRMW);
            assert(intrin.op1->OperIs(GT_FIELD_LIST));
            GenTreeFieldList* op1 = intrin.op1->AsFieldList();
            assert(compiler->info.compNeedsConsecutiveRegisters);

            for (GenTreeFieldList::Use& use : op1->Uses())
            {
                BuildDelayFreeUses(use.GetNode(), intrinsicTree);
                srcCount++;
            }
        }
        else if (HWIntrinsicInfo::IsMaskedOperation(intrin.id))
        {
            if (!varTypeIsMask(intrin.op1->TypeGet()) && !HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id))
            {
                srcCount += BuildOperandUses(intrin.op1);
            }
            else
            {
                SingleTypeRegSet predMask = RBM_ALLMASK.GetPredicateRegSet();

                if (intrin.id == NI_Sve_ConditionalSelect)
                {
                    // If this is conditional select, make sure to check the embedded
                    // operation to determine the predicate mask.
                    assert(intrinsicTree->GetOperandCount() == 3);
                    assert(!HWIntrinsicInfo::IsLowMaskedOperation(intrin.id));

                    if (intrin.op2->OperIs(GT_HWINTRINSIC))
                    {
                        GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic();
                        const HWIntrinsic   intrinEmb(embOp2Node);
                        if (HWIntrinsicInfo::IsLowMaskedOperation(intrinEmb.id))
                        {
                            predMask = RBM_LOWMASK.GetPredicateRegSet();
                        }
                    }
                }
                else if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id))
                {
                    predMask = RBM_LOWMASK.GetPredicateRegSet();
                }

                if (tgtPrefOp2)
                {
                    srcCount += BuildDelayFreeUses(intrin.op1, intrin.op2, predMask);
                }
                else
                {
                    srcCount += BuildOperandUses(intrin.op1, predMask);
                }
            }
        }
        else if (intrinsicTree->OperIsMemoryLoadOrStore())
        {
            srcCount += BuildAddrUses(intrin.op1);
        }
        else if (tgtPrefOp1)
        {
            tgtPrefUse = BuildUse(intrin.op1);
            srcCount++;
        }
        else if ((intrin.id != NI_AdvSimd_VectorTableLookup) && (intrin.id != NI_AdvSimd_Arm64_VectorTableLookup))
        {
            srcCount += BuildOperandUses(intrin.op1);
        }
        else
        {
            srcCount += BuildConsecutiveRegistersForUse(intrin.op1);
        }
    }

    if ((intrin.category == HW_Category_SIMDByIndexedElement) && (genTypeSize(intrin.baseType) == 2))
    {
        // Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g.
        // "MLA (by element)") have encoding that restricts what registers that can be used for the indexed element when
        // the element size is H (i.e. 2 bytes).
        assert(intrin.op2 != nullptr);

        if ((intrin.op4 != nullptr) || ((intrin.op3 != nullptr) && !hasImmediateOperand))
        {
            // Here op3 is the indexed-element vector, so it is the operand that gets the restricted register set.
            if (isRMW)
            {
                srcCount += BuildDelayFreeUses(intrin.op2, nullptr);
                srcCount +=
                    BuildDelayFreeUses(intrin.op3, nullptr, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet());
            }
            else
            {
                srcCount += BuildOperandUses(intrin.op2);
                srcCount += BuildOperandUses(intrin.op3, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet());
            }

            if (intrin.op4 != nullptr)
            {
                assert(hasImmediateOperand);
                assert(varTypeIsIntegral(intrin.op4));

                srcCount += BuildOperandUses(intrin.op4);
            }
        }
        else
        {
            // Here op2 is the indexed-element vector and op3 (if present) is the immediate.
            assert(!isRMW);

            if (intrin.id == NI_Sve_DuplicateSelectedScalarToVector)
            {
                srcCount += BuildOperandUses(intrin.op2);
            }
            else
            {
                srcCount += BuildOperandUses(intrin.op2, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet());
            }

            if (intrin.op3 != nullptr)
            {
                assert(hasImmediateOperand);
                assert(varTypeIsIntegral(intrin.op3));

                srcCount += BuildOperandUses(intrin.op3);
            }
        }
    }
    else if (HWIntrinsicInfo::NeedsConsecutiveRegisters(intrin.id))
    {
        // These intrinsics build their own internal register uses and definitions,
        // set *pDstCount themselves and return without reaching the common tail.
        switch (intrin.id)
        {
            case NI_AdvSimd_VectorTableLookup:
            case NI_AdvSimd_Arm64_VectorTableLookup:
            {
                assert(intrin.op2 != nullptr);
                srcCount += BuildOperandUses(intrin.op2);
                assert(dstCount == 1);
                buildInternalRegisterUses();
                BuildDef(intrinsicTree);
                *pDstCount = 1;
                break;
            }

            case NI_AdvSimd_VectorTableLookupExtension:
            case NI_AdvSimd_Arm64_VectorTableLookupExtension:
            {
                assert(intrin.op2 != nullptr);
                assert(intrin.op3 != nullptr);
                assert(isRMW);
                srcCount += BuildConsecutiveRegistersForUse(intrin.op2, intrin.op1);
                srcCount += BuildDelayFreeUses(intrin.op3, intrin.op1);
                assert(dstCount == 1);
                buildInternalRegisterUses();
                BuildDef(intrinsicTree);
                *pDstCount = 1;
                break;
            }

            case NI_AdvSimd_StoreSelectedScalar:
            case NI_AdvSimd_Arm64_StoreSelectedScalar:
            {
                assert(intrin.op1 != nullptr);
                assert(intrin.op3 != nullptr);
                srcCount += (intrin.op2->gtType == TYP_STRUCT) ? BuildConsecutiveRegistersForUse(intrin.op2)
                                                               : BuildOperandUses(intrin.op2);
                if (!intrin.op3->isContainedIntOrIImmed())
                {
                    srcCount += BuildOperandUses(intrin.op3);
                }
                assert(dstCount == 0);
                buildInternalRegisterUses();
                *pDstCount = 0;
                break;
            }

            case NI_AdvSimd_Store:
            case NI_AdvSimd_Arm64_Store:
            case NI_AdvSimd_StoreVectorAndZip:
            case NI_AdvSimd_Arm64_StoreVectorAndZip:
            {
                assert(intrin.op1 != nullptr);
                srcCount += BuildConsecutiveRegistersForUse(intrin.op2);
                assert(dstCount == 0);
                buildInternalRegisterUses();
                *pDstCount = 0;
                break;
            }

            case NI_AdvSimd_LoadAndInsertScalarVector64x2:
            case NI_AdvSimd_LoadAndInsertScalarVector64x3:
            case NI_AdvSimd_LoadAndInsertScalarVector64x4:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
            {
                assert(intrin.op2 != nullptr);
                assert(intrin.op3 != nullptr);
                assert(isRMW);
                if (!intrin.op2->isContainedIntOrIImmed())
                {
                    srcCount += BuildOperandUses(intrin.op2);
                }

                assert(intrinsicTree->OperIsMemoryLoadOrStore());
                srcCount += BuildAddrUses(intrin.op3);
                buildInternalRegisterUses();
                // The consecutive-register definitions are built by the shared load case below.
                FALLTHROUGH;
            }

            case NI_AdvSimd_Load2xVector64AndUnzip:
            case NI_AdvSimd_Load3xVector64AndUnzip:
            case NI_AdvSimd_Load4xVector64AndUnzip:
            case NI_AdvSimd_Arm64_Load2xVector128AndUnzip:
            case NI_AdvSimd_Arm64_Load3xVector128AndUnzip:
            case NI_AdvSimd_Arm64_Load4xVector128AndUnzip:
            case NI_AdvSimd_Load2xVector64:
            case NI_AdvSimd_Load3xVector64:
            case NI_AdvSimd_Load4xVector64:
            case NI_AdvSimd_Arm64_Load2xVector128:
            case NI_AdvSimd_Arm64_Load3xVector128:
            case NI_AdvSimd_Arm64_Load4xVector128:
            case NI_AdvSimd_LoadAndReplicateToVector64x2:
            case NI_AdvSimd_LoadAndReplicateToVector64x3:
            case NI_AdvSimd_LoadAndReplicateToVector64x4:
            case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x2:
            case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x3:
            case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x4:
            {
                assert(intrin.op1 != nullptr);
                BuildConsecutiveRegistersForDef(intrinsicTree, dstCount);
                *pDstCount = dstCount;
                break;
            }

            case NI_Sve_Load2xVectorAndUnzip:
            case NI_Sve_Load3xVectorAndUnzip:
            case NI_Sve_Load4xVectorAndUnzip:
            {
                assert(intrin.op1 != nullptr);
                assert(intrin.op2 != nullptr);
                assert(intrinsicTree->OperIsMemoryLoadOrStore());
                srcCount += BuildAddrUses(intrin.op2);
                BuildConsecutiveRegistersForDef(intrinsicTree, dstCount);
                *pDstCount = dstCount;
                break;
            }

            case NI_Sve_StoreAndZipx2:
            case NI_Sve_StoreAndZipx3:
            case NI_Sve_StoreAndZipx4:
            {
                assert(intrin.op2 != nullptr);
                assert(intrin.op3 != nullptr);
                srcCount += BuildAddrUses(intrin.op2);
                srcCount += BuildConsecutiveRegistersForUse(intrin.op3);
                assert(dstCount == 0);
                buildInternalRegisterUses();
                *pDstCount = 0;
                break;
            }

            default:
                noway_assert(!"Not a supported as multiple consecutive register intrinsic");
        }

        return srcCount;
    }
    else if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp()) &&
             (intrin.op2->isRMWHWIntrinsic(compiler)))
    {
        assert(intrin.op3 != nullptr);

        // For ConditionalSelect, if there is an embedded operation, and the operation has RMW semantics
        // then record delay-free for operands as well as the "merge" value
        GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic();
        const size_t        numArgs    = embOp2Node->GetOperandCount();
        const HWIntrinsic   intrinEmb(embOp2Node);

        if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmb.id))
        {
            assert(embOp2Node->isRMWHWIntrinsic(compiler));
            assert(numArgs == 3);

            LIR::Use use;
            GenTree* user = nullptr;

            if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embOp2Node, &use))
            {
                user = use.User();
            }

            unsigned resultOpNum =
                embOp2Node->GetResultOpNumForRmwIntrinsic(user, intrinEmb.op1, intrinEmb.op2, intrinEmb.op3);

            // Reorder the operands so that emitOp1 is the one that is preferenced to the target.
            GenTree* emitOp1 = intrinEmb.op1;
            GenTree* emitOp2 = intrinEmb.op2;
            GenTree* emitOp3 = intrinEmb.op3;

            if (resultOpNum == 2)
            {
                // op2 = op1 + (op2 * op3)
                std::swap(emitOp1, emitOp3);
                std::swap(emitOp1, emitOp2);
                // op1 = (op1 * op2) + op3
            }
            else if (resultOpNum == 3)
            {
                // op3 = op1 + (op2 * op3)
                std::swap(emitOp1, emitOp3);
                // op1 = (op1 * op2) + op3
            }
            else
            {
                // op1 = op1 + (op2 * op3)
                // Nothing needs to be done
            }

            tgtPrefUse = BuildUse(emitOp1);
            srcCount += 1;
            srcCount += BuildDelayFreeUses(emitOp2, emitOp1);
            srcCount += BuildDelayFreeUses(emitOp3, emitOp1);
            srcCount += BuildDelayFreeUses(intrin.op3, emitOp1);
        }
        else
        {
            assert((numArgs == 1) || (numArgs == 2) || (numArgs == 3));

            // The first embedded operand is preferenced to the target; the remaining embedded
            // operands and the "merge" value (op3) are delay-free relative to it.
            tgtPrefUse = BuildUse(embOp2Node->Op(1));
            srcCount += 1;

            for (size_t argNum = 2; argNum <= numArgs; argNum++)
            {
                srcCount += BuildDelayFreeUses(embOp2Node->Op(argNum), embOp2Node->Op(1));
            }

            srcCount += BuildDelayFreeUses(intrin.op3, embOp2Node->Op(1));
        }
    }
    else if (intrin.op2 != nullptr)
    {
        // RMW intrinsic operands don't have to be delayFree when they can be assigned the same register as op1Reg
        // (i.e. a register that corresponds to read-modify-write operand) and one of them is the last use.
        assert(intrin.op1 != nullptr);

        bool             forceOp2DelayFree   = false;
        SingleTypeRegSet lowVectorCandidates = RBM_NONE;
        size_t           lowVectorOperandNum = 0;

        if ((intrin.id == NI_Vector64_GetElement) || (intrin.id == NI_Vector128_GetElement))
        {
            if (!intrin.op2->IsCnsIntOrI() && (!intrin.op1->isContained() || intrin.op1->OperIsLocal()))
            {
                // If the index is not a constant and the object is not contained or is a local
                // we will need a general purpose register to calculate the address
                // internal register must not clobber input index
                // TODO-Cleanup: An internal register will never clobber a source; this code actually
                // ensures that the index (op2) doesn't interfere with the target.
                buildInternalIntRegisterDefForNode(intrinsicTree);
                forceOp2DelayFree = true;
            }

            if (!intrin.op2->IsCnsIntOrI() && !intrin.op1->isContained())
            {
                // If the index is not a constant and op1 is not contained,
                // we will use the SIMD temp location to store the vector.
                var_types requiredSimdTempType = (intrin.id == NI_Vector64_GetElement) ? TYP_SIMD8 : TYP_SIMD16;
                compiler->getSIMDInitTempVarNum(requiredSimdTempType);
            }
        }
        else if (HWIntrinsicInfo::IsLowVectorOperation(intrin.id))
        {
            getLowVectorOperandAndCandidates(intrin, &lowVectorOperandNum, &lowVectorCandidates);
        }

        // NOTE: A ConditionalSelect that wraps an embedded RMW operation never reaches this point;
        // it is fully handled by the dedicated branch above, so no special case is needed here.
        if (tgtPrefOp2)
        {
            if (!intrin.op2->isContained())
            {
                assert(tgtPrefUse == nullptr);
                tgtPrefUse2 = BuildUse(intrin.op2);
                srcCount++;
            }
            else
            {
                srcCount += BuildOperandUses(intrin.op2);
            }
        }
        else
        {
            switch (intrin.id)
            {
                case NI_Sve_LoadVectorNonTemporal:
                case NI_Sve_LoadVector128AndReplicateToVector:
                case NI_Sve_StoreAndZip:
                case NI_Sve_PrefetchBytes:
                case NI_Sve_PrefetchInt16:
                case NI_Sve_PrefetchInt32:
                case NI_Sve_PrefetchInt64:
                    assert(intrinsicTree->OperIsMemoryLoadOrStore());
                    srcCount += BuildAddrUses(intrin.op2);
                    break;

                case NI_Sve_GatherPrefetch8Bit:
                case NI_Sve_GatherPrefetch16Bit:
                case NI_Sve_GatherPrefetch32Bit:
                case NI_Sve_GatherPrefetch64Bit:
                case NI_Sve_GatherVector:
                case NI_Sve_GatherVectorByteZeroExtend:
                case NI_Sve_GatherVectorInt16SignExtend:
                case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend:
                case NI_Sve_GatherVectorInt32SignExtend:
                case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend:
                case NI_Sve_GatherVectorSByteSignExtend:
                case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend:
                case NI_Sve_GatherVectorUInt16ZeroExtend:
                case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
                case NI_Sve_GatherVectorUInt32ZeroExtend:
                    assert(intrinsicTree->OperIsMemoryLoadOrStore());

                    if (!varTypeIsSIMD(intrin.op2->gtType))
                    {
                        // op2 is not a SIMD value, so it is built as an address.
                        srcCount += BuildAddrUses(intrin.op2);
                        break;
                    }

                    // Otherwise op2 is treated like any other operand.
                    FALLTHROUGH;

                default:
                {
                    SingleTypeRegSet candidates = lowVectorOperandNum == 2 ? lowVectorCandidates : RBM_NONE;

                    if (intrin.op2->gtType == TYP_MASK)
                    {
                        assert(lowVectorOperandNum != 2);
                        candidates = RBM_ALLMASK.GetPredicateRegSet();
                    }

                    if (forceOp2DelayFree)
                    {
                        srcCount += BuildDelayFreeUses(intrin.op2, nullptr, candidates);
                    }
                    else
                    {
                        srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, intrin.op1, candidates)
                                          : BuildOperandUses(intrin.op2, candidates);
                    }
                }
                break;
            }
        }

        if (intrin.op3 != nullptr)
        {
            SingleTypeRegSet candidates = lowVectorOperandNum == 3 ? lowVectorCandidates : RBM_NONE;

            if (isRMW)
            {
                srcCount += BuildDelayFreeUses(intrin.op3, (tgtPrefOp2 ? intrin.op2 : intrin.op1), candidates);
            }
            else
            {
                srcCount += BuildOperandUses(intrin.op3, candidates);
            }

            if (intrin.op4 != nullptr)
            {
                assert(lowVectorOperandNum != 4);
                assert(!tgtPrefOp2);
                srcCount += isRMW ? BuildDelayFreeUses(intrin.op4, intrin.op1) : BuildOperandUses(intrin.op4);
            }
        }
    }

    // Common tail: build the internal register uses followed by the definition(s) of the node.
    buildInternalRegisterUses();

    if ((dstCount == 1) || (dstCount == 2))
    {
        BuildDef(intrinsicTree);

        if (dstCount == 2)
        {
            BuildDef(intrinsicTree, RBM_NONE, 1);
        }
    }
    else
    {
        assert(dstCount == 0);
    }

    *pDstCount = dstCount;
    return srcCount;
}