int LinearScan::BuildHWIntrinsic()

in src/coreclr/jit/lsraarm64.cpp [1337:2109]


//------------------------------------------------------------------------
// BuildHWIntrinsic: Build the register uses, internal registers and definitions
//                   for a hardware intrinsic node.
//
// Arguments:
//    intrinsicTree - The hardware intrinsic node being processed
//    pDstCount     - [out] Receives the number of registers defined by the node;
//                    must not be null
//
// Return Value:
//    The number of source uses that were built for the node.
//
// Notes:
//    The function proceeds in this order:
//      1. Work out the destination count.
//      2. Reserve an internal integer register when an immediate operand is not a
//         contained constant and the codegen cannot avoid a branch table.
//      3. Build the use(s) for op1, preferring it (or op2 for explicitly masked
//         operations) to the target for RMW / register-move intrinsics.
//      4. Build the uses for the remaining operands. Intrinsics that need consecutive
//         registers are finished inside their own switch and return early.
//      5. Build the internal register uses and the definition(s).
//
int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount)
{
    assert(pDstCount != nullptr);

    const HWIntrinsic intrin(intrinsicTree);

    int       srcCount      = 0;
    int       dstCount      = 0;
    regMaskTP dstCandidates = RBM_NONE;

    // Multi-reg intrinsics define several registers; any other value-producing node defines exactly one.
    if (HWIntrinsicInfo::IsMultiReg(intrin.id))
    {
        dstCount = intrinsicTree->GetMultiRegCount(compiler);
    }
    else if (intrinsicTree->IsValue())
    {
        dstCount = 1;
    }

    const bool hasImmediateOperand = HWIntrinsicInfo::HasImmediateOperand(intrin.id);

    if (hasImmediateOperand && !HWIntrinsicInfo::NoJmpTableImm(intrin.id))
    {
        // We may need to allocate an additional general-purpose register when an intrinsic has a non-const immediate
        // operand and the intrinsic does not have an alternative non-const fallback form.
        // However, for a case when the operand can take only two possible values - zero and one
        // the codegen can use cbnz to do conditional branch, so such register is not needed.

        bool needBranchTargetReg = false;

        int immLowerBound = 0;
        int immUpperBound = 0;

        if (intrin.category == HW_Category_SIMDByIndexedElement)
        {
            // The immediate bounds depend on the size of the vector holding the indexed element, which is the
            // operand immediately preceding the (trailing) immediate operand.
            var_types indexedElementOpType;

            if (intrin.numOperands == 2)
            {
                indexedElementOpType = intrin.op1->TypeGet();
            }
            else if (intrin.numOperands == 3)
            {
                indexedElementOpType = intrin.op2->TypeGet();
            }
            else
            {
                assert(intrin.numOperands == 4);
                indexedElementOpType = intrin.op3->TypeGet();
            }

            assert(varTypeIsSIMD(indexedElementOpType));

            const unsigned int indexedElementSimdSize = genTypeSize(indexedElementOpType);
            HWIntrinsicInfo::lookupImmBounds(intrin.id, indexedElementSimdSize, intrin.baseType, 1, &immLowerBound,
                                             &immUpperBound);
        }
        else
        {
            HWIntrinsicInfo::lookupImmBounds(intrin.id, intrinsicTree->GetSimdSize(), intrin.baseType, 1,
                                             &immLowerBound, &immUpperBound);
        }

        // A [0, 1] immediate range never needs the extra register (see the comment above).
        if ((immLowerBound != 0) || (immUpperBound != 1))
        {
            if ((intrin.category == HW_Category_SIMDByIndexedElement) ||
                (intrin.category == HW_Category_ShiftLeftByImmediate) ||
                (intrin.category == HW_Category_ShiftRightByImmediate))
            {
                // For these categories the immediate is always the last operand.
                switch (intrin.numOperands)
                {
                    case 4:
                        needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed();
                        break;

                    case 3:
                        needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed();
                        break;

                    case 2:
                        needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed();
                        break;

                    default:
                        unreached();
                }
            }
            else
            {
                // Otherwise the position of the immediate operand is specific to the intrinsic.
                switch (intrin.id)
                {
                    case NI_AdvSimd_DuplicateSelectedScalarToVector64:
                    case NI_AdvSimd_DuplicateSelectedScalarToVector128:
                    case NI_AdvSimd_Extract:
                    case NI_AdvSimd_Insert:
                    case NI_AdvSimd_InsertScalar:
                    case NI_AdvSimd_LoadAndInsertScalar:
                    case NI_AdvSimd_LoadAndInsertScalarVector64x2:
                    case NI_AdvSimd_LoadAndInsertScalarVector64x3:
                    case NI_AdvSimd_LoadAndInsertScalarVector64x4:
                    case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2:
                    case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
                    case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
                    case NI_AdvSimd_Arm64_DuplicateSelectedScalarToVector128:
                        needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed();
                        break;

                    case NI_AdvSimd_ExtractVector64:
                    case NI_AdvSimd_ExtractVector128:
                    case NI_AdvSimd_StoreSelectedScalar:
                    case NI_AdvSimd_Arm64_StoreSelectedScalar:
                    case NI_Sve_PrefetchBytes:
                    case NI_Sve_PrefetchInt16:
                    case NI_Sve_PrefetchInt32:
                    case NI_Sve_PrefetchInt64:
                    case NI_Sve_ExtractVector:
                        needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed();
                        break;

                    case NI_AdvSimd_Arm64_InsertSelectedScalar:
                        // Both immediates are expected to be contained constants, so no register is requested.
                        assert(intrin.op2->isContainedIntOrIImmed());
                        assert(intrin.op4->isContainedIntOrIImmed());
                        break;

                    case NI_Sve_CreateTrueMaskByte:
                    case NI_Sve_CreateTrueMaskDouble:
                    case NI_Sve_CreateTrueMaskInt16:
                    case NI_Sve_CreateTrueMaskInt32:
                    case NI_Sve_CreateTrueMaskInt64:
                    case NI_Sve_CreateTrueMaskSByte:
                    case NI_Sve_CreateTrueMaskSingle:
                    case NI_Sve_CreateTrueMaskUInt16:
                    case NI_Sve_CreateTrueMaskUInt32:
                    case NI_Sve_CreateTrueMaskUInt64:
                    case NI_Sve_Count16BitElements:
                    case NI_Sve_Count32BitElements:
                    case NI_Sve_Count64BitElements:
                    case NI_Sve_Count8BitElements:
                        needBranchTargetReg = !intrin.op1->isContainedIntOrIImmed();
                        break;

                    case NI_Sve_GatherPrefetch8Bit:
                    case NI_Sve_GatherPrefetch16Bit:
                    case NI_Sve_GatherPrefetch32Bit:
                    case NI_Sve_GatherPrefetch64Bit:
                        // The immediate is op4 when op2 is not a SIMD value, and op3 otherwise.
                        if (!varTypeIsSIMD(intrin.op2->gtType))
                        {
                            needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed();
                        }
                        else
                        {
                            needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed();
                        }
                        break;

                    case NI_Sve_SaturatingDecrementBy16BitElementCount:
                    case NI_Sve_SaturatingDecrementBy32BitElementCount:
                    case NI_Sve_SaturatingDecrementBy64BitElementCount:
                    case NI_Sve_SaturatingDecrementBy8BitElementCount:
                    case NI_Sve_SaturatingIncrementBy16BitElementCount:
                    case NI_Sve_SaturatingIncrementBy32BitElementCount:
                    case NI_Sve_SaturatingIncrementBy64BitElementCount:
                    case NI_Sve_SaturatingIncrementBy8BitElementCount:
                    case NI_Sve_SaturatingDecrementBy16BitElementCountScalar:
                    case NI_Sve_SaturatingDecrementBy32BitElementCountScalar:
                    case NI_Sve_SaturatingDecrementBy64BitElementCountScalar:
                    case NI_Sve_SaturatingIncrementBy16BitElementCountScalar:
                    case NI_Sve_SaturatingIncrementBy32BitElementCountScalar:
                    case NI_Sve_SaturatingIncrementBy64BitElementCountScalar:
                        // Can only avoid generating a table if both immediates are constant.
                        assert(intrin.op2->isContainedIntOrIImmed() == intrin.op3->isContainedIntOrIImmed());
                        needBranchTargetReg = !intrin.op2->isContainedIntOrIImmed();
                        // Ensure that internal does not collide with destination.
                        setInternalRegsDelayFree = true;
                        break;

                    default:
                        unreached();
                }
            }
        }

        if (needBranchTargetReg)
        {
            buildInternalIntRegisterDefForNode(intrinsicTree);
        }
    }

    // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it
    // is not allocated the same register as the target.
    const bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);

    bool tgtPrefOp1        = false;
    bool tgtPrefOp2        = false;
    bool delayFreeMultiple = false;
    if (intrin.op1 != nullptr)
    {
        // Set for intrinsics that have simple move semantics between two SIMD registers.
        bool simdRegToSimdRegMove = false;

        switch (intrin.id)
        {
            case NI_Vector64_CreateScalarUnsafe:
            case NI_Vector128_CreateScalarUnsafe:
            {
                simdRegToSimdRegMove = varTypeIsFloating(intrin.op1);
                break;
            }

            case NI_AdvSimd_Arm64_DuplicateToVector64:
            {
                simdRegToSimdRegMove = (intrin.op1->TypeGet() == TYP_DOUBLE);
                break;
            }

            case NI_Vector64_ToScalar:
            case NI_Vector128_ToScalar:
            {
                simdRegToSimdRegMove = varTypeIsFloating(intrinsicTree);
                break;
            }

            case NI_Vector64_ToVector128Unsafe:
            case NI_Vector128_AsVector128Unsafe:
            case NI_Vector128_AsVector3:
            case NI_Vector128_GetLower:
            {
                simdRegToSimdRegMove = true;
                break;
            }
            case NI_AdvSimd_LoadAndInsertScalarVector64x2:
            case NI_AdvSimd_LoadAndInsertScalarVector64x3:
            case NI_AdvSimd_LoadAndInsertScalarVector64x4:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
            {
                // op1 is a field list of vectors; each field gets its own delay-free use below.
                delayFreeMultiple = true;
                break;
            }

            default:
            {
                break;
            }
        }

        // If we have an RMW intrinsic or an intrinsic with simple move semantic between two SIMD registers,
        // we want to preference op1Reg to the target if op1 is not contained.

        if ((isRMW || simdRegToSimdRegMove))
        {
            if (HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id))
            {
                assert(!simdRegToSimdRegMove);
                // Prefer op2Reg for the masked operation as mask would be the op1Reg
                tgtPrefOp2 = !intrin.op1->isContained();
            }
            else
            {
                tgtPrefOp1 = !intrin.op1->isContained();
            }
        }

        if (delayFreeMultiple)
        {
            assert(isRMW);
            assert(intrin.op1->OperIs(GT_FIELD_LIST));
            GenTreeFieldList* op1 = intrin.op1->AsFieldList();
            assert(compiler->info.compNeedsConsecutiveRegisters);

            for (GenTreeFieldList::Use& use : op1->Uses())
            {
                BuildDelayFreeUses(use.GetNode(), intrinsicTree);
                srcCount++;
            }
        }
        else if (HWIntrinsicInfo::IsMaskedOperation(intrin.id))
        {
            if (!varTypeIsMask(intrin.op1->TypeGet()) && !HWIntrinsicInfo::IsExplicitMaskedOperation(intrin.id))
            {
                // op1 is not a mask here, so no predicate register restriction applies.
                srcCount += BuildOperandUses(intrin.op1);
            }
            else
            {
                // op1 is the mask: restrict it to predicate registers, narrowed to the "low" set when the
                // operation (or the operation embedded in a ConditionalSelect) is a low-masked one.
                SingleTypeRegSet predMask = RBM_ALLMASK.GetPredicateRegSet();
                if (intrin.id == NI_Sve_ConditionalSelect)
                {
                    // If this is conditional select, make sure to check the embedded
                    // operation to determine the predicate mask.
                    assert(intrinsicTree->GetOperandCount() == 3);
                    assert(!HWIntrinsicInfo::IsLowMaskedOperation(intrin.id));

                    if (intrin.op2->OperIs(GT_HWINTRINSIC))
                    {
                        GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic();
                        const HWIntrinsic   intrinEmb(embOp2Node);
                        if (HWIntrinsicInfo::IsLowMaskedOperation(intrinEmb.id))
                        {
                            predMask = RBM_LOWMASK.GetPredicateRegSet();
                        }
                    }
                }
                else if (HWIntrinsicInfo::IsLowMaskedOperation(intrin.id))
                {
                    predMask = RBM_LOWMASK.GetPredicateRegSet();
                }

                if (tgtPrefOp2)
                {
                    // The target is preferenced to op2, so the mask is built as a delay-free use relative to op2.
                    srcCount += BuildDelayFreeUses(intrin.op1, intrin.op2, predMask);
                }
                else
                {
                    srcCount += BuildOperandUses(intrin.op1, predMask);
                }
            }
        }
        else if (intrinsicTree->OperIsMemoryLoadOrStore())
        {
            srcCount += BuildAddrUses(intrin.op1);
        }
        else if (tgtPrefOp1)
        {
            tgtPrefUse = BuildUse(intrin.op1);
            srcCount++;
        }
        else if ((intrin.id != NI_AdvSimd_VectorTableLookup) && (intrin.id != NI_AdvSimd_Arm64_VectorTableLookup))
        {
            srcCount += BuildOperandUses(intrin.op1);
        }
        else
        {
            // Table lookup: op1 (the table) is built as a consecutive-register use.
            srcCount += BuildConsecutiveRegistersForUse(intrin.op1);
        }
    }

    if ((intrin.category == HW_Category_SIMDByIndexedElement) && (genTypeSize(intrin.baseType) == 2))
    {
        // Some "Advanced SIMD scalar x indexed element" and "Advanced SIMD vector x indexed element" instructions (e.g.
        // "MLA (by element)") have encoding that restricts what registers that can be used for the indexed element when
        // the element size is H (i.e. 2 bytes).
        assert(intrin.op2 != nullptr);

        if ((intrin.op4 != nullptr) || ((intrin.op3 != nullptr) && !hasImmediateOperand))
        {
            // Here op3 is the indexed-element vector and receives the restricted register set.
            if (isRMW)
            {
                srcCount += BuildDelayFreeUses(intrin.op2, nullptr);
                srcCount +=
                    BuildDelayFreeUses(intrin.op3, nullptr, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet());
            }
            else
            {
                srcCount += BuildOperandUses(intrin.op2);
                srcCount += BuildOperandUses(intrin.op3, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet());
            }

            if (intrin.op4 != nullptr)
            {
                assert(hasImmediateOperand);
                assert(varTypeIsIntegral(intrin.op4));

                srcCount += BuildOperandUses(intrin.op4);
            }
        }
        else
        {
            // Here op2 is the indexed-element vector and op3 (if present) is the immediate.
            assert(!isRMW);

            if (intrin.id == NI_Sve_DuplicateSelectedScalarToVector)
            {
                srcCount += BuildOperandUses(intrin.op2);
            }
            else
            {
                srcCount += BuildOperandUses(intrin.op2, RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS.GetFloatRegSet());
            }

            if (intrin.op3 != nullptr)
            {
                assert(hasImmediateOperand);
                assert(varTypeIsIntegral(intrin.op3));

                srcCount += BuildOperandUses(intrin.op3);
            }
        }
    }
    else if (HWIntrinsicInfo::NeedsConsecutiveRegisters(intrin.id))
    {
        // Every case below sets *pDstCount and builds whatever definition(s) it needs itself, so this
        // branch returns right after the switch instead of falling into the common tail of the function.
        switch (intrin.id)
        {
            case NI_AdvSimd_VectorTableLookup:
            case NI_AdvSimd_Arm64_VectorTableLookup:
            {
                assert(intrin.op2 != nullptr);
                srcCount += BuildOperandUses(intrin.op2);
                assert(dstCount == 1);
                buildInternalRegisterUses();
                BuildDef(intrinsicTree);
                *pDstCount = 1;
                break;
            }

            case NI_AdvSimd_VectorTableLookupExtension:
            case NI_AdvSimd_Arm64_VectorTableLookupExtension:
            {
                assert(intrin.op2 != nullptr);
                assert(intrin.op3 != nullptr);
                assert(isRMW);
                srcCount += BuildConsecutiveRegistersForUse(intrin.op2, intrin.op1);
                srcCount += BuildDelayFreeUses(intrin.op3, intrin.op1);
                assert(dstCount == 1);
                buildInternalRegisterUses();
                BuildDef(intrinsicTree);
                *pDstCount = 1;
                break;
            }

            case NI_AdvSimd_StoreSelectedScalar:
            case NI_AdvSimd_Arm64_StoreSelectedScalar:
            {
                assert(intrin.op1 != nullptr);
                assert(intrin.op3 != nullptr);
                // Only a TYP_STRUCT op2 is built as a consecutive-register use.
                srcCount += (intrin.op2->gtType == TYP_STRUCT) ? BuildConsecutiveRegistersForUse(intrin.op2)
                                                               : BuildOperandUses(intrin.op2);
                if (!intrin.op3->isContainedIntOrIImmed())
                {
                    srcCount += BuildOperandUses(intrin.op3);
                }
                assert(dstCount == 0);
                buildInternalRegisterUses();
                *pDstCount = 0;
                break;
            }

            case NI_AdvSimd_Store:
            case NI_AdvSimd_Arm64_Store:
            case NI_AdvSimd_StoreVectorAndZip:
            case NI_AdvSimd_Arm64_StoreVectorAndZip:
            {
                assert(intrin.op1 != nullptr);
                srcCount += BuildConsecutiveRegistersForUse(intrin.op2);
                assert(dstCount == 0);
                buildInternalRegisterUses();
                *pDstCount = 0;
                break;
            }

            case NI_AdvSimd_LoadAndInsertScalarVector64x2:
            case NI_AdvSimd_LoadAndInsertScalarVector64x3:
            case NI_AdvSimd_LoadAndInsertScalarVector64x4:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x2:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x3:
            case NI_AdvSimd_Arm64_LoadAndInsertScalarVector128x4:
            {
                assert(intrin.op2 != nullptr);
                assert(intrin.op3 != nullptr);
                assert(isRMW);
                if (!intrin.op2->isContainedIntOrIImmed())
                {
                    srcCount += BuildOperandUses(intrin.op2);
                }

                assert(intrinsicTree->OperIsMemoryLoadOrStore());
                srcCount += BuildAddrUses(intrin.op3);
                buildInternalRegisterUses();
                // The consecutive-register definitions are built by the shared load case below.
                FALLTHROUGH;
            }

            case NI_AdvSimd_Load2xVector64AndUnzip:
            case NI_AdvSimd_Load3xVector64AndUnzip:
            case NI_AdvSimd_Load4xVector64AndUnzip:
            case NI_AdvSimd_Arm64_Load2xVector128AndUnzip:
            case NI_AdvSimd_Arm64_Load3xVector128AndUnzip:
            case NI_AdvSimd_Arm64_Load4xVector128AndUnzip:
            case NI_AdvSimd_Load2xVector64:
            case NI_AdvSimd_Load3xVector64:
            case NI_AdvSimd_Load4xVector64:
            case NI_AdvSimd_Arm64_Load2xVector128:
            case NI_AdvSimd_Arm64_Load3xVector128:
            case NI_AdvSimd_Arm64_Load4xVector128:
            case NI_AdvSimd_LoadAndReplicateToVector64x2:
            case NI_AdvSimd_LoadAndReplicateToVector64x3:
            case NI_AdvSimd_LoadAndReplicateToVector64x4:
            case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x2:
            case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x3:
            case NI_AdvSimd_Arm64_LoadAndReplicateToVector128x4:
            {
                assert(intrin.op1 != nullptr);
                BuildConsecutiveRegistersForDef(intrinsicTree, dstCount);
                *pDstCount = dstCount;
                break;
            }

            case NI_Sve_Load2xVectorAndUnzip:
            case NI_Sve_Load3xVectorAndUnzip:
            case NI_Sve_Load4xVectorAndUnzip:
            {
                assert(intrin.op1 != nullptr);
                assert(intrin.op2 != nullptr);
                assert(intrinsicTree->OperIsMemoryLoadOrStore());
                srcCount += BuildAddrUses(intrin.op2);
                BuildConsecutiveRegistersForDef(intrinsicTree, dstCount);
                *pDstCount = dstCount;
                break;
            }

            case NI_Sve_StoreAndZipx2:
            case NI_Sve_StoreAndZipx3:
            case NI_Sve_StoreAndZipx4:
            {
                assert(intrin.op2 != nullptr);
                assert(intrin.op3 != nullptr);
                srcCount += BuildAddrUses(intrin.op2);
                srcCount += BuildConsecutiveRegistersForUse(intrin.op3);
                assert(dstCount == 0);
                buildInternalRegisterUses();
                *pDstCount = 0;
                break;
            }

            default:
                noway_assert(!"Not a supported as multiple consecutive register intrinsic");
        }
        return srcCount;
    }

    else if ((intrin.id == NI_Sve_ConditionalSelect) && (intrin.op2->IsEmbMaskOp()) &&
             (intrin.op2->isRMWHWIntrinsic(compiler)))
    {
        assert(intrin.op3 != nullptr);

        // For ConditionalSelect, if there is an embedded operation, and the operation has RMW semantics
        // then record delay-free for operands as well as the "merge" value
        GenTreeHWIntrinsic* embOp2Node = intrin.op2->AsHWIntrinsic();
        const HWIntrinsic   intrinEmb(embOp2Node);
        const size_t        numArgs = embOp2Node->GetOperandCount();

        if (HWIntrinsicInfo::IsFmaIntrinsic(intrinEmb.id))
        {
            assert(embOp2Node->isRMWHWIntrinsic(compiler));
            assert(numArgs == 3);

            // Find the user of the embedded node (if any) so that the operand which ends up holding
            // the result can be determined.
            LIR::Use use;
            GenTree* user = nullptr;

            if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(embOp2Node, &use))
            {
                user = use.User();
            }
            unsigned resultOpNum =
                embOp2Node->GetResultOpNumForRmwIntrinsic(user, intrinEmb.op1, intrinEmb.op2, intrinEmb.op3);

            GenTree* emitOp1 = intrinEmb.op1;
            GenTree* emitOp2 = intrinEmb.op2;
            GenTree* emitOp3 = intrinEmb.op3;

            // Reorder the operands so that emitOp1 is the one that is preferenced to the target.
            if (resultOpNum == 2)
            {
                // op2 = op1 + (op2 * op3)
                std::swap(emitOp1, emitOp3);
                std::swap(emitOp1, emitOp2);
                // op1 = (op1 * op2) + op3
            }
            else if (resultOpNum == 3)
            {
                // op3 = op1 + (op2 * op3)
                std::swap(emitOp1, emitOp3);
                // op1 = (op1 * op2) + op3
            }
            else
            {
                // op1 = op1 + (op2 * op3)
                // Nothing needs to be done
            }

            tgtPrefUse = BuildUse(emitOp1);
            srcCount += 1;
            srcCount += BuildDelayFreeUses(emitOp2, emitOp1);
            srcCount += BuildDelayFreeUses(emitOp3, emitOp1);
            srcCount += BuildDelayFreeUses(intrin.op3, emitOp1);
        }
        else
        {
            assert((numArgs == 1) || (numArgs == 2) || (numArgs == 3));
            // The first operand of the embedded operation is preferenced to the target; every other
            // embedded operand and the "merge" value (op3) are delay-free relative to it.
            tgtPrefUse = BuildUse(embOp2Node->Op(1));
            srcCount += 1;

            for (size_t argNum = 2; argNum <= numArgs; argNum++)
            {
                srcCount += BuildDelayFreeUses(embOp2Node->Op(argNum), embOp2Node->Op(1));
            }

            srcCount += BuildDelayFreeUses(intrin.op3, embOp2Node->Op(1));
        }
    }

    else if (intrin.op2 != nullptr)
    {
        // RMW intrinsic operands doesn't have to be delayFree when they can be assigned the same register as op1Reg
        // (i.e. a register that corresponds to read-modify-write operand) and one of them is the last use.

        assert(intrin.op1 != nullptr);

        bool             forceOp2DelayFree   = false;
        SingleTypeRegSet lowVectorCandidates = RBM_NONE;
        size_t           lowVectorOperandNum = 0;
        if ((intrin.id == NI_Vector64_GetElement) || (intrin.id == NI_Vector128_GetElement))
        {
            if (!intrin.op2->IsCnsIntOrI() && (!intrin.op1->isContained() || intrin.op1->OperIsLocal()))
            {
                // If the index is not a constant and the object is not contained or is a local
                // we will need a general purpose register to calculate the address
                // internal register must not clobber input index
                // TODO-Cleanup: An internal register will never clobber a source; this code actually
                // ensures that the index (op2) doesn't interfere with the target.
                buildInternalIntRegisterDefForNode(intrinsicTree);
                forceOp2DelayFree = true;
            }

            if (!intrin.op2->IsCnsIntOrI() && !intrin.op1->isContained())
            {
                // If the index is not a constant or op1 is in register,
                // we will use the SIMD temp location to store the vector.
                var_types requiredSimdTempType = (intrin.id == NI_Vector64_GetElement) ? TYP_SIMD8 : TYP_SIMD16;
                compiler->getSIMDInitTempVarNum(requiredSimdTempType);
            }
        }
        else if (HWIntrinsicInfo::IsLowVectorOperation(intrin.id))
        {
            getLowVectorOperandAndCandidates(intrin, &lowVectorOperandNum, &lowVectorCandidates);
        }

        // NOTE: An SVE ConditionalSelect wrapping an embedded RMW operation never reaches this point; it is
        // fully handled by the preceding branch of the enclosing if/else-if chain, so it is not re-checked here.
        if (tgtPrefOp2)
        {
            if (!intrin.op2->isContained())
            {
                assert(tgtPrefUse == nullptr);
                tgtPrefUse2 = BuildUse(intrin.op2);
                srcCount++;
            }
            else
            {
                srcCount += BuildOperandUses(intrin.op2);
            }
        }
        else
        {
            switch (intrin.id)
            {
                case NI_Sve_LoadVectorNonTemporal:
                case NI_Sve_LoadVector128AndReplicateToVector:
                case NI_Sve_StoreAndZip:
                case NI_Sve_PrefetchBytes:
                case NI_Sve_PrefetchInt16:
                case NI_Sve_PrefetchInt32:
                case NI_Sve_PrefetchInt64:
                    assert(intrinsicTree->OperIsMemoryLoadOrStore());
                    srcCount += BuildAddrUses(intrin.op2);
                    break;

                case NI_Sve_GatherPrefetch8Bit:
                case NI_Sve_GatherPrefetch16Bit:
                case NI_Sve_GatherPrefetch32Bit:
                case NI_Sve_GatherPrefetch64Bit:
                case NI_Sve_GatherVector:
                case NI_Sve_GatherVectorByteZeroExtend:
                case NI_Sve_GatherVectorInt16SignExtend:
                case NI_Sve_GatherVectorInt16WithByteOffsetsSignExtend:
                case NI_Sve_GatherVectorInt32SignExtend:
                case NI_Sve_GatherVectorInt32WithByteOffsetsSignExtend:
                case NI_Sve_GatherVectorSByteSignExtend:
                case NI_Sve_GatherVectorUInt16WithByteOffsetsZeroExtend:
                case NI_Sve_GatherVectorUInt16ZeroExtend:
                case NI_Sve_GatherVectorUInt32WithByteOffsetsZeroExtend:
                case NI_Sve_GatherVectorUInt32ZeroExtend:
                    assert(intrinsicTree->OperIsMemoryLoadOrStore());
                    // A non-SIMD op2 is built as an address; a SIMD op2 takes the default path below.
                    if (!varTypeIsSIMD(intrin.op2->gtType))
                    {
                        srcCount += BuildAddrUses(intrin.op2);
                        break;
                    }
                    FALLTHROUGH;

                default:
                {
                    SingleTypeRegSet candidates = lowVectorOperandNum == 2 ? lowVectorCandidates : RBM_NONE;

                    if (intrin.op2->gtType == TYP_MASK)
                    {
                        assert(lowVectorOperandNum != 2);
                        candidates = RBM_ALLMASK.GetPredicateRegSet();
                    }

                    if (forceOp2DelayFree)
                    {
                        srcCount += BuildDelayFreeUses(intrin.op2, nullptr, candidates);
                    }
                    else
                    {
                        srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, intrin.op1, candidates)
                                          : BuildOperandUses(intrin.op2, candidates);
                    }
                }
                break;
            }
        }

        if (intrin.op3 != nullptr)
        {
            SingleTypeRegSet candidates = lowVectorOperandNum == 3 ? lowVectorCandidates : RBM_NONE;

            if (isRMW)
            {
                // Delay-free relative to whichever operand was preferenced to the target.
                srcCount += BuildDelayFreeUses(intrin.op3, (tgtPrefOp2 ? intrin.op2 : intrin.op1), candidates);
            }
            else
            {
                srcCount += BuildOperandUses(intrin.op3, candidates);
            }

            if (intrin.op4 != nullptr)
            {
                assert(lowVectorOperandNum != 4);
                assert(!tgtPrefOp2);
                srcCount += isRMW ? BuildDelayFreeUses(intrin.op4, intrin.op1) : BuildOperandUses(intrin.op4);
            }
        }
    }

    buildInternalRegisterUses();

    // Only zero, one or two destination registers are expected on this path.
    if ((dstCount == 1) || (dstCount == 2))
    {
        BuildDef(intrinsicTree);

        if (dstCount == 2)
        {
            BuildDef(intrinsicTree, RBM_NONE, 1);
        }
    }
    else
    {
        assert(dstCount == 0);
    }

    *pDstCount = dstCount;
    return srcCount;
}