int LinearScan::BuildHWIntrinsic()

in src/coreclr/jit/lsraxarch.cpp [2094:2882]


//------------------------------------------------------------------------
// BuildHWIntrinsic: Build the register uses, internal registers and defs
//                   for a hardware intrinsic node.
//
// Arguments:
//    intrinsicTree - The hardware intrinsic node of interest
//    pDstCount     - OUT parameter (must not be null); receives the number of
//                    registers defined by the node (0, 1, or the multi-reg count)
//
// Return Value:
//    The source count (srcCount) accumulated from the use-building calls.
//
// Notes:
//    Intrinsics with special register requirements build their own uses inside
//    the switch on the intrinsic id and clear 'buildUses'; every other intrinsic
//    falls through to the generic use-building code that follows the switch.
//
int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount)
{
    assert(pDstCount != nullptr);

    NamedIntrinsic      intrinsicId = intrinsicTree->GetHWIntrinsicId();
    var_types           baseType    = intrinsicTree->GetSimdBaseType();
    size_t              numArgs     = intrinsicTree->GetOperandCount();
    HWIntrinsicCategory category    = HWIntrinsicInfo::lookupCategory(intrinsicId);

    // Set the AVX Flags if this instruction may use VEX encoding for SIMD operations.
    // Note that this may be true even if the ISA is not AVX (e.g. for platform-agnostic intrinsics
    // or non-AVX intrinsics that will use VEX encoding if it is available on the target).
    if (intrinsicTree->isSIMD())
    {
        SetContainsAVXFlags(intrinsicTree->GetSimdSize());
    }

    int srcCount = 0;
    int dstCount;

    // Determine how many registers this node defines: none when it does not produce a value,
    // the multi-reg count for multi-reg intrinsics, and one otherwise.
    if (intrinsicTree->IsValue())
    {
        if (HWIntrinsicInfo::IsMultiReg(intrinsicId))
        {
            dstCount = HWIntrinsicInfo::GetMultiRegCount(intrinsicId);
        }
        else
        {
            dstCount = 1;
        }
    }
    else
    {
        dstCount = 0;
    }

    // Register candidates for the single-def case; RBM_NONE unless narrowed below.
    SingleTypeRegSet dstCandidates = RBM_NONE;

    if (intrinsicTree->GetOperandCount() == 0)
    {
        // No operands, so there are no uses or internal registers to build.
        assert(numArgs == 0);
    }
    else
    {
        // A contained CreateScalarUnsafe is special in that we're not containing it to load from
        // memory and it isn't a constant. Instead, it's essentially a "transparent" node we're ignoring
        // to simplify the overall IR handling. As such, we need to "skip" such nodes when present and
        // get the underlying op1 so that delayFreeUse and other preferencing remains correct.

        GenTree* op1    = nullptr;
        GenTree* op2    = nullptr;
        GenTree* op3    = nullptr;
        GenTree* op4    = nullptr;
        GenTree* op5    = nullptr;
        GenTree* lastOp = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(numArgs));

        // Fetch operands 1..numArgs. The cases deliberately fall through from the highest
        // operand down to op1; operands beyond numArgs stay nullptr.
        switch (numArgs)
        {
            case 5:
            {
                op5 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(5));
                FALLTHROUGH;
            }

            case 4:
            {
                op4 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(4));
                FALLTHROUGH;
            }

            case 3:
            {
                op3 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(3));
                FALLTHROUGH;
            }

            case 2:
            {
                op2 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(2));
                FALLTHROUGH;
            }

            case 1:
            {
                op1 = SkipContainedCreateScalarUnsafe(intrinsicTree->Op(1));
                break;
            }

            default:
            {
                unreached();
            }
        }

        // Cleared by any case below that builds its own uses, which suppresses the generic
        // use-building code after the switch.
        bool buildUses = true;

        if ((category == HW_Category_IMM) && !HWIntrinsicInfo::NoJmpTableImm(intrinsicId))
        {
            if (HWIntrinsicInfo::isImmOp(intrinsicId, lastOp) && !lastOp->isContainedIntOrIImmed())
            {
                assert(!lastOp->IsCnsIntOrI());

                // We need two extra registers when lastOp isn't a constant so
                // the offset into the jump table for the fallback path
                // can be computed.
                buildInternalIntRegisterDefForNode(intrinsicTree);
                buildInternalIntRegisterDefForNode(intrinsicTree);
            }
        }

        // Embedded rounding with a non-constant last operand also reserves two internal
        // integer registers.
        if (intrinsicTree->OperIsEmbRoundingEnabled() && !lastOp->IsCnsIntOrI())
        {
            buildInternalIntRegisterDefForNode(intrinsicTree);
            buildInternalIntRegisterDefForNode(intrinsicTree);
        }

        // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it
        // is not allocated the same register as the target.
        bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler);
#if defined(TARGET_AMD64)
        bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic();
#endif // TARGET_AMD64

        // Create internal temps, and handle any other special requirements.
        // Note that the default case for building uses will handle the RMW flag, but if the uses
        // are built in the individual cases, buildUses is set to false, and any RMW handling (delayFree)
        // must be handled within the case.
        switch (intrinsicId)
        {
            case NI_Vector128_CreateScalarUnsafe:
            case NI_Vector128_ToScalar:
            case NI_Vector256_CreateScalarUnsafe:
            case NI_Vector256_ToScalar:
            case NI_Vector512_CreateScalarUnsafe:
            case NI_Vector512_ToScalar:
            {
                assert(numArgs == 1);

                // Only floating-point base types are special-cased here; other base types
                // use the generic use-building code below.
                if (varTypeIsFloating(baseType))
                {
                    if (op1->isContained())
                    {
                        srcCount += BuildOperandUses(op1);
                    }
                    else
                    {
                        // We will either be in memory and need to be moved
                        // into a register of the appropriate size or we
                        // are already in an XMM/YMM/ZMM register and can stay
                        // where we are.

                        tgtPrefUse = BuildUse(op1);
                        srcCount += 1;
                    }

                    buildUses = false;
                }
                break;
            }

            case NI_Vector128_GetElement:
            case NI_Vector256_GetElement:
            case NI_Vector512_GetElement:
            {
                assert(numArgs == 2);

                if (!op2->OperIsConst() && !op1->isContained())
                {
                    // If the index is not a constant and op1 is not contained (i.e. it is in a
                    // register), we will use the SIMD temp location to store the vector.
                    // NOTE(review): the returned temp number is unused here; presumably the call
                    // just ensures the temp exists - confirm.

                    var_types requiredSimdTempType = Compiler::getSIMDTypeForSize(intrinsicTree->GetSimdSize());
                    compiler->getSIMDInitTempVarNum(requiredSimdTempType);
                }
                break;
            }

            case NI_Vector128_AsVector128Unsafe:
            case NI_Vector128_AsVector2:
            case NI_Vector128_AsVector3:
            case NI_Vector128_ToVector256:
            case NI_Vector128_ToVector512:
            case NI_Vector256_ToVector512:
            case NI_Vector128_ToVector256Unsafe:
            case NI_Vector256_ToVector512Unsafe:
            case NI_Vector256_GetLower:
            case NI_Vector512_GetLower:
            case NI_Vector512_GetLower128:
            {
                assert(numArgs == 1);

                if (op1->isContained())
                {
                    srcCount += BuildOperandUses(op1);
                }
                else
                {
                    // We will either be in memory and need to be moved
                    // into a register of the appropriate size or we
                    // are already in an XMM/YMM/ZMM register and can stay
                    // where we are.

                    tgtPrefUse = BuildUse(op1);
                    srcCount += 1;
                }

                buildUses = false;
                break;
            }

            case NI_SSE2_MaskMove:
            {
                assert(numArgs == 3);
                assert(!isRMW);

                // MaskMove hardcodes the destination (op3) in DI/EDI/RDI
                srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1));
                srcCount += BuildOperandUses(op2, BuildEvexIncompatibleMask(op2));
                srcCount += BuildOperandUses(op3, SRBM_EDI);

                buildUses = false;
                break;
            }

            case NI_SSE41_BlendVariable:
            {
                assert(numArgs == 3);

                // Only the non-VEX encoding needs special handling; with VEX available the
                // generic use-building code below is used.
                if (!compiler->canUseVexEncoding())
                {
                    assert(isRMW);

                    // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0
                    tgtPrefUse = BuildUse(op1, BuildEvexIncompatibleMask(op1));

                    srcCount += 1;
                    srcCount += op2->isContained() ? BuildOperandUses(op2, BuildEvexIncompatibleMask(op2))
                                                   : BuildDelayFreeUses(op2, op1, BuildEvexIncompatibleMask(op2));
                    srcCount += BuildDelayFreeUses(op3, op1, SRBM_XMM0);

                    buildUses = false;
                }
                break;
            }

            case NI_SSE41_Extract:
            {
                assert(!varTypeIsFloating(baseType));

#ifdef TARGET_X86
                // On x86, a byte-sized result restricts the destination to the byte registers.
                if (varTypeIsByte(baseType))
                {
                    dstCandidates = allByteRegs();
                }
#endif
                break;
            }

#ifdef TARGET_X86
            case NI_SSE42_Crc32:
            case NI_SSE42_X64_Crc32:
            {
                // TODO-XArch-Cleanup: Currently we use the BaseType to bring the type of the second argument
                // to the code generator. We may want to encode the overload info in another way.

                assert(numArgs == 2);
                assert(isRMW);

                // CRC32 may operate over "byte" but on x86 only RBM_BYTE_REGS can be used as byte registers.
                tgtPrefUse = BuildUse(op1);

                srcCount += 1;
                srcCount += BuildDelayFreeUses(op2, op1, varTypeIsByte(baseType) ? allByteRegs() : RBM_NONE);

                buildUses = false;
                break;
            }
#endif // TARGET_X86

            case NI_X86Base_DivRem:
            case NI_X86Base_X64_DivRem:
            {
                assert(numArgs == 3);
                assert(dstCount == 2);
                assert(isRMW);

                // DIV implicitly takes op1 (lower) in EAX and op2 (upper) in EDX
                srcCount += BuildOperandUses(op1, SRBM_EAX);
                srcCount += BuildOperandUses(op2, SRBM_EDX);

                if (!op3->isContained())
                {
                    // For non-contained nodes, we want to make sure we delay free the register for
                    // op3 with respect to both op1 and op2. In other words, op3 shouldn't get same
                    // register that is assigned to either of op1 and op2.

                    RefPosition* op3RefPosition;
                    srcCount += BuildDelayFreeUses(op3, op1, RBM_NONE, &op3RefPosition);
                    if ((op3RefPosition != nullptr) && !op3RefPosition->delayRegFree)
                    {
                        // If op3 was not marked as delay-free for op1, mark it as delay-free
                        // if needed for op2.
                        AddDelayFreeUses(op3RefPosition, op2);
                    }
                }
                else
                {
                    srcCount += BuildOperandUses(op3);
                }

                // The results are put in EAX (def index 0) and EDX (def index 1)
                BuildDef(intrinsicTree, SRBM_EAX, 0);
                BuildDef(intrinsicTree, SRBM_EDX, 1);

                buildUses = false;
                break;
            }

            case NI_BMI2_MultiplyNoFlags:
            case NI_BMI2_X64_MultiplyNoFlags:
            {
                assert(numArgs == 2 || numArgs == 3);
                // op1 is constrained to EDX; op2 has no register constraint.
                srcCount += BuildOperandUses(op1, SRBM_EDX);
                srcCount += BuildOperandUses(op2);
                if (numArgs == 3)
                {
                    // op3 reg should be different from target reg to
                    // store the lower half result after executing the instruction
                    srcCount += BuildDelayFreeUses(op3, op1);
                    // Need an internal register different from the dst to take the lower half result
                    buildInternalIntRegisterDefForNode(intrinsicTree);
                    setInternalRegsDelayFree = true;
                }
                buildUses = false;
                break;
            }

            case NI_FMA_MultiplyAdd:
            case NI_FMA_MultiplyAddNegated:
            case NI_FMA_MultiplyAddNegatedScalar:
            case NI_FMA_MultiplyAddScalar:
            case NI_FMA_MultiplyAddSubtract:
            case NI_FMA_MultiplySubtract:
            case NI_FMA_MultiplySubtractAdd:
            case NI_FMA_MultiplySubtractNegated:
            case NI_FMA_MultiplySubtractNegatedScalar:
            case NI_FMA_MultiplySubtractScalar:
            case NI_AVX512F_FusedMultiplyAdd:
            case NI_AVX512F_FusedMultiplyAddScalar:
            case NI_AVX512F_FusedMultiplyAddNegated:
            case NI_AVX512F_FusedMultiplyAddNegatedScalar:
            case NI_AVX512F_FusedMultiplyAddSubtract:
            case NI_AVX512F_FusedMultiplySubtract:
            case NI_AVX512F_FusedMultiplySubtractScalar:
            case NI_AVX512F_FusedMultiplySubtractAdd:
            case NI_AVX512F_FusedMultiplySubtractNegated:
            case NI_AVX512F_FusedMultiplySubtractNegatedScalar:
            case NI_AVX10v1_FusedMultiplyAddNegatedScalar:
            case NI_AVX10v1_FusedMultiplyAddScalar:
            case NI_AVX10v1_FusedMultiplySubtractNegatedScalar:
            case NI_AVX10v1_FusedMultiplySubtractScalar:
            {
                assert((numArgs == 3) || (intrinsicTree->OperIsEmbRoundingEnabled()));
                assert(isRMW);
                assert(HWIntrinsicInfo::IsFmaIntrinsic(intrinsicId));

                const bool copiesUpperBits = HWIntrinsicInfo::CopiesUpperBits(intrinsicId);

                // Look up the user of this node in the current block's LIR range, if there is one;
                // 'user' stays nullptr otherwise.
                LIR::Use use;
                GenTree* user = nullptr;

                if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(intrinsicTree, &use))
                {
                    user = use.User();
                }
                unsigned resultOpNum = intrinsicTree->GetResultOpNumForRmwIntrinsic(user, op1, op2, op3);

                unsigned containedOpNum = 0;

                // containedOpNum remains 0 when no operand is contained or regOptional
                if (op1->isContained() || op1->IsRegOptional())
                {
                    containedOpNum = 1;
                }
                else if (op2->isContained() || op2->IsRegOptional())
                {
                    containedOpNum = 2;
                }
                else if (op3->isContained() || op3->IsRegOptional())
                {
                    containedOpNum = 3;
                }

                // emitOp1 is the operand treated as the target (it gets tgtPrefUse below);
                // emitOp2 and emitOp3 are the remaining sources after any swapping.
                GenTree* emitOp1 = op1;
                GenTree* emitOp2 = op2;
                GenTree* emitOp3 = op3;

                // Intrinsics with CopyUpperBits semantics must have op1 as target
                assert(containedOpNum != 1 || !copiesUpperBits);

                // We need to keep this in sync with hwintrinsiccodegenxarch.cpp
                // Ideally we'd actually swap the operands here and simplify codegen
                // but it's a bit more complicated to do so for many operands as well
                // as being complicated to tell codegen how to pick the right instruction

                if (containedOpNum == 1)
                {
                    // https://github.com/dotnet/runtime/issues/62215
                    // resultOpNum might change between lowering and lsra, comment out assertion for now.
                    // assert(containedOpNum != resultOpNum);
                    // resultOpNum is 3 or 0: op3/? = ([op1] * op2) + op3
                    std::swap(emitOp1, emitOp3);

                    if (resultOpNum == 2)
                    {
                        // op2 = ([op1] * op2) + op3
                        std::swap(emitOp1, emitOp2);
                    }
                }
                else if (containedOpNum == 3)
                {
                    // assert(containedOpNum != resultOpNum);
                    if (resultOpNum == 2 && !copiesUpperBits)
                    {
                        // op2 = (op1 * op2) + [op3]
                        std::swap(emitOp1, emitOp2);
                    }
                    // else: op1/? = (op1 * op2) + [op3]
                }
                else if (containedOpNum == 2)
                {
                    // assert(containedOpNum != resultOpNum);

                    // op1/? = (op1 * [op2]) + op3
                    std::swap(emitOp2, emitOp3);
                    if (resultOpNum == 3 && !copiesUpperBits)
                    {
                        // op3 = (op1 * [op2]) + op3
                        std::swap(emitOp1, emitOp2);
                    }
                }
                else
                {
                    // containedOpNum == 0
                    // no extra work when resultOpNum is 0 or 1
                    if (resultOpNum == 2)
                    {
                        std::swap(emitOp1, emitOp2);
                    }
                    else if (resultOpNum == 3)
                    {
                        std::swap(emitOp1, emitOp3);
                    }
                }

                // Build the uses in the original operand order (op1, op2, op3), giving each
                // operand the handling for the emit role it was assigned above.
                GenTree* ops[] = {op1, op2, op3};
                for (GenTree* op : ops)
                {
                    if (op == emitOp1)
                    {
                        tgtPrefUse = BuildUse(op);
                        srcCount++;
                    }
                    else if (op == emitOp2)
                    {
                        srcCount += BuildDelayFreeUses(op, emitOp1);
                    }
                    else if (op == emitOp3)
                    {
                        srcCount += op->isContained() ? BuildOperandUses(op) : BuildDelayFreeUses(op, emitOp1);
                    }
                }

                // With embedded rounding, a non-constant fourth operand is also a use.
                if (intrinsicTree->OperIsEmbRoundingEnabled() && !intrinsicTree->Op(4)->IsCnsIntOrI())
                {
                    srcCount += BuildOperandUses(intrinsicTree->Op(4));
                }

                buildUses = false;
                break;
            }

            case NI_EVEX_BlendVariableMask:
            {
                assert(numArgs == 3);

                // Only the embedded-mask form is special-cased; otherwise the generic
                // use-building code below is used.
                if (op2->IsEmbMaskOp())
                {
                    // TODO-AVX512-CQ: Ensure we can support embedded operations on RMW intrinsics
                    assert(!op2->isRMWHWIntrinsic(compiler));

                    if (isRMW)
                    {
                        assert(!op1->isContained());

                        tgtPrefUse = BuildUse(op1);
                        srcCount += 1;

                        assert(op2->isContained());

                        // op2 itself is contained, so build delay-free uses for each of its operands.
                        for (GenTree* operand : op2->AsHWIntrinsic()->Operands())
                        {
                            assert(varTypeIsSIMD(operand) || varTypeIsInt(operand));
                            srcCount += BuildDelayFreeUses(operand, op1);
                        }
                    }
                    else
                    {
                        assert(op1->isContained() && op1->IsVectorZero());
                        srcCount += BuildOperandUses(op1);

                        assert(op2->isContained());

                        // op2 itself is contained, so build ordinary uses for each of its operands.
                        for (GenTree* operand : op2->AsHWIntrinsic()->Operands())
                        {
                            assert(varTypeIsSIMD(operand) || varTypeIsInt(operand));
                            srcCount += BuildOperandUses(operand);
                        }
                    }

                    assert(!op3->isContained());
                    srcCount += BuildOperandUses(op3);

                    buildUses = false;
                }
                break;
            }

            case NI_AVX512F_PermuteVar8x64x2:
            case NI_AVX512F_PermuteVar16x32x2:
            case NI_AVX512F_VL_PermuteVar2x64x2:
            case NI_AVX512F_VL_PermuteVar4x32x2:
            case NI_AVX512F_VL_PermuteVar4x64x2:
            case NI_AVX512F_VL_PermuteVar8x32x2:
            case NI_AVX512BW_PermuteVar32x16x2:
            case NI_AVX512BW_VL_PermuteVar8x16x2:
            case NI_AVX512BW_VL_PermuteVar16x16x2:
            case NI_AVX512VBMI_PermuteVar64x8x2:
            case NI_AVX512VBMI_VL_PermuteVar16x8x2:
            case NI_AVX512VBMI_VL_PermuteVar32x8x2:
            case NI_AVX10v1_PermuteVar16x8x2:
            case NI_AVX10v1_PermuteVar2x64x2:
            case NI_AVX10v1_PermuteVar4x32x2:
            case NI_AVX10v1_PermuteVar8x16x2:
            case NI_AVX10v1_PermuteVar32x8x2:
            case NI_AVX10v1_PermuteVar4x64x2:
            case NI_AVX10v1_PermuteVar8x32x2:
            case NI_AVX10v1_PermuteVar16x16x2:
            case NI_AVX10v1_V512_PermuteVar64x8x2:
            {
                assert(numArgs == 3);
                assert(isRMW);
                assert(HWIntrinsicInfo::IsPermuteVar2x(intrinsicId));

                // Look up the user of this node in the current block's LIR range, if there is one;
                // 'user' stays nullptr otherwise.
                LIR::Use use;
                GenTree* user = nullptr;

                if (LIR::AsRange(blockSequence[curBBSeqNum]).TryGetUse(intrinsicTree, &use))
                {
                    user = use.User();
                }
                unsigned resultOpNum = intrinsicTree->GetResultOpNumForRmwIntrinsic(user, op1, op2, op3);

                assert(!op1->isContained());
                assert(!op2->isContained());

                GenTree* emitOp1 = op1;
                GenTree* emitOp2 = op2;
                GenTree* emitOp3 = op3;

                // When the result operand is op2, op2 becomes the target instead of op1.
                if (resultOpNum == 2)
                {
                    std::swap(emitOp1, emitOp2);
                }

                // Build the uses in the original operand order (op1, op2, op3), giving each
                // operand the handling for the emit role it was assigned above.
                GenTree* ops[] = {op1, op2, op3};
                for (GenTree* op : ops)
                {
                    if (op == emitOp1)
                    {
                        tgtPrefUse = BuildUse(op);
                        srcCount++;
                    }
                    else if (op == emitOp2)
                    {
                        srcCount += BuildDelayFreeUses(op, emitOp1);
                    }
                    else if (op == emitOp3)
                    {
                        srcCount += op->isContained() ? BuildOperandUses(op) : BuildDelayFreeUses(op, emitOp1);
                    }
                }

                buildUses = false;
                break;
            }

            case NI_AVXVNNI_MultiplyWideningAndAdd:
            case NI_AVXVNNI_MultiplyWideningAndAddSaturate:
            {
                assert(numArgs == 3);

                // op1 is the target preference; op2 (and op3 when not contained) are delay-free
                // with respect to op1.
                tgtPrefUse = BuildUse(op1);
                srcCount += 1;
                srcCount += BuildDelayFreeUses(op2, op1);
                srcCount += op3->isContained() ? BuildOperandUses(op3) : BuildDelayFreeUses(op3, op1);

                buildUses = false;
                break;
            }

            case NI_AVX2_GatherVector128:
            case NI_AVX2_GatherVector256:
            {
                assert(numArgs == 3);
                assert(!isRMW);

                // Any pair of the index, mask, or destination registers should be different
                srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1));
                srcCount += BuildDelayFreeUses(op2, nullptr, BuildEvexIncompatibleMask(op2));

                // op3 should always be contained
                assert(op3->isContained());

                // get a tmp register for mask that will be cleared by gather instructions
                buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs());
                setInternalRegsDelayFree = true;

                buildUses = false;
                break;
            }

            case NI_AVX2_GatherMaskVector128:
            case NI_AVX2_GatherMaskVector256:
            {
                assert(!isRMW);

                // Any pair of the index, mask, or destination registers should be different
                srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1));
                srcCount += BuildDelayFreeUses(op2, nullptr, BuildEvexIncompatibleMask(op2));
                srcCount += BuildDelayFreeUses(op3, nullptr, BuildEvexIncompatibleMask(op3));
                srcCount += BuildDelayFreeUses(op4, nullptr, BuildEvexIncompatibleMask(op4));

                // op5 should always be contained
                assert(op5->isContained());

                // get a tmp register for mask that will be cleared by gather instructions
                buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs());
                setInternalRegsDelayFree = true;

                buildUses = false;
                break;
            }

            default:
            {
                // FMA and PermuteVar2x intrinsics must be handled by their dedicated cases above.
                assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END));
                assert(!HWIntrinsicInfo::IsFmaIntrinsic(intrinsicId));
                assert(!HWIntrinsicInfo::IsPermuteVar2x(intrinsicId));
                break;
            }
        }

        // Generic use building for every intrinsic that did not build its own uses above.
        // On AMD64, operands of intrinsics that are not EVEX compatible have their candidates
        // restricted via BuildEvexIncompatibleMask.
        if (buildUses)
        {
            SingleTypeRegSet op1RegCandidates = RBM_NONE;

#if defined(TARGET_AMD64)
            if (!isEvexCompatible)
            {
                op1RegCandidates = BuildEvexIncompatibleMask(op1);
            }
#endif // TARGET_AMD64

            if (intrinsicTree->OperIsMemoryLoadOrStore())
            {
                // For memory loads/stores, op1 is built as an address.
                srcCount += BuildAddrUses(op1, op1RegCandidates);
            }
            else if (isRMW && !op1->isContained())
            {
                // For RMW intrinsics, a non-contained op1 is the target preference.
                tgtPrefUse = BuildUse(op1, op1RegCandidates);
                srcCount += 1;
            }
            else
            {
                srcCount += BuildOperandUses(op1, op1RegCandidates);
            }

            if (op2 != nullptr)
            {
                SingleTypeRegSet op2RegCandidates = RBM_NONE;

#if defined(TARGET_AMD64)
                if (!isEvexCompatible)
                {
                    op2RegCandidates = BuildEvexIncompatibleMask(op2);
                }
#endif // TARGET_AMD64

                if (op2->OperIs(GT_HWINTRINSIC) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained())
                {
                    // A contained memory-load intrinsic contributes the uses of its address operand.
                    srcCount += BuildAddrUses(op2->AsHWIntrinsic()->Op(1), op2RegCandidates);
                }
                else if (isRMW)
                {
                    if (!op2->isContained() && intrinsicTree->isCommutativeHWIntrinsic())
                    {
                        // When op2 is not contained and we are commutative, we can set op2
                        // to also be a tgtPrefUse. Codegen will then swap the operands.

                        tgtPrefUse2 = BuildUse(op2, op2RegCandidates);
                        srcCount += 1;
                    }
                    else if (!op2->isContained() || varTypeIsArithmetic(intrinsicTree->TypeGet()))
                    {
                        // When op2 is not contained or if we are producing a scalar value
                        // we need to mark it as delay free because the operand and target
                        // exist in the same register set.
                        srcCount += BuildDelayFreeUses(op2, op1, op2RegCandidates);
                    }
                    else
                    {
                        // When op2 is contained and we are not producing a scalar value we
                        // have no concerns of overwriting op2 because they exist in different
                        // register sets.

                        srcCount += BuildOperandUses(op2, op2RegCandidates);
                    }
                }
                else
                {
                    srcCount += BuildOperandUses(op2, op2RegCandidates);
                }

                if (op3 != nullptr)
                {
                    SingleTypeRegSet op3RegCandidates = RBM_NONE;

#if defined(TARGET_AMD64)
                    if (!isEvexCompatible)
                    {
                        op3RegCandidates = BuildEvexIncompatibleMask(op3);
                    }
#endif // TARGET_AMD64

                    // For RMW intrinsics, op3 is delay-free with respect to op1.
                    srcCount += isRMW ? BuildDelayFreeUses(op3, op1, op3RegCandidates)
                                      : BuildOperandUses(op3, op3RegCandidates);

                    if (op4 != nullptr)
                    {
                        SingleTypeRegSet op4RegCandidates = RBM_NONE;

                        // Four-operand intrinsics reaching this path are expected to be EVEX
                        // compatible, so op4's candidates are never restricted.
#if defined(TARGET_AMD64)
                        assert(isEvexCompatible);
#endif // TARGET_AMD64

                        srcCount += isRMW ? BuildDelayFreeUses(op4, op1, op4RegCandidates)
                                          : BuildOperandUses(op4, op4RegCandidates);
                    }
                }
            }
        }

        // Build the uses for any internal registers requested above.
        buildInternalRegisterUses();
    }

    if (dstCount == 1)
    {
#if defined(TARGET_AMD64)
        bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic();

        // Restrict the destination candidates for intrinsics that are not EVEX compatible.
        if (!isEvexCompatible)
        {
            dstCandidates = BuildEvexIncompatibleMask(intrinsicTree);
        }
#endif

        BuildDef(intrinsicTree, dstCandidates);
    }
    else
    {
        // Currently dstCount = 2 is only used for DivRem, which has special constraints and is handled above
        assert((dstCount == 0) ||
               ((dstCount == 2) && ((intrinsicId == NI_X86Base_DivRem) || (intrinsicId == NI_X86Base_X64_DivRem))));
    }

    *pDstCount = dstCount;
    return srcCount;
}