FORCE_INLINE __m128d _mm_dp_pd()

in common/checksum/sse2neon.h [7083:7135]


// Conditionally multiply the packed double-precision (64-bit) floating-point
// elements in a and b, sum the selected products, and conditionally store the
// sum in each lane of the result, all controlled by imm.
//
// imm bits consumed by this implementation:
//   bit 4 (0x10): include a[0] * b[0] in the sum (otherwise contributes 0)
//   bit 5 (0x20): include a[1] * b[1] in the sum (otherwise contributes 0)
//   bit 0 (0x01): write the sum to lane 0 of the result (otherwise 0)
//   bit 1 (0x02): write the sum to lane 1 of the result (otherwise 0)
//
// When SSE2NEON_PRECISE_DP is non-zero a product is only computed if its
// selector bit is set; otherwise both products are always computed and the
// unselected ones are masked to zero afterwards.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
{
    // Per-lane output masks: all bits set (-1) or all bits clear. -1 is used
    // instead of UINT64_MAX so that initialising a signed int64_t is
    // value-preserving rather than an implementation-defined out-of-range
    // conversion (and does not trip sign-conversion warnings).
    const int64_t bit0Mask = (imm & 0x01) ? -1 : 0;
    const int64_t bit1Mask = (imm & 0x02) ? -1 : 0;

    // Conditional multiplication
#if !SSE2NEON_PRECISE_DP
    const int64_t bit4Mask = (imm & 0x10) ? -1 : 0;
    const int64_t bit5Mask = (imm & 0x20) ? -1 : 0;
    // Multiply both lanes unconditionally, then clear the unselected products
    // with a bitwise AND.
    __m128d mul = _mm_mul_pd(a, b);
    const __m128d mulMask =
        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
    __m128d tmp = _mm_and_pd(mul, mulMask);
#else
    // Only perform the multiplications that imm actually selects.
#if defined(__aarch64__) || defined(_M_ARM64)
    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
                             : 0.0;
    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
                             : 0.0;
#else
    // No f64 lane access on this target: pull each lane out as a 64-bit
    // integer and recast it to double.
    double a0 =
        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
    double a1 =
        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
    double b0 =
        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
    double b1 =
        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
    double d0 = (imm & 0x10) ? a0 * b0 : 0.0;
    double d1 = (imm & 0x20) ? a1 * b1 : 0.0;
#endif
    __m128d tmp = _mm_set_pd(d1, d0);
#endif

    // Sum the products
#if defined(__aarch64__) || defined(_M_ARM64)
    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
#else
    double tmp0 = sse2neon_recast_u64_f64(
        vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0));
    double tmp1 = sse2neon_recast_u64_f64(
        vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1));
    double sum = tmp0 + tmp1;
#endif

    // Conditionally store the sum: broadcast it to both lanes, then clear the
    // lanes whose imm bit is not set.
    const __m128d sumMask =
        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
    return _mm_and_pd(_mm_set_pd1(sum), sumMask);
}