in common/checksum/sse2neon.h [7083:7135]
// Conditionally multiply the packed double-precision lanes of a and b,
// sum the selected products, and conditionally store the sum.
// imm bits 4/5 select which lane products participate in the dot product;
// imm bits 0/1 select which result lanes receive the sum (others are 0).
FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
{
    // All-ones / all-zeros lane masks derived from the immediate.
    const int64_t storeLoMask = (imm & 0x01) ? UINT64_MAX : 0;
    const int64_t storeHiMask = (imm & 0x02) ? UINT64_MAX : 0;
#if !SSE2NEON_PRECISE_DP
    const int64_t mulLoMask = (imm & 0x10) ? UINT64_MAX : 0;
    const int64_t mulHiMask = (imm & 0x20) ? UINT64_MAX : 0;
#endif
    // Conditional multiplication
#if !SSE2NEON_PRECISE_DP
    // Fast path: multiply both lanes, then bitwise-mask away the
    // products that imm bits 4/5 did not select.
    const __m128d prodMask =
        _mm_castsi128_pd(_mm_set_epi64x(mulHiMask, mulLoMask));
    __m128d masked = _mm_and_pd(_mm_mul_pd(a, b), prodMask);
#else
#if defined(__aarch64__) || defined(_M_ARM64)
    // Precise path (AArch64): only compute the products that are
    // actually selected, so unselected lanes cannot raise FP effects.
    double prod0 = 0;
    double prod1 = 0;
    if (imm & 0x10)
        prod0 = vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
                vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0);
    if (imm & 0x20)
        prod1 = vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
                vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1);
#else
    // Precise path (ARMv7): extract lanes through u64 and recast to
    // double, since no f64 vector view is available here.
    double lhs0 =
        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
    double lhs1 =
        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
    double rhs0 =
        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
    double rhs1 =
        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
    double prod0 = (imm & 0x10) ? lhs0 * rhs0 : 0;
    double prod1 = (imm & 0x20) ? lhs1 * rhs1 : 0;
#endif
    __m128d masked = _mm_set_pd(prod1, prod0);
#endif
    // Sum the (masked) products into a scalar.
#if defined(__aarch64__) || defined(_M_ARM64)
    double dot = vpaddd_f64(vreinterpretq_f64_m128d(masked));
#else
    double dot = sse2neon_recast_u64_f64(
                     vgetq_lane_u64(vreinterpretq_u64_m128d(masked), 0)) +
                 sse2neon_recast_u64_f64(
                     vgetq_lane_u64(vreinterpretq_u64_m128d(masked), 1));
#endif
    // Broadcast the sum, then zero the lanes not selected by imm bits 0/1.
    const __m128d storeMask =
        _mm_castsi128_pd(_mm_set_epi64x(storeHiMask, storeLoMask));
    return _mm_and_pd(_mm_set_pd1(dot), storeMask);
}