in common/checksum/sse2neon.h [7141:7183]
/* Conditional dot product of packed single-precision floats.
 *
 * The four lane products a[i] * b[i] are computed, then:
 *   - imm bits 4..7 select which products (lanes 0..3) take part in the sum;
 *     an unselected product contributes 0.0f.
 *   - imm bits 0..3 select which result lanes (0..3) receive the sum; every
 *     other result lane is 0.0f.
 *
 * The sum is always associated pairwise, (p0 + p1) + (p2 + p3), so the
 * rounding of the result does not depend on which code path below is taken.
 */
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
{
    float32x4_t elementwise_prod = _mm_mul_ps(a, b);

#if defined(__aarch64__) || defined(_M_ARM64)
    /* Shortcut: every product selected and every result lane written. */
    if (imm == 0xFF) {
        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
    }

    /* Shortcut: every result lane written. Zero the unselected products so a
     * single horizontal add yields the conditional sum. */
    if ((imm & 0x0F) == 0x0F) {
        if (!(imm & (1 << 4))) {
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
        }
        if (!(imm & (1 << 5))) {
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
        }
        if (!(imm & (1 << 6))) {
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
        }
        if (!(imm & (1 << 7))) {
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
        }
        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
    }
#endif

    /* Generic path: replace unselected products with 0.0f ... */
    const float p0 =
        (imm & (1 << 4)) ? vgetq_lane_f32(elementwise_prod, 0) : 0.0f;
    const float p1 =
        (imm & (1 << 5)) ? vgetq_lane_f32(elementwise_prod, 1) : 0.0f;
    const float p2 =
        (imm & (1 << 6)) ? vgetq_lane_f32(elementwise_prod, 2) : 0.0f;
    const float p3 =
        (imm & (1 << 7)) ? vgetq_lane_f32(elementwise_prod, 3) : 0.0f;

    /* ... and add them pairwise rather than left-to-right, so this path
     * rounds (and signs a zero result) the same way as the horizontal-add
     * shortcuts above. */
    const float s = (p0 + p1) + (p2 + p3);

    /* Broadcast the sum only to the result lanes requested by imm[3:0]. */
    const float32_t res[4] = {
        (imm & 0x1) ? s : 0.0f,
        (imm & 0x2) ? s : 0.0f,
        (imm & 0x4) ? s : 0.0f,
        (imm & 0x8) ? s : 0.0f,
    };
    return vreinterpretq_m128_f32(vld1q_f32(res));
}