FORCE_INLINE __m128 _mm_dp_ps()

in common/checksum/sse2neon.h [7141:7183]


/* Dot product of packed single-precision floats (SSE4.1 DPPS emulation).
 *
 * The high nibble of imm selects which of the four a[i] * b[i] products take
 * part in the sum (bit 4 + i enables lane i; a disabled lane contributes
 * +0.0f). The low nibble selects which lanes of the result receive the sum
 * (bit i enables lane i; a disabled lane is set to 0.0f).
 *
 * The sum is always formed as (p0 + p1) + (p2 + p3), which is the order given
 * in Intel's DPPS pseudo-code. Keeping that association matters: float
 * addition is not associative, so a different order can change both the
 * rounding and the sign of a zero result.
 */
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
{
    /* All four products are computed up front; the masks are applied below. */
    float32x4_t elementwise_prod = _mm_mul_ps(a, b);

#if defined(__aarch64__) || defined(_M_ARM64)
    /* shortcuts: when every output lane is enabled the result is a plain
     * broadcast of the horizontal sum, so the across-vector add can be used.
     */
    if (imm == 0xFF) {
        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
    }

    if ((imm & 0x0F) == 0x0F) {
        /* Zero the products whose input-mask bit is clear before reducing. */
        if (!(imm & (1 << 4)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
        if (!(imm & (1 << 5)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
        if (!(imm & (1 << 6)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
        if (!(imm & (1 << 7)))
            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);

        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
    }
#endif

    /* Generic path: replace masked-out products by +0.0f instead of skipping
     * them, so the additions below see exactly the operands DPPS would.
     */
    const float p0 =
        (imm & (1 << 4)) ? vgetq_lane_f32(elementwise_prod, 0) : 0.0f;
    const float p1 =
        (imm & (1 << 5)) ? vgetq_lane_f32(elementwise_prod, 1) : 0.0f;
    const float p2 =
        (imm & (1 << 6)) ? vgetq_lane_f32(elementwise_prod, 2) : 0.0f;
    const float p3 =
        (imm & (1 << 7)) ? vgetq_lane_f32(elementwise_prod, 3) : 0.0f;

    /* Pairwise association, not a left-to-right running sum seeded with 0.0f:
     * the running sum rounds differently (e.g. products {1e8, 1, -1e8, 1})
     * and turns an all -0.0f input into +0.0f. This is also intended to agree
     * with the vaddvq_f32 reduction used by the shortcuts above (pairwise per
     * the Arm intrinsics reference).
     */
    const float s = (p0 + p1) + (p2 + p3);

    /* Broadcast the sum only to the lanes enabled in the low nibble. */
    const float32_t res[4] = {
        (imm & 0x1) ? s : 0.0f,
        (imm & 0x2) ? s : 0.0f,
        (imm & 0x4) ? s : 0.0f,
        (imm & 0x8) ? s : 0.0f,
    };
    return vreinterpretq_m128_f32(vld1q_f32(res));
}