FORCE_INLINE __m128 _mm_round_ps()

in common/checksum/sse2neon.h [7619:7673]


/* Round each of the four packed single-precision lanes of `a` to an integral
 * value (still stored as float) according to `rounding`.
 *
 * Recognised values of `rounding`:
 *   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC  round to nearest, ties even
 *   _MM_FROUND_TO_NEG_INF     | _MM_FROUND_NO_EXC  delegate to _mm_floor_ps
 *   _MM_FROUND_TO_POS_INF     | _MM_FROUND_NO_EXC  delegate to _mm_ceil_ps
 *   _MM_FROUND_TO_ZERO        | _MM_FROUND_NO_EXC  truncate toward zero
 *   _MM_FROUND_CUR_DIRECTION                       follow the current mode
 *
 * NOTE(review): any other value is handled differently by the two code paths
 * below: the directed-rounding path treats it like _MM_FROUND_CUR_DIRECTION,
 * the fallback path truncates toward zero. Confirm whether callers rely on
 * either behaviour before unifying them.
 */
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    /* Directed-rounding instructions are available: one intrinsic per mode. */
    switch (rounding) {
    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
        return _mm_floor_ps(a);
    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
        return _mm_ceil_ps(a);
    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
    default:  //_MM_FROUND_CUR_DIRECTION
        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
    }
#else
    const float32x4_t v = vreinterpretq_f32_m128(a);

    if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
        (rounding == _MM_FROUND_CUR_DIRECTION &&
         _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
        const uint32x4_t signmask = vdupq_n_u32(0x80000000);
        const float32x4_t half = vdupq_n_f32(0.5f);

        /* Only lanes with |a| < 2^23 can carry a fractional part. Larger
         * magnitudes are already integral and, like Inf and NaN, must be
         * returned untouched: the int32 round trip below cannot represent
         * them. A NaN lane compares false here and is passed through too. */
        const uint32x4_t has_fraction =
            vcltq_f32(vabsq_f32(v), vdupq_n_f32(8388608.0f)); /* 2^23 */

        int32x4_t r_trunc = vcvtq_s32_f32(v); /* truncate to integer: [a] */
        /* |a - [a]| is exact for |a| < 2^23, so comparing it against 0.5
         * avoids the rounding error that computing a + 0.5 would introduce
         * (e.g. for the largest float below 0.5). */
        float32x4_t frac = vabsq_f32(vsubq_f32(v, vcvtq_f32_s32(r_trunc)));

        /* +1 for lanes with a clear sign bit, -1 for lanes with it set */
        int32x4_t step = vorrq_s32(vshrq_n_s32(vreinterpretq_s32_f32(v), 31),
                                   vdupq_n_s32(1));
        int32x4_t r_away = vaddq_s32(r_trunc, step); /* [a] +/- 1 */

        int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
            vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
        int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
                                     vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */

        /* fraction > 0.5: away from zero; == 0.5: even neighbour; else [a] */
        int32x4_t r = vbslq_s32(vcgtq_f32(frac, half), r_away, r_trunc);
        r = vbslq_s32(vceqq_f32(frac, half), r_even, r);

        /* Rounding to nearest never crosses zero, so re-applying the input
         * sign bit only matters for zero results: -0.4 yields -0.0. */
        float32x4_t rounded = vbslq_f32(signmask, v, vcvtq_f32_s32(r));

        return vreinterpretq_m128_f32(vbslq_f32(has_fraction, rounded, v));
    } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
        return _mm_floor_ps(a);
    } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
               (rounding == _MM_FROUND_CUR_DIRECTION &&
                _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
        return _mm_ceil_ps(a);
    }

    /* Every remaining case truncates toward zero, lane by lane: floorf for
     * positive lanes, ceilf for everything else. Lanes are read with
     * vgetq_lane_f32 instead of through a casted float pointer. */
    const float lane0 = vgetq_lane_f32(v, 0);
    const float lane1 = vgetq_lane_f32(v, 1);
    const float lane2 = vgetq_lane_f32(v, 2);
    const float lane3 = vgetq_lane_f32(v, 3);
    return _mm_set_ps(lane3 > 0 ? floorf(lane3) : ceilf(lane3),
                      lane2 > 0 ? floorf(lane2) : ceilf(lane2),
                      lane1 > 0 ? floorf(lane1) : ceilf(lane1),
                      lane0 > 0 ? floorf(lane0) : ceilf(lane0));
#endif
}