inline XMVECTOR XM_CALLCONV XMVectorATan()

in Inc/DirectXMathVector.inl [4915:5028]
88 lines of code
1 McCabe index (conditional complexity)

inline XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept
{
    // 17-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 Result = { { {
            atanf(V.vector4_f32[0]),
            atanf(V.vector4_f32[1]),
            atanf(V.vector4_f32[2]),
            atanf(V.vector4_f32[3])
        } } };
    return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float32x4_t absV = vabsq_f32(V);
    float32x4_t invV = XMVectorReciprocal(V);
    uint32x4_t comp = vcgtq_f32(V, g_XMOne);
    float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
    comp = vcleq_f32(absV, g_XMOne);
    sign = vbslq_f32(comp, g_XMZero, sign);
    float32x4_t x = vbslq_f32(comp, V, invV);

    float32x4_t x2 = vmulq_f32(x, x);

    // Compute polynomial approximation
    const XMVECTOR TC1 = g_XMATanCoefficients1;
    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0);
    XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(TC1), 1);

    vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    const XMVECTOR TC0 = g_XMATanCoefficients0;
    vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    Result = vmulq_f32(Result, x);

    float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi);
    result1 = vsubq_f32(result1, Result);

    comp = vceqq_f32(sign, g_XMZero);
    Result = vbslq_f32(comp, Result, result1);
    return Result;
#elif defined(_XM_SVML_INTRINSICS_)
    XMVECTOR Result = _mm_atan_ps(V);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 absV = XMVectorAbs(V);
    __m128 invV = _mm_div_ps(g_XMOne, V);
    __m128 comp = _mm_cmpgt_ps(V, g_XMOne);
    __m128 select0 = _mm_and_ps(comp, g_XMOne);
    __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    __m128 sign = _mm_or_ps(select0, select1);
    comp = _mm_cmple_ps(absV, g_XMOne);
    select0 = _mm_and_ps(comp, g_XMZero);
    select1 = _mm_andnot_ps(comp, sign);
    sign = _mm_or_ps(select0, select1);
    select0 = _mm_and_ps(comp, V);
    select1 = _mm_andnot_ps(comp, invV);
    __m128 x = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation
    const XMVECTOR TC1 = g_XMATanCoefficients1;
    __m128 vConstantsB = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(3, 3, 3, 3));
    __m128 vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(2, 2, 2, 2));
    __m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);

    vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(1, 1, 1, 1));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(0, 0, 0, 0));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    const XMVECTOR TC0 = g_XMATanCoefficients0;
    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(3, 3, 3, 3));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(2, 2, 2, 2));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(1, 1, 1, 1));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(0, 0, 0, 0));
    Result = XM_FMADD_PS(Result, x2, vConstants);

    Result = XM_FMADD_PS(Result, x2, g_XMOne);

    Result = _mm_mul_ps(Result, x);
    __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
    result1 = _mm_sub_ps(result1, Result);

    comp = _mm_cmpeq_ps(sign, g_XMZero);
    select0 = _mm_and_ps(comp, Result);
    select1 = _mm_andnot_ps(comp, result1);
    Result = _mm_or_ps(select0, select1);
    return Result;
#endif
}