in Inc/DirectXMathVector.inl [4915:5028]
// Computes the arctangent of each of the four components of V.
//
// Parameters:
//   V - input vector; every component is processed independently.
// Returns:
//   A vector holding the arctangent of the matching component of V
//   (radians, as with the atanf calls in the scalar path).
//
// One implementation is selected at compile time:
//   _XM_NO_INTRINSICS_       - calls atanf on each component.
//   _XM_ARM_NEON_INTRINSICS_ - polynomial approximation using NEON intrinsics.
//   _XM_SVML_INTRINSICS_     - forwards to _mm_atan_ps.
//   _XM_SSE_INTRINSICS_      - polynomial approximation using SSE intrinsics.
//
// The NEON and SSE paths share the same scheme:
//   1. Range reduction: x = V where |V| <= 1, otherwise x = 1 / V.
//      A per-lane "sign" records the case: 0 where |V| <= 1, +1 where V > 1,
//      -1 for the remaining lanes.
//   2. Evaluate x * (1 + c0*x^2 + c1*x^4 + ... + c7*x^16) by Horner's scheme,
//      with c0..c3 read from g_XMATanCoefficients0 and c4..c7 from
//      g_XMATanCoefficients1 (component order x, y, z, w).
//   3. Lanes with sign == 0 return the polynomial value; the others return
//      sign * g_XMHalfPi - polynomial value.
inline XMVECTOR XM_CALLCONV XMVectorATan(FXMVECTOR V) noexcept
{
// 17-degree minimax approximation
// (eight coefficients plus the constant 1, multiplied by x, give odd powers up to x^17)
#if defined(_XM_NO_INTRINSICS_)
// Scalar fallback: one atanf call per component.
XMVECTORF32 Result = { { {
atanf(V.vector4_f32[0]),
atanf(V.vector4_f32[1]),
atanf(V.vector4_f32[2]),
atanf(V.vector4_f32[3])
} } };
return Result.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
float32x4_t absV = vabsq_f32(V);
// Reciprocal is computed for every lane; only lanes with |V| > 1 end up using it.
float32x4_t invV = XMVectorReciprocal(V);
// sign = +1 where V > 1, otherwise -1 (lanes with |V| <= 1 are overwritten with 0 below).
uint32x4_t comp = vcgtq_f32(V, g_XMOne);
float32x4_t sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
// comp now marks the lanes that need no range reduction (|V| <= 1).
comp = vcleq_f32(absV, g_XMOne);
sign = vbslq_f32(comp, g_XMZero, sign);
// Polynomial argument: V itself where |V| <= 1, else 1 / V.
float32x4_t x = vbslq_f32(comp, V, invV);
float32x4_t x2 = vmulq_f32(x, x);
// Compute polynomial approximation
// Horner evaluation in x2, highest coefficient first. vmlaq_f32(a, b, c) yields a + b * c,
// so each step is Result = coefficient + Result * x2.
const XMVECTOR TC1 = g_XMATanCoefficients1;
// First step fuses the two highest terms: TC1.z + x2 * TC1.w.
XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0);
XMVECTOR Result = vmlaq_lane_f32(vConstants, x2, vget_high_f32(TC1), 1);
// TC1.y
vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1);
Result = vmlaq_f32(vConstants, Result, x2);
// TC1.x
vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0);
Result = vmlaq_f32(vConstants, Result, x2);
const XMVECTOR TC0 = g_XMATanCoefficients0;
// TC0.w
vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1);
Result = vmlaq_f32(vConstants, Result, x2);
// TC0.z
vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0);
Result = vmlaq_f32(vConstants, Result, x2);
// TC0.y
vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1);
Result = vmlaq_f32(vConstants, Result, x2);
// TC0.x
vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0);
Result = vmlaq_f32(vConstants, Result, x2);
// Constant term 1, then the final multiply by x turns the even polynomial into an odd one.
Result = vmlaq_f32(g_XMOne, Result, x2);
Result = vmulq_f32(Result, x);
// Undo the range reduction: sign * pi/2 - poly(1 / V) for lanes that used the reciprocal.
float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi);
result1 = vsubq_f32(result1, Result);
// sign == 0 identifies lanes with |V| <= 1, which keep the plain polynomial value.
comp = vceqq_f32(sign, g_XMZero);
Result = vbslq_f32(comp, Result, result1);
return Result;
#elif defined(_XM_SVML_INTRINSICS_)
// SVML supplies a vector arctangent directly.
XMVECTOR Result = _mm_atan_ps(V);
return Result;
#elif defined(_XM_SSE_INTRINSICS_)
// Selection in this path uses the and / andnot / or idiom:
// (mask & a) | (~mask & b) picks a where the mask is set and b elsewhere.
__m128 absV = XMVectorAbs(V);
// Reciprocal is computed for every lane; only lanes with |V| > 1 end up using it.
__m128 invV = _mm_div_ps(g_XMOne, V);
// sign = +1 where V > 1, otherwise -1 (lanes with |V| <= 1 are overwritten with 0 below).
__m128 comp = _mm_cmpgt_ps(V, g_XMOne);
__m128 select0 = _mm_and_ps(comp, g_XMOne);
__m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
__m128 sign = _mm_or_ps(select0, select1);
// comp now marks the lanes that need no range reduction (|V| <= 1).
comp = _mm_cmple_ps(absV, g_XMOne);
select0 = _mm_and_ps(comp, g_XMZero);
select1 = _mm_andnot_ps(comp, sign);
sign = _mm_or_ps(select0, select1);
// Polynomial argument: V itself where |V| <= 1, else 1 / V.
select0 = _mm_and_ps(comp, V);
select1 = _mm_andnot_ps(comp, invV);
__m128 x = _mm_or_ps(select0, select1);
__m128 x2 = _mm_mul_ps(x, x);
// Compute polynomial approximation
// Horner evaluation in x2, highest coefficient first.
// NOTE(review): XM_FMADD_PS and XM_PERMUTE_PS are macros defined outside this block;
// they are used here as XM_FMADD_PS(a, b, c) = a * b + c and as a broadcast of the
// component chosen by _MM_SHUFFLE(i, i, i, i) - confirm against their definitions.
const XMVECTOR TC1 = g_XMATanCoefficients1;
// First step combines the two highest terms: TC1.w * x2 + TC1.z.
__m128 vConstantsB = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(3, 3, 3, 3));
__m128 vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(2, 2, 2, 2));
__m128 Result = XM_FMADD_PS(vConstantsB, x2, vConstants);
// TC1.y
vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(1, 1, 1, 1));
Result = XM_FMADD_PS(Result, x2, vConstants);
// TC1.x
vConstants = XM_PERMUTE_PS(TC1, _MM_SHUFFLE(0, 0, 0, 0));
Result = XM_FMADD_PS(Result, x2, vConstants);
const XMVECTOR TC0 = g_XMATanCoefficients0;
// TC0.w
vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(3, 3, 3, 3));
Result = XM_FMADD_PS(Result, x2, vConstants);
// TC0.z
vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(2, 2, 2, 2));
Result = XM_FMADD_PS(Result, x2, vConstants);
// TC0.y
vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(1, 1, 1, 1));
Result = XM_FMADD_PS(Result, x2, vConstants);
// TC0.x
vConstants = XM_PERMUTE_PS(TC0, _MM_SHUFFLE(0, 0, 0, 0));
Result = XM_FMADD_PS(Result, x2, vConstants);
// Constant term 1, then the final multiply by x turns the even polynomial into an odd one.
Result = XM_FMADD_PS(Result, x2, g_XMOne);
Result = _mm_mul_ps(Result, x);
// Undo the range reduction: sign * pi/2 - poly(1 / V) for lanes that used the reciprocal.
__m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
result1 = _mm_sub_ps(result1, Result);
// sign == 0 identifies lanes with |V| <= 1, which keep the plain polynomial value.
comp = _mm_cmpeq_ps(sign, g_XMZero);
select0 = _mm_and_ps(comp, Result);
select1 = _mm_andnot_ps(comp, result1);
Result = _mm_or_ps(select0, select1);
return Result;
#endif
}