in Inc/DirectXMathVector.inl [9826:9940]
// Scales V by the reciprocal of the length of its x, y and z components.
// All four lanes (including w) are scaled by the same factor.
//
// Special cases handled by the SIMD paths:
//   - zero length            -> every lane of the result is forced to zero
//   - infinite squared length -> every lane of the result is QNaN
// The scalar path only guards the reciprocal: when the length is not greater
// than zero the components are multiplied by the length value itself, and an
// infinite length is not special-cased there.
inline XMVECTOR XM_CALLCONV XMVector3Normalize(FXMVECTOR V) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vNormalized = XMVector3Length(V);
    const float fLength = vNormalized.vector4_f32[0];

    // Only take the reciprocal of a strictly positive length (avoids a
    // divide by zero); otherwise the length itself is used as the scale.
    const float fScale = (fLength > 0) ? (1.0f / fLength) : fLength;

    for (int i = 0; i < 4; ++i)
    {
        vNormalized.vector4_f32[i] = V.vector4_f32[i] * fScale;
    }
    return vNormalized;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Squared length of xyz, replicated into both lanes of a 64-bit vector
    const float32x4_t vSquares = vmulq_f32(V, V);
    const float32x2_t vLowSquares = vget_low_f32(vSquares);
    const float32x2_t vHighSquares = vget_high_f32(vSquares);
    const float32x2_t vXYSum = vpadd_f32(vLowSquares, vLowSquares);     // x*x + y*y
    const float32x2_t vZSquared = vdup_lane_f32(vHighSquares, 0);       // z*z
    const float32x2_t vLengthSq = vadd_f32(vXYSum, vZSquared);

    // Remember the degenerate inputs before estimating the reciprocal sqrt
    const uint32x2_t vIsZero = vceq_f32(vLengthSq, vdup_n_f32(0));
    const uint32x2_t vIsInfinite = vceq_f32(vLengthSq, vget_low_f32(g_XMInfinity));

    // Reciprocal square root: initial estimate plus two Newton-Raphson steps
    const float32x2_t vEstimate0 = vrsqrte_f32(vLengthSq);
    const float32x2_t vStep0 = vrsqrts_f32(vmul_f32(vLengthSq, vEstimate0), vEstimate0);
    const float32x2_t vEstimate1 = vmul_f32(vEstimate0, vStep0);
    const float32x2_t vStep1 = vrsqrts_f32(vmul_f32(vLengthSq, vEstimate1), vEstimate1);
    const float32x2_t vRcpLength = vmul_f32(vEstimate1, vStep1);

    // Scale all four lanes, then patch up the degenerate cases
    XMVECTOR vNormalized = vmulq_f32(V, vcombine_f32(vRcpLength, vRcpLength));
    vNormalized = vbslq_f32(vcombine_u32(vIsZero, vIsZero), vdupq_n_f32(0), vNormalized);
    return vbslq_f32(vcombine_u32(vIsInfinite, vIsInfinite), g_XMQNaN, vNormalized);
#elif defined(_XM_SSE4_INTRINSICS_) || defined(_XM_SSE3_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
    // Step 1: squared length of xyz replicated into every lane. Only this
    // step differs between the SSE flavours; the most capable one wins.
#if defined(_XM_SSE4_INTRINSICS_)
    // Dot product of x, y, z (mask 0x7f) broadcast to all four lanes
    const XMVECTOR vLengthSq = _mm_dp_ps(V, V, 0x7f);
#elif defined(_XM_SSE3_INTRINSICS_)
    // Square, drop w, then two horizontal adds leave the sum in every lane
    XMVECTOR vPartial = _mm_and_ps(_mm_mul_ps(V, V), g_XMMask3);
    vPartial = _mm_hadd_ps(vPartial, vPartial);
    const XMVECTOR vLengthSq = _mm_hadd_ps(vPartial, vPartial);
#else
    // Square, then accumulate y*y and z*z into lane 0 and splat it
    const XMVECTOR vSquares = _mm_mul_ps(V, V);
    XMVECTOR vShuffled = XM_PERMUTE_PS(vSquares, _MM_SHUFFLE(2, 1, 2, 1));  // (y*y, z*z, y*y, z*z)
    XMVECTOR vAccum = _mm_add_ss(vSquares, vShuffled);                      // lane 0: x*x + y*y
    vShuffled = XM_PERMUTE_PS(vShuffled, _MM_SHUFFLE(1, 1, 1, 1));          // z*z in every lane
    vAccum = _mm_add_ss(vAccum, vShuffled);                                 // lane 0: x*x + y*y + z*z
    const XMVECTOR vLengthSq = XM_PERMUTE_PS(vAccum, _MM_SHUFFLE(0, 0, 0, 0));
#endif

    // Step 2: the length itself, which becomes the divisor
    const XMVECTOR vLength = _mm_sqrt_ps(vLengthSq);

    // All bits set where the length compares not-equal to zero. A floating
    // point compare is required so that -0.0 is treated as zero.
    const XMVECTOR vNonZeroMask = _mm_cmpneq_ps(_mm_setzero_ps(), vLength);

    // All bits set where the squared length compares not-equal to +infinity
    const XMVECTOR vNotInfiniteMask = _mm_cmpneq_ps(vLengthSq, g_XMInfinity);

    // Step 3: normalize, clearing every lane when the length was zero so the
    // result of the division by zero never escapes
    const XMVECTOR vQuotient = _mm_and_ps(_mm_div_ps(V, vLength), vNonZeroMask);

    // Step 4: bitwise select - QNaN where the squared length was infinite,
    // otherwise the (possibly zeroed) quotient
    const XMVECTOR vNaNPart = _mm_andnot_ps(vNotInfiniteMask, g_XMQNaN);
    const XMVECTOR vValuePart = _mm_and_ps(vQuotient, vNotInfiniteMask);
    return _mm_or_ps(vNaNPart, vValuePart);
#endif
}