// Inc/DirectXMathMisc.inl [711:940]
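//------------------------------------------------------------------------------
// Converts the rotation described by the upper 3x3 of M into a unit quaternion.
// M is assumed to be a pure rotation (orthonormal, with no scale or shear in
// the upper 3x3); the result is defined only up to sign, since q and -q
// represent the same rotation.
//------------------------------------------------------------------------------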
inline XMVECTOR XM_CALLCONV XMQuaternionRotationMatrix(FXMMATRIX M) noexcept
{
#if defined(_XM_NO_INTRINSICS_)
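// Scalar path: recover q = (x, y, z, w) directly from the matrix entries.
// The derivation assumes M was built from a unit quaternion in the row-vector
// convention used by XMMatrixRotationQuaternion:
//   m00 = 1-2(y^2+z^2)   m01 = 2(xy+zw)       m02 = 2(xz-yw)
//   m10 = 2(xy-zw)       m11 = 1-2(x^2+z^2)   m12 = 2(yz+xw)
//   m20 = 2(xz+yw)       m21 = 2(yz-xw)       m22 = 1-2(x^2+y^2)
// The diagonal tests pick the quaternion component with the largest magnitude;
// 4*c^2 for that component c comes from the diagonal, and the other three
// components come from sums/differences of mirrored off-diagonal pairs divided
// by 4*c (e.g. m01 + m10 = 4xy, m12 - m21 = 4xw). Because c is the largest
// component of a unit quaternion, the divisor never approaches zero.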
XMVECTORF32 q;
float r22 = M.m[2][2];
if (r22 <= 0.f) // x^2 + y^2 >= z^2 + w^2
{
float dif10 = M.m[1][1] - M.m[0][0];
float omr22 = 1.f - r22;
if (dif10 <= 0.f) // x^2 >= y^2
{
float fourXSqr = omr22 - dif10;
float inv4x = 0.5f / sqrtf(fourXSqr);
q.f[0] = fourXSqr * inv4x;
q.f[1] = (M.m[0][1] + M.m[1][0]) * inv4x;
q.f[2] = (M.m[0][2] + M.m[2][0]) * inv4x;
q.f[3] = (M.m[1][2] - M.m[2][1]) * inv4x;
}
else // y^2 >= x^2
{
float fourYSqr = omr22 + dif10;
float inv4y = 0.5f / sqrtf(fourYSqr);
q.f[0] = (M.m[0][1] + M.m[1][0]) * inv4y;
q.f[1] = fourYSqr * inv4y;
q.f[2] = (M.m[1][2] + M.m[2][1]) * inv4y;
q.f[3] = (M.m[2][0] - M.m[0][2]) * inv4y;
}
}
else // z^2 + w^2 >= x^2 + y^2
{
float sum10 = M.m[1][1] + M.m[0][0];
float opr22 = 1.f + r22;
if (sum10 <= 0.f) // z^2 >= w^2
{
float fourZSqr = opr22 - sum10;
float inv4z = 0.5f / sqrtf(fourZSqr);
q.f[0] = (M.m[0][2] + M.m[2][0]) * inv4z;
q.f[1] = (M.m[1][2] + M.m[2][1]) * inv4z;
q.f[2] = fourZSqr * inv4z;
q.f[3] = (M.m[0][1] - M.m[1][0]) * inv4z;
}
else // w^2 >= z^2
{
float fourWSqr = opr22 + sum10;
float inv4w = 0.5f / sqrtf(fourWSqr);
q.f[0] = (M.m[1][2] - M.m[2][1]) * inv4w;
q.f[1] = (M.m[2][0] - M.m[0][2]) * inv4w;
q.f[2] = (M.m[0][1] - M.m[1][0]) * inv4w;
q.f[3] = fourWSqr * inv4w;
}
}
return q.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
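// Branch-free SIMD path: build all four rows of the scaled outer product
// 4*q*q^T (tensor0..tensor3 below), use comparison masks on the diagonal terms
// to select the row belonging to the largest-magnitude component, and
// normalize that row. Row i of 4*q*q^T equals 4*q[i]*q, so normalizing it
// recovers q up to sign. The sign constants below fold the matrix diagonal
// into (4x^2, 4y^2, 4z^2, 4w^2) and fix the signs of the off-diagonal products.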
static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } };
static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } };
static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } };
static const XMVECTORU32 Select0110 = { { { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 } } };
static const XMVECTORU32 Select0010 = { { { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 } } };
float32x4_t r0 = M.r[0];
float32x4_t r1 = M.r[1];
float32x4_t r2 = M.r[2];
float32x4_t r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
float32x4_t r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
float32x4_t r22 = vdupq_lane_f32(vget_high_f32(r2), 0);
// x^2 >= y^2 equivalent to r11 - r00 <= 0
float32x4_t r11mr00 = vsubq_f32(r11, r00);
uint32x4_t x2gey2 = vcleq_f32(r11mr00, g_XMZero);
// z^2 >= w^2 equivalent to r11 + r00 <= 0
float32x4_t r11pr00 = vaddq_f32(r11, r00);
uint32x4_t z2gew2 = vcleq_f32(r11pr00, g_XMZero);
// x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
uint32x4_t x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);
// (4*x^2, 4*y^2, 4*z^2, 4*w^2)
float32x4_t t0 = vmulq_f32(XMPMMP, r00);
float32x4_t x2y2z2w2 = vmlaq_f32(t0, XMMPMP, r11);
x2y2z2w2 = vmlaq_f32(x2y2z2w2, XMMMPP, r22);
x2y2z2w2 = vaddq_f32(x2y2z2w2, g_XMOne);
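// Lane check for x: 1 + r00 - r11 - r22
//   = 1 + (1-2y^2-2z^2) - (1-2x^2-2z^2) - (1-2x^2-2y^2) = 4x^2.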
// (r01, r02, r12, r11)
t0 = vextq_f32(r0, r0, 1);
float32x4_t t1 = vextq_f32(r1, r1, 1);
t0 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_low_f32(t1)));
// (r10, r20, r21, r10)
t1 = vextq_f32(r2, r2, 3);
float32x4_t r10 = vdupq_lane_f32(vget_low_f32(r1), 0);
t1 = vbslq_f32(Select0110, t1, r10);
// (4*x*y, 4*x*z, 4*y*z, unused)
float32x4_t xyxzyz = vaddq_f32(t0, t1);
// (r21, r20, r10, r10)
t0 = vcombine_f32(vrev64_f32(vget_low_f32(r2)), vget_low_f32(r10));
// (r12, r02, r01, r12)
float32x4_t t2 = vcombine_f32(vrev64_f32(vget_high_f32(r0)), vrev64_f32(vget_low_f32(r0)));
float32x4_t t3 = vdupq_lane_f32(vget_high_f32(r1), 0);
t1 = vbslq_f32(Select0110, t2, t3);
// (4*x*w, 4*y*w, 4*z*w, unused)
float32x4_t xwywzw = vsubq_f32(t0, t1);
xwywzw = vmulq_f32(XMMPMP, xwywzw);
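// Lane check for x*w: -(r21 - r12) = r12 - r21 = 2(yz+xw) - 2(yz-xw) = 4xw.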
// (4*x*x, 4*x*y, 4*x*z, 4*x*w)
t0 = vextq_f32(xyxzyz, xyxzyz, 3);
t1 = vbslq_f32(Select0110, t0, x2y2z2w2);
t2 = vdupq_lane_f32(vget_low_f32(xwywzw), 0);
float32x4_t tensor0 = vbslq_f32(g_XMSelect1110, t1, t2);
// (4*y*x, 4*y*y, 4*y*z, 4*y*w)
t0 = vbslq_f32(g_XMSelect1011, xyxzyz, x2y2z2w2);
t1 = vdupq_lane_f32(vget_low_f32(xwywzw), 1);
float32x4_t tensor1 = vbslq_f32(g_XMSelect1110, t0, t1);
// (4*z*x, 4*z*y, 4*z*z, 4*z*w)
t0 = vextq_f32(xyxzyz, xyxzyz, 1);
t1 = vcombine_f32(vget_low_f32(t0), vrev64_f32(vget_high_f32(xwywzw)));
float32x4_t tensor2 = vbslq_f32(Select0010, x2y2z2w2, t1);
// (4*w*x, 4*w*y, 4*w*z, 4*w*w)
float32x4_t tensor3 = vbslq_f32(g_XMSelect1110, xwywzw, x2y2z2w2);
// Select the row of the tensor-product matrix that has the largest
// magnitude.
t0 = vbslq_f32(x2gey2, tensor0, tensor1);
t1 = vbslq_f32(z2gew2, tensor2, tensor3);
t2 = vbslq_f32(x2py2gez2pw2, t0, t1);
// Normalize the row. No division by zero is possible because the
// quaternion is unit-length (and the row is a nonzero multiple of
// the quaternion).
t0 = XMVector4Length(t2);
return XMVectorDivide(t2, t0);
#elif defined(_XM_SSE_INTRINSICS_)
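// The SSE path mirrors the NEON path above: build all four rows of 4*q*q^T,
// select the row for the largest-magnitude component with and/andnot/or
// masking, and normalize the selected row.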
static const XMVECTORF32 XMPMMP = { { { +1.0f, -1.0f, -1.0f, +1.0f } } };
static const XMVECTORF32 XMMPMP = { { { -1.0f, +1.0f, -1.0f, +1.0f } } };
static const XMVECTORF32 XMMMPP = { { { -1.0f, -1.0f, +1.0f, +1.0f } } };
XMVECTOR r0 = M.r[0]; // (r00, r01, r02, 0)
XMVECTOR r1 = M.r[1]; // (r10, r11, r12, 0)
XMVECTOR r2 = M.r[2]; // (r20, r21, r22, 0)
// (r00, r00, r00, r00)
XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0, 0, 0, 0));
// (r11, r11, r11, r11)
XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1, 1, 1, 1));
// (r22, r22, r22, r22)
XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2, 2, 2, 2));
// x^2 >= y^2 equivalent to r11 - r00 <= 0
// (r11 - r00, r11 - r00, r11 - r00, r11 - r00)
XMVECTOR r11mr00 = _mm_sub_ps(r11, r00);
XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero);
// z^2 >= w^2 equivalent to r11 + r00 <= 0
// (r11 + r00, r11 + r00, r11 + r00, r11 + r00)
XMVECTOR r11pr00 = _mm_add_ps(r11, r00);
XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero);
// x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero);
// (4*x^2, 4*y^2, 4*z^2, 4*w^2)
XMVECTOR t0 = XM_FMADD_PS(XMPMMP, r00, g_XMOne);
XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11);
XMVECTOR t2 = XM_FMADD_PS(XMMMPP, r22, t0);
XMVECTOR x2y2z2w2 = _mm_add_ps(t1, t2);
// (r01, r02, r12, r11)
t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1, 2, 2, 1));
// (r10, r10, r20, r21)
t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 0, 0));
// (r10, r20, r21, r10)
t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0));
// (4*x*y, 4*x*z, 4*y*z, unused)
XMVECTOR xyxzyz = _mm_add_ps(t0, t1);
// (r21, r20, r10, r10)
t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0, 0, 0, 1));
// (r12, r12, r02, r01)
t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1, 2, 2, 2));
// (r12, r02, r01, r12)
t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1, 3, 2, 0));
// (4*x*w, 4*y*w, 4*z*w, unused)
XMVECTOR xwywzw = _mm_sub_ps(t0, t1);
xwywzw = _mm_mul_ps(XMMPMP, xwywzw);
// (4*x^2, 4*y^2, 4*x*y, unused)
t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0, 0, 1, 0));
// (4*z^2, 4*w^2, 4*z*w, unused)
t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0, 2, 3, 2));
// (4*x*z, 4*y*z, 4*x*w, 4*y*w)
t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1, 0, 2, 1));
// (4*x*x, 4*x*y, 4*x*z, 4*x*w)
XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2, 0, 2, 0));
// (4*y*x, 4*y*y, 4*y*z, 4*y*w)
XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 1, 1, 2));
// (4*z*x, 4*z*y, 4*z*z, 4*z*w)
XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2, 0, 1, 0));
// (4*w*x, 4*w*y, 4*w*z, 4*w*w)
XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1, 2, 3, 2));
// Select the row of the tensor-product matrix that has the largest
// magnitude.
t0 = _mm_and_ps(x2gey2, tensor0);
t1 = _mm_andnot_ps(x2gey2, tensor1);
t0 = _mm_or_ps(t0, t1);
t1 = _mm_and_ps(z2gew2, tensor2);
t2 = _mm_andnot_ps(z2gew2, tensor3);
t1 = _mm_or_ps(t1, t2);
t0 = _mm_and_ps(x2py2gez2pw2, t0);
t1 = _mm_andnot_ps(x2py2gez2pw2, t1);
t2 = _mm_or_ps(t0, t1);
// Normalize the row. No division by zero is possible because the
// quaternion is unit-length (and the row is a nonzero multiple of
// the quaternion).
t0 = XMVector4Length(t2);
return _mm_div_ps(t2, t0);
#endif
}
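
// Illustrative usage sketch (not part of the library): a quaternion survives a
// round trip through a rotation matrix up to sign, since q and -q describe the
// same rotation. pitch/yaw/roll are placeholder angles in radians.
//
//     XMVECTOR q0 = XMQuaternionRotationRollPitchYaw(pitch, yaw, roll);
//     XMMATRIX R  = XMMatrixRotationQuaternion(q0);
//     XMVECTOR q1 = XMQuaternionRotationMatrix(R);
//     // q1 matches q0 or -q0 to within floating-point rounding.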