in include/Algo-Direct2.h [57:100]
void resolve(const FVec<SSE, float>& vz, const IVec<SSE, float>& bidx, uint32 *pr) const
{
union U {
__m128i vec;
uint32 ui32[4];
} u;
const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets);
const float *xi = base_t::data.xi;
// read indices t
const double *p3 = reinterpret_cast<const double *>(&xi[(u.ui32[3] = buckets[bidx.get3()])]);
const double *p2 = reinterpret_cast<const double *>(&xi[(u.ui32[2] = buckets[bidx.get2()])]);
const double *p1 = reinterpret_cast<const double *>(&xi[(u.ui32[1] = buckets[bidx.get1()])]);
const double *p0 = reinterpret_cast<const double *>(&xi[(u.ui32[0] = buckets[bidx.get0()])]);
#if 0
// read pairs ( X(t-1), X(t) )
__m128 xp3 = _mm_castpd_ps(_mm_load_sd(p3));
__m128 xp2 = _mm_castpd_ps(_mm_load_sd(p2));
__m128 xp1 = _mm_castpd_ps(_mm_load_sd(p1));
__m128 xp0 = _mm_castpd_ps(_mm_load_sd(p0));
// build:
// { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) }
// { X(t(0)), X(t(1)), X(t(2)), X(t(3)) }
__m128 h13 = _mm_shuffle_ps(xp1, xp3, (1 << 2) + (1 << 6));
__m128 h02 = _mm_shuffle_ps(xp0, xp2, (1 << 2) + (1 << 6));
__m128 u01 = _mm_unpacklo_ps(h02, h13);
__m128 u23 = _mm_unpackhi_ps(h02, h13);
__m128 vxm = _mm_shuffle_ps(u01, u23, (0) + (1 << 2) + (0 << 4) + (1 << 6));
__m128 vxp = _mm_shuffle_ps(u01, u23, (2) + (3 << 2) + (2 << 4) + (3 << 6));
#else
__m128 xp23 = _mm_castpd_ps(_mm_set_pd(*p3, *p2));
__m128 xp01 = _mm_castpd_ps(_mm_set_pd(*p1, *p0));
__m128 vxm = _mm_shuffle_ps(xp01, xp23, (0) + (2 << 2) + (0 << 4) + (2 << 6));
__m128 vxp = _mm_shuffle_ps(xp01, xp23, (1) + (3 << 2) + (1 << 4) + (3 << 6));
#endif
IVec<SSE, float> i(u.vec);
IVec<SSE, float> vlem = vz < vxm;
IVec<SSE, float> vlep = vz < vxp;
i = i + vlem + vlep;
i.store(pr);
}