void resolve()

in include/Algo-Direct2.h [143:235]


        void resolve(const FVec<AVX, float>& vz, const IVec<AVX, float>& bidx, uint32 *pr) const
    {
        const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets);
        const float *xi = base_t::data.xi;

#if 0   // use gather instructions

        IVec<AVX,float> idxm;
        idxm.setidx(buckets, bidx);
        __m256i z = _mm256_setzero_si256();
        IVec<AVX,float> minusone = _mm256_cmpeq_epi32(z,z);
        IVec<AVX,float> idxp = idxm - minusone;

        FVec<AVX, float> vxm = _mm256_i32gather_ps(xi, idxm, sizeof(float));
        FVec<AVX, float> vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float));
        IVec<AVX, float> ip = idxm;

#else // do not use gather instrucions

        union U {
            __m256i vec;
            uint32 ui32[8];
        } u;

        // read indices t

        const double *p7 = reinterpret_cast<const double *>(&xi[(u.ui32[7] = buckets[bidx.get7()])]);
        const double *p6 = reinterpret_cast<const double *>(&xi[(u.ui32[6] = buckets[bidx.get6()])]);
        const double *p5 = reinterpret_cast<const double *>(&xi[(u.ui32[5] = buckets[bidx.get5()])]);
        const double *p4 = reinterpret_cast<const double *>(&xi[(u.ui32[4] = buckets[bidx.get4()])]);
        const double *p3 = reinterpret_cast<const double *>(&xi[(u.ui32[3] = buckets[bidx.get3()])]);
        const double *p2 = reinterpret_cast<const double *>(&xi[(u.ui32[2] = buckets[bidx.get2()])]);
        const double *p1 = reinterpret_cast<const double *>(&xi[(u.ui32[1] = buckets[bidx.get1()])]);
        const double *p0 = reinterpret_cast<const double *>(&xi[(u.ui32[0] = buckets[bidx.get0()])]);

#if 0 // perform 8 loads in double precision

        // read pairs ( X(t-1), X(t) )
        __m128 xp7 = _mm_castpd_ps(_mm_load_sd(p7));
        __m128 xp6 = _mm_castpd_ps(_mm_load_sd(p6));
        __m128 xp5 = _mm_castpd_ps(_mm_load_sd(p5));
        __m128 xp4 = _mm_castpd_ps(_mm_load_sd(p4));
        __m128 xp3 = _mm_castpd_ps(_mm_load_sd(p3));
        __m128 xp2 = _mm_castpd_ps(_mm_load_sd(p2));
        __m128 xp1 = _mm_castpd_ps(_mm_load_sd(p1));
        __m128 xp0 = _mm_castpd_ps(_mm_load_sd(p0));

        // build:
        // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) }
        // { X(t(0)),   X(t(1)),   X(t(2)),   X(t(3)) }
        __m128 h57 = _mm_shuffle_ps(xp5, xp7, (1 << 2) + (1 << 6));  // F- F+ H- H+
        __m128 h46 = _mm_shuffle_ps(xp4, xp6, (1 << 2) + (1 << 6));  // E- E+ G- G+
        __m128 h13 = _mm_shuffle_ps(xp1, xp3, (1 << 2) + (1 << 6));  // B- B+ D- D+
        __m128 h02 = _mm_shuffle_ps(xp0, xp2, (1 << 2) + (1 << 6));  // A- A+ C- C+

        __m128 u01 = _mm_unpacklo_ps(h02, h13);  // A- B- A+ B+
        __m128 u23 = _mm_unpackhi_ps(h02, h13);  // C- D- C+ D+
        __m128 u45 = _mm_unpacklo_ps(h46, h57);  // E- F- E+ F+
        __m128 u67 = _mm_unpackhi_ps(h46, h57);  // G- H- G+ H+

        __m128 abcdm = _mm_shuffle_ps(u01, u23, (0) + (1 << 2) + (0 << 4) + (1 << 6));  // A- B- C- D-
        __m128 abcdp = _mm_shuffle_ps(u01, u23, (2) + (3 << 2) + (2 << 4) + (3 << 6));  // A+ B+ C+ D+
        __m128 efghm = _mm_shuffle_ps(u45, u67, (0) + (1 << 2) + (0 << 4) + (1 << 6));  // E- F- G- H-
        __m128 efghp = _mm_shuffle_ps(u45, u67, (2) + (3 << 2) + (2 << 4) + (3 << 6));  // E+ F+ G+ H+

        FVec<AVX, float> vxp = _mm256_insertf128_ps(_mm256_castps128_ps256(abcdm), efghm, 1);
        FVec<AVX, float> vxm = _mm256_insertf128_ps(_mm256_castps128_ps256(abcdp), efghp, 1);

        IVec<AVX, float> ip(u.vec);

#else   // use __mm256_set_pd

        // read pairs ( X(t-1), X(t) )
        __m256 x0145 = _mm256_castpd_ps(_mm256_set_pd(*p5, *p4, *p1, *p0)); // { x0(t-1), x0(t), x1(t-1), x1(t), x4(t-1), x4(t), x5(t-1), x5(t) }
        __m256 x2367 = _mm256_castpd_ps(_mm256_set_pd(*p7, *p6, *p3, *p2)); // { x2(t-1), x2(t), x3(t-1), x3(t), x6(t-1), x6(t), x7(t-1), x7(t) }

        // { x0(t-1), x1(t-1), x2(t-1), 3(t-1, x4(t-1), x5(t-1), x6(t-1), xt(t-1) }
        FVec<AVX, float> vxm = _mm256_shuffle_ps(x0145, x2367, 0 + (2 << 2) + (0 << 4) + (2 << 6) );
        // { x0(t), x1(t), x2(t), 3(t, x4(t), x5(t), x6(t), xt(t) }
        FVec<AVX, float> vxp = _mm256_shuffle_ps(x0145, x2367, 1 + (3 << 2) + (1 << 4) + (3 << 6) );

        IVec<AVX, float> ip(u.vec);

#endif

#endif

        IVec<AVX, float> vlem = vz < vxm;
        IVec<AVX, float> vlep = vz < vxp;
        ip = ip + vlem + vlep;

        ip.store(pr);
    }