XXH_FORCE_INLINE void XXH3_accumulate_512()

in src/mlio-py/mlio/contrib/insights/hll/xxh3.h [523:762]
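
XXH3_accumulate_512() folds one 64-byte stripe of input into the 64-byte
accumulator: eight 64-bit lanes, each mixed with a matching 8-byte slice of
the secret. Every branch below (AVX2, SSE2, NEON, VSX, scalar) implements the
same per-lane update. As a minimal scalar sketch (treating input and secret
as byte pointers; read_le64() is a hypothetical stand-in for the file's
XXH_readLE64()):

    /* One stripe, 8 lanes of 64 bits each. */
    for (size_t lane = 0; lane < 8; lane++) {
        uint64_t const data = read_le64(input + 8 * lane);
        uint64_t const key  = data ^ read_le64(secret + 8 * lane);
        /* 64-bit mode adds data to its own lane; 128-bit mode adds it to
         * the neighbouring lane instead, cross-mixing adjacent lanes. */
        acc[accWidth == XXH3_acc_64bits ? lane : lane ^ 1] += data;
        acc[lane] += (key & 0xFFFFFFFF) * (key >> 32); /* 32x32->64 mul */
    }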


XXH_FORCE_INLINE void XXH3_accumulate_512(void *XXH_RESTRICT acc,
                                          const void *XXH_RESTRICT input,
                                          const void *XXH_RESTRICT secret,
                                          XXH3_accWidth_e accWidth)
{
#if (XXH_VECTOR == XXH_AVX2)
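
    /* AVX2: the 64-byte stripe is handled as two 256-bit vectors, each
     * carrying four 64-bit accumulator lanes; _mm256_mul_epu32() performs
     * the four 32x32->64 multiplies of a vector in one instruction. */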

    XXH_ASSERT((((size_t) acc) & 31) == 0);
    {
        XXH_ALIGN(32) __m256i *const xacc = (__m256i *) acc;
        const __m256i *const xinput =
            (const __m256i *) input; /* not really aligned, just for ptr arithmetic, and because
                                        _mm256_loadu_si256() requires this type */
        const __m256i *const xsecret =
            (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because
                                         _mm256_loadu_si256() requires this type */

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m256i); i++) {
            __m256i const data_vec = _mm256_loadu_si256(xinput + i);
            __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
            __m256i const data_key =
                _mm256_xor_si256(data_vec, key_vec); /* uint32 dk[8]  = {d0^k0, d1^k1, d2^k2,
                                                        d3^k3, ...} */
            __m256i const product = _mm256_mul_epu32(
                data_key,
                _mm256_shuffle_epi32(data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
            if (accWidth == XXH3_acc_128bits) {
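                /* 128-bit mode adds the data with its 64-bit halves swapped,
                 * so each lane also absorbs its neighbour's raw input
                 * (presumably to cross-mix the two halves of the XXH128
                 * result). */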
                __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
                __m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
                xacc[i] = _mm256_add_epi64(product, sum);
            }
            else { /* XXH3_acc_64bits */
                __m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
                xacc[i] = _mm256_add_epi64(product, sum);
            }
        }
    }

#elif (XXH_VECTOR == XXH_SSE2)
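
    /* SSE2: same scheme as AVX2 at half width; the stripe is four 128-bit
     * vectors of two 64-bit lanes each, so this loop runs four times per
     * stripe. */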

    XXH_ASSERT((((size_t) acc) & 15) == 0);
    {
        XXH_ALIGN(16) __m128i *const xacc = (__m128i *) acc;
        const __m128i *const xinput =
            (const __m128i *) input; /* not really aligned, just for ptr arithmetic, and because
                                        _mm_loadu_si128() requires this type */
        const __m128i *const xsecret =
            (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because
                                         _mm_loadu_si128() requires this type */

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m128i); i++) {
            __m128i const data_vec = _mm_loadu_si128(xinput + i);
            __m128i const key_vec = _mm_loadu_si128(xsecret + i);
            __m128i const data_key =
                _mm_xor_si128(data_vec, key_vec); /* uint32 dk[4]  = {d0^k0, d1^k1, d2^k2,
                                                     d3^k3} */
            __m128i const product = _mm_mul_epu32(
                data_key,
                _mm_shuffle_epi32(data_key, 0x31)); /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
            if (accWidth == XXH3_acc_128bits) {
                __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
                __m128i const sum = _mm_add_epi64(xacc[i], data_swap);
                xacc[i] = _mm_add_epi64(product, sum);
            }
            else { /* XXH3_acc_64bits */
                __m128i const sum = _mm_add_epi64(xacc[i], data_vec);
                xacc[i] = _mm_add_epi64(product, sum);
            }
        }
    }

#elif (XXH_VECTOR == XXH_NEON)
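
    /* NEON: four 128-bit vectors per stripe. The XOR result is split into
     * 32-bit low/high halves so vmlal_u32() can do the 32x32->64
     * multiply-accumulate; how the split is done differs between ARMv7 and
     * aarch64, hence the two paths below. */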

    XXH_ASSERT((((size_t) acc) & 15) == 0);
    {
        XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *) acc;
        /* We don't use a uint32x4_t pointer because it causes bus errors on
         * ARMv7. */
        const uint8_t *const xinput = (const uint8_t *) input;
        const uint8_t *const xsecret = (const uint8_t *) secret;

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
            /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs)
             * without this. vzip on 32-bit ARM NEON will overwrite the
             * original register, and I think that Clang assumes I don't want
             * to destroy it and tries to make a copy. This slows down the
             * code a lot.
             * aarch64 not only uses an entirely different syntax, but it
             * requires three instructions:
             *    ext  v1.16B, v0.16B, v0.16B, #8 // select high bits, since
             *                                    // aarch64 can't address
             *                                    // them directly
             *    zip1 v3.2s, v0.2s, v1.2s        // first zip
             *    zip2 v2.2s, v0.2s, v1.2s        // second zip
             * ...to do what ARM does in one:
             *    vzip.32 d0, d1                  // interleave high and low
             *                                    // bits and overwrite */

            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
            /* key_vec  = xsecret[i];  */
            uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t data_key;

            if (accWidth == XXH3_acc_64bits) {
                /* Add first to prevent register swaps */
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u8(data_vec));
            }
            else { /* XXH3_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                /* can probably be optimized better */
                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped = vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64(xacc[i], swapped);
            }

            data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));

            /* Here's the magic: we use the quirkiness of vzip to shuffle
             * data_key in place.
             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
            __asm__("vzip.32 %e0, %f0" : "+w"(data_key));
            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t)
             * data_key[2, 3]; */
            xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));

#else
            /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster
             * than, the vzip method. */

            /* data_vec = xinput[i]; */
            uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
            /* key_vec  = xsecret[i];  */
            uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
            /* data_key = data_vec ^ key_vec; */
            uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
            /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
            uint32x2_t const data_key_lo = vmovn_u64(data_key);
            /* data_key_hi = (uint32x2_t) (data_key >> 32); */
            uint32x2_t const data_key_hi = vshrn_n_u64(data_key, 32);
            if (accWidth == XXH3_acc_64bits) {
                /* xacc[i] += data_vec; */
                xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u8(data_vec));
            }
            else { /* XXH3_acc_128bits */
                /* xacc[i] += swap(data_vec); */
                uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
                uint64x2_t const swapped = vextq_u64(data64, data64, 1);
                xacc[i] = vaddq_u64(xacc[i], swapped);
            }
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32(xacc[i], data_key_lo, data_key_hi);

#endif
        }
    }

#elif (XXH_VECTOR == XXH_VSX)
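    /* VSX: four 128-bit vectors per stripe. vec_rl() rotates each 64-bit
     * lane by 32 bits so XXH_vec_mulo() can pair the low and high 32-bit
     * halves for the 32x32->64 multiply; big-endian targets byteswap the
     * loads first to match the little-endian lane layout. */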
    U64x2 *const xacc = (U64x2 *) acc;                   /* presumed aligned */
    const U64x2 *const xinput = (const U64x2 *) input;   /* no alignment restriction */
    const U64x2 *const xsecret = (const U64x2 *) secret; /* no alignment restriction */
    U64x2 const v32 = {32, 32};
#if XXH_VSX_BE
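    /* Control vector for XXH_vec_permxor(): each result byte is
     * a[ctrl >> 4] ^ b[ctrl & 0xF], so this pattern XORs the data with the
     * key byteswapped within each 64-bit lane, fusing byteswap and XOR into
     * a single vpermxor. */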
    U8x16 const vXorSwap = {0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                            0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8};
#endif
    size_t i;
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        /* data_vec = xinput[i]; */
        /* key_vec = xsecret[i]; */
#if XXH_VSX_BE
        /* byteswap */
        U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
        U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
        /* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
        U64x2 const data_key = (U64x2) XXH_vec_permxor((U8x16) data_vec, (U8x16) key_raw, vXorSwap);
#else
        U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
        U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif
        /* shuffled = (data_key << 32) | (data_key >> 32); */
        U32x4 const shuffled = (U32x4) vec_rl(data_key, v32);
        /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled &
         * 0xFFFFFFFF); */
        U64x2 const product = XXH_vec_mulo((U32x4) data_key, shuffled);
        xacc[i] += product;

        if (accWidth == XXH3_acc_64bits) {
            xacc[i] += data_vec;
        }
        else { /* XXH3_acc_128bits */
            /* swap high and low halves */
            U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
            xacc[i] += data_swapped;
        }
    }

#else /* scalar variant of Accumulator - universal */
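    /* Portable reference path: each of the ACC_NB (8) 64-bit lanes gets
     * exactly the per-lane update that the SIMD branches above apply in
     * parallel. */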

    XXH_ALIGN(XXH_ACC_ALIGN)
    xxh_u64 *const xacc = (xxh_u64 *) acc; /* presumed aligned to a 32-byte boundary; a small
                                              hint for the auto-vectorizer */
    const xxh_u8 *const xinput = (const xxh_u8 *) input;   /* no alignment restriction */
    const xxh_u8 *const xsecret = (const xxh_u8 *) secret; /* no alignment restriction */
    size_t i;
    XXH_ASSERT(((size_t) acc & (XXH_ACC_ALIGN - 1)) == 0);
    for (i = 0; i < ACC_NB; i++) {
        xxh_u64 const data_val = XXH_readLE64(xinput + 8 * i);
        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i * 8);

        if (accWidth == XXH3_acc_64bits) {
            xacc[i] += data_val;
        }
        else {
            xacc[i ^ 1] += data_val; /* swap adjacent lanes */
        }
        xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
    }
#endif
}
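
For context, callers feed this function one stripe at a time while sliding
the secret window a few bytes per stripe, so consecutive stripes mix against
overlapping key material. A minimal sketch of such a driver loop (modelled on
the file's own XXH3_accumulate(); the 8-byte advance matches upstream's
XXH_SECRET_CONSUME_RATE):

    static void accumulate_stripes(xxh_u64 *XXH_RESTRICT acc,
                                   const xxh_u8 *XXH_RESTRICT input,
                                   const xxh_u8 *XXH_RESTRICT secret,
                                   size_t nbStripes, XXH3_accWidth_e accWidth)
    {
        size_t n;
        for (n = 0; n < nbStripes; n++) {
            /* each stripe consumes STRIPE_LEN (64) input bytes but advances
             * the secret by only 8, so key material overlaps across stripes */
            XXH3_accumulate_512(acc, input + n * STRIPE_LEN,
                                secret + n * 8, accWidth);
        }
    }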