XXH_FORCE_INLINE void XXH3_scrambleAcc()

in src/mlio-py/mlio/contrib/insights/hll/xxh3.h [764:925]


XXH_FORCE_INLINE void XXH3_scrambleAcc(void *XXH_RESTRICT acc, const void *XXH_RESTRICT secret)
{
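    /*
     * Every branch below applies the same transform to each 64-bit
     * accumulator lane:
     *     acc = (acc ^ (acc >> 47) ^ secret) * PRIME32_1
     * keeping only the low 64 bits of the product; the branches differ
     * only in how they vectorize it.
     */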
#if (XXH_VECTOR == XXH_AVX2)

    XXH_ASSERT((((size_t) acc) & 31) == 0);
    {
        XXH_ALIGN(32) __m256i *const xacc = (__m256i *) acc;
        const __m256i *const xsecret =
            (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because
                                         _mm256_loadu_si256() requires this argument type */
        const __m256i prime32 = _mm256_set1_epi32((int) PRIME32_1);

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m256i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m256i const acc_vec = xacc[i];
            __m256i const shifted = _mm256_srli_epi64(acc_vec, 47);
            __m256i const data_vec = _mm256_xor_si256(acc_vec, shifted);
            /* xacc[i] ^= xsecret[i]; */
            __m256i const key_vec = _mm256_loadu_si256(xsecret + i);
            __m256i const data_key = _mm256_xor_si256(data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
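            /*
             * _mm256_mul_epu32 multiplies only the low 32 bits of each 64-bit
             * lane, so the product is assembled as
             *     lo(data_key) * PRIME32_1 + ((hi(data_key) * PRIME32_1) << 32).
             * The 0x31 shuffle moves each lane's high 32 bits into the low
             * position for the second multiply.
             */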
            __m256i const data_key_hi = _mm256_shuffle_epi32(data_key, 0x31);
            __m256i const prod_lo = _mm256_mul_epu32(data_key, prime32);
            __m256i const prod_hi = _mm256_mul_epu32(data_key_hi, prime32);
            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXH_VECTOR == XXH_SSE2)

    XXH_ASSERT((((size_t) acc) & 15) == 0);
    {
        XXH_ALIGN(16) __m128i *const xacc = (__m128i *) acc;
        const __m128i *const xsecret =
            (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because
                                         _mm_loadu_si128() requires this argument type */
        const __m128i prime32 = _mm_set1_epi32((int) PRIME32_1);

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(__m128i); i++) {
            /* xacc[i] ^= (xacc[i] >> 47) */
            __m128i const acc_vec = xacc[i];
            __m128i const shifted = _mm_srli_epi64(acc_vec, 47);
            __m128i const data_vec = _mm_xor_si128(acc_vec, shifted);
            /* xacc[i] ^= xsecret[i]; */
            __m128i const key_vec = _mm_loadu_si128(xsecret + i);
            __m128i const data_key = _mm_xor_si128(data_vec, key_vec);

            /* xacc[i] *= PRIME32_1; */
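            /*
             * Same 32x32->64 decomposition as the AVX2 branch, using the
             * 128-bit _mm_mul_epu32 / _mm_shuffle_epi32 equivalents.
             */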
            __m128i const data_key_hi = _mm_shuffle_epi32(data_key, 0x31);
            __m128i const prod_lo = _mm_mul_epu32(data_key, prime32);
            __m128i const prod_hi = _mm_mul_epu32(data_key_hi, prime32);
            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
        }
    }

#elif (XXH_VECTOR == XXH_NEON)

    XXH_ASSERT((((size_t) acc) & 15) == 0);

    {
        uint64x2_t *const xacc = (uint64x2_t *) acc;
        const uint8_t *const xsecret = (const uint8_t *) secret;
        uint32x2_t const prime = vdup_n_u32(PRIME32_1);

        size_t i;
        for (i = 0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
            /* data_vec = xacc[i] ^ (xacc[i] >> 47); */
            uint64x2_t const acc_vec = xacc[i];
            uint64x2_t const shifted = vshrq_n_u64(acc_vec, 47);
            uint64x2_t const data_vec = veorq_u64(acc_vec, shifted);

            /* key_vec  = xsecret[i]; */
            uint32x4_t const key_vec = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16)));
            /* data_key = data_vec ^ key_vec; */
            uint32x4_t const data_key = veorq_u32(vreinterpretq_u32_u64(data_vec), key_vec);
            /* shuffled = { data_key[0, 2], data_key[1, 3] }; */
            uint32x2x2_t const shuffled = vzip_u32(vget_low_u32(data_key), vget_high_u32(data_key));

            /* data_key *= PRIME32_1 */
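            /*
             * vzip above separated each 64-bit lane into its low half
             * (shuffled.val[0]) and high half (shuffled.val[1]); a widening
             * multiply and a widening multiply-accumulate then rebuild
             *     lo * PRIME32_1 + ((hi * PRIME32_1) << 32).
             */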

            /* prod_hi = (data_key >> 32) * PRIME32_1; */
            uint64x2_t const prod_hi = vmull_u32(shuffled.val[1], prime);
            /* xacc[i] = prod_hi << 32; */
            xacc[i] = vshlq_n_u64(prod_hi, 32);
            /* xacc[i] += (data_key & 0xFFFFFFFF) * PRIME32_1; */
            xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime);
        }
    }

#elif (XXH_VECTOR == XXH_VSX)

    U64x2 *const xacc = (U64x2 *) acc;
    const U64x2 *const xsecret = (const U64x2 *) secret;
    /* constants */
    U64x2 const v32 = {32, 32};
    U64x2 const v47 = {47, 47};
    U32x4 const prime = {PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1};
    size_t i;
#if XXH_VSX_BE
    /* endian swap */
    U8x16 const vXorSwap = {0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                            0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8};
#endif
    for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
        U64x2 const acc_vec = xacc[i];
        U64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
        /* key_vec = xsecret[i]; */
#if XXH_VSX_BE
        /* byte-swap the key words and xor, in a single permute-xor */
        U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = (U64x2) XXH_vec_permxor((U8x16) data_vec, (U8x16) key_raw, vXorSwap);
#else
        U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
        U64x2 const data_key = data_vec ^ key_vec;
#endif

        /* data_key *= PRIME32_1 */
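        /*
         * XXH_vec_mule/XXH_vec_mulo give the widening products of the even-
         * and odd-numbered 32-bit elements; added together below (one of them
         * shifted left by 32) they reconstruct the full 64x32 product per
         * lane. Which half lands in prod_even vs. prod_odd depends on the
         * platform's element ordering.
         */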

        /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */
        U64x2 const prod_even = XXH_vec_mule((U32x4) data_key, prime);
        /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32);  */
        U64x2 const prod_odd = XXH_vec_mulo((U32x4) data_key, prime);
        xacc[i] = prod_odd + (prod_even << v32);
    }

#else /* scalar variant of Scrambler - universal */

    XXH_ALIGN(XXH_ACC_ALIGN)
    xxh_u64 *const xacc = (xxh_u64 *) acc; /* presumed aligned on 32-byte boundaries; a small
                                              hint for the auto-vectorizer */
    const xxh_u8 *const xsecret = (const xxh_u8 *) secret; /* no alignment restriction */
    size_t i;
    XXH_ASSERT((((size_t) acc) & (XXH_ACC_ALIGN - 1)) == 0);
    for (i = 0; i < ACC_NB; i++) {
        xxh_u64 const key64 = XXH_readLE64(xsecret + 8 * i);
        xxh_u64 acc64 = xacc[i];
        acc64 ^= acc64 >> 47;
        acc64 ^= key64;
        acc64 *= PRIME32_1;
        xacc[i] = acc64;
    }

#endif
}
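
Every branch above applies the same per-lane transform; they differ only in how they emulate the 64x32-bit multiply. The sketch below is illustrative only and not part of xxh3.h (the helper name is invented for this note): it restates the scalar variant and the split-multiply identity the vector branches rely on.

#include <stdint.h>

#define PRIME32_1 0x9E3779B1U /* same value xxh3.h defines for PRIME32_1 */

/* Hypothetical helper, for illustration only: the transform XXH3_scrambleAcc
 * applies to each 64-bit accumulator lane. */
static uint64_t scramble_one_lane(uint64_t acc64, uint64_t key64)
{
    acc64 ^= acc64 >> 47; /* fold the high bits back down            */
    acc64 ^= key64;       /* mix in 8 bytes of secret material       */
    acc64 *= PRIME32_1;   /* 64x32 multiply, keeping the low 64 bits */
    return acc64;
}

/* The identity the AVX2/SSE2/NEON/VSX branches use to build that multiply
 * from 32x32->64 hardware multiplies (all arithmetic mod 2^64):
 *     x * PRIME32_1 == (uint32_t) x * (uint64_t) PRIME32_1
 *                      + (((x >> 32) * PRIME32_1) << 32)
 */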