in src/mlio-py/mlio/contrib/insights/hll/xxh3.h [523:762]
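/* One round of the XXH3 long-input accumulator: consumes a single stripe of
 * STRIPE_LEN bytes. Per 64-bit lane, the accumulator gains
 * lo32(data ^ secret) * hi32(data ^ secret), plus the raw input lane in
 * 64-bit mode or the neighbouring (swapped) input lane in 128-bit mode.
 * The scalar branch at the end of this function is the reference
 * formulation; each SIMD branch is meant to compute the same lanes.
 * (Summary of the scalar path below; STRIPE_LEN and ACC_NB are defined
 * earlier in this header.) */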
XXH_FORCE_INLINE void XXH3_accumulate_512(void *XXH_RESTRICT acc,
const void *XXH_RESTRICT input,
const void *XXH_RESTRICT secret,
XXH3_accWidth_e accWidth)
{
#if (XXH_VECTOR == XXH_AVX2)
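    /* AVX2: four 64-bit lanes per 256-bit register. _mm256_mul_epu32 multiplies
     * only the low 32 bits of each 64-bit lane, so the 0x31 shuffle below copies
     * each lane's high half into its low half, giving lo32(dk) * hi32(dk). */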
XXH_ASSERT((((size_t) acc) & 31) == 0);
{
XXH_ALIGN(32) __m256i *const xacc = (__m256i *) acc;
const __m256i *const xinput =
(const __m256i *) input; /* not really aligned, just for ptr arithmetic, and because
_mm256_loadu_si256() requires this type */
const __m256i *const xsecret =
(const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because
_mm256_loadu_si256() requires this type */
size_t i;
for (i = 0; i < STRIPE_LEN / sizeof(__m256i); i++) {
__m256i const data_vec = _mm256_loadu_si256(xinput + i);
__m256i const key_vec = _mm256_loadu_si256(xsecret + i);
            __m256i const data_key =
                _mm256_xor_si256(data_vec, key_vec); /* uint32 dk[8] = {d0^k0, d1^k1, d2^k2,
                                                        d3^k3, ...} */
__m256i const product = _mm256_mul_epu32(
data_key,
_mm256_shuffle_epi32(data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */
if (accWidth == XXH3_acc_128bits) {
__m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
__m256i const sum = _mm256_add_epi64(xacc[i], data_swap);
xacc[i] = _mm256_add_epi64(product, sum);
}
else { /* XXH3_acc_64bits */
__m256i const sum = _mm256_add_epi64(xacc[i], data_vec);
xacc[i] = _mm256_add_epi64(product, sum);
}
}
}
#elif (XXH_VECTOR == XXH_SSE2)
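    /* SSE2: same scheme as the AVX2 branch, with two 64-bit lanes per 128-bit
     * register; _mm_mul_epu32 likewise uses only the low 32 bits of each lane. */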
XXH_ASSERT((((size_t) acc) & 15) == 0);
{
XXH_ALIGN(16) __m128i *const xacc = (__m128i *) acc;
const __m128i *const xinput =
(const __m128i *) input; /* not really aligned, just for ptr arithmetic, and because
_mm_loadu_si128() requires this type */
const __m128i *const xsecret =
(const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because
_mm_loadu_si128() requires this type */
size_t i;
for (i = 0; i < STRIPE_LEN / sizeof(__m128i); i++) {
__m128i const data_vec = _mm_loadu_si128(xinput + i);
__m128i const key_vec = _mm_loadu_si128(xsecret + i);
            __m128i const data_key =
                _mm_xor_si128(data_vec, key_vec); /* uint32 dk[4] = {d0^k0, d1^k1,
                                                     d2^k2, d3^k3} */
            __m128i const product = _mm_mul_epu32(
                data_key,
                _mm_shuffle_epi32(data_key, 0x31)); /* uint64 mul[2] = {dk0*dk1, dk2*dk3} */
if (accWidth == XXH3_acc_128bits) {
__m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
__m128i const sum = _mm_add_epi64(xacc[i], data_swap);
xacc[i] = _mm_add_epi64(product, sum);
}
else { /* XXH3_acc_64bits */
__m128i const sum = _mm_add_epi64(xacc[i], data_vec);
xacc[i] = _mm_add_epi64(product, sum);
}
}
}
#elif (XXH_VECTOR == XXH_NEON)
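    /* NEON: vmlal_u32 performs the widening 32x32->64 multiply-accumulate. The two
     * sub-branches below differ only in how the xor result is split into 32-bit
     * halves: an in-place vzip on 32-bit ARM versus vmovn/vshrn on aarch64. */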
XXH_ASSERT((((size_t) acc) & 15) == 0);
{
XXH_ALIGN(16) uint64x2_t *const xacc = (uint64x2_t *) acc;
/* We don't use a uint32x4_t pointer because it causes bus errors on
* ARMv7. */
const uint8_t *const xinput = (const uint8_t *) input;
const uint8_t *const xsecret = (const uint8_t *) secret;
size_t i;
for (i = 0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) {
#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */
/* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs)
* without this. vzip on 32-bit ARM NEON will overwrite the
* original register, and I think that Clang assumes I don't want
* to destroy it and tries to make a copy. This slows down the code
* a lot.
             * aarch64 not only uses an entirely different syntax, but it
             * requires three instructions...
             *     ext     v1.16B, v0.16B, v0.16B, #8  // select high bits because aarch64
             *                                         // can't address them directly
             *     zip1    v3.2s, v0.2s, v1.2s         // first zip
             *     zip2    v2.2s, v0.2s, v1.2s         // second zip
             * ...to do what ARM does in one:
             *     vzip.32 d0, d1                      // Interleave high and low bits and overwrite. */
            /* data_vec = xinput[i]; */
uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
/* key_vec = xsecret[i]; */
uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
/* data_key = data_vec ^ key_vec; */
uint32x4_t data_key;
if (accWidth == XXH3_acc_64bits) {
/* Add first to prevent register swaps */
/* xacc[i] += data_vec; */
xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u8(data_vec));
}
else { /* XXH3_acc_128bits */
/* xacc[i] += swap(data_vec); */
/* can probably be optimized better */
uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
uint64x2_t const swapped = vextq_u64(data64, data64, 1);
xacc[i] = vaddq_u64(xacc[i], swapped);
}
data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec));
            /* Here's the magic. We use the quirkiness of vzip to shuffle
             * data_key in place.
             * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */
            __asm__("vzip.32 %e0, %f0" : "+w"(data_key));
            /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */
xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key));
#else
/* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster
* than, the vzip method. */
            /* data_vec = xinput[i]; */
uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16));
/* key_vec = xsecret[i]; */
uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16));
/* data_key = data_vec ^ key_vec; */
uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
/* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */
uint32x2_t const data_key_lo = vmovn_u64(data_key);
/* data_key_hi = (uint32x2_t) (data_key >> 32); */
uint32x2_t const data_key_hi = vshrn_n_u64(data_key, 32);
if (accWidth == XXH3_acc_64bits) {
/* xacc[i] += data_vec; */
xacc[i] = vaddq_u64(xacc[i], vreinterpretq_u64_u8(data_vec));
}
else { /* XXH3_acc_128bits */
/* xacc[i] += swap(data_vec); */
uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
uint64x2_t const swapped = vextq_u64(data64, data64, 1);
xacc[i] = vaddq_u64(xacc[i], swapped);
}
            /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
            xacc[i] = vmlal_u32(xacc[i], data_key_lo, data_key_hi);
#endif
}
}
#elif (XXH_VECTOR == XXH_VSX)
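    /* VSX: vec_rl rotates each 64-bit lane by 32 so XXH_vec_mulo can pair the low and
     * high 32-bit halves of data_key; on big-endian targets the data load is byteswapped
     * (XXH_vec_revb) and the key's byteswap is folded into the xor via XXH_vec_permxor
     * using the vXorSwap mask. */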
U64x2 *const xacc = (U64x2 *) acc; /* presumed aligned */
const U64x2 *const xinput = (const U64x2 *) input; /* no alignment restriction */
const U64x2 *const xsecret = (const U64x2 *) secret; /* no alignment restriction */
U64x2 const v32 = {32, 32};
#if XXH_VSX_BE
    U8x16 const vXorSwap = {0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
                            0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8};
#endif
size_t i;
for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) {
/* data_vec = xinput[i]; */
/* key_vec = xsecret[i]; */
#if XXH_VSX_BE
/* byteswap */
U64x2 const data_vec = XXH_vec_revb(vec_vsx_ld(0, xinput + i));
U64x2 const key_raw = vec_vsx_ld(0, xsecret + i);
/* See comment above. data_key = data_vec ^ swap(xsecret[i]); */
U64x2 const data_key = (U64x2) XXH_vec_permxor((U8x16) data_vec, (U8x16) key_raw, vXorSwap);
#else
U64x2 const data_vec = vec_vsx_ld(0, xinput + i);
U64x2 const key_vec = vec_vsx_ld(0, xsecret + i);
U64x2 const data_key = data_vec ^ key_vec;
#endif
/* shuffled = (data_key << 32) | (data_key >> 32); */
U32x4 const shuffled = (U32x4) vec_rl(data_key, v32);
/* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled &
* 0xFFFFFFFF); */
U64x2 const product = XXH_vec_mulo((U32x4) data_key, shuffled);
xacc[i] += product;
if (accWidth == XXH3_acc_64bits) {
xacc[i] += data_vec;
}
else { /* XXH3_acc_128bits */
/* swap high and low halves */
U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2);
xacc[i] += data_swapped;
}
}
#else /* scalar variant of Accumulator - universal */
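    /* Scalar reference: for each of the ACC_NB lanes,
     *     acc[i]             += lo32(data ^ secret) * hi32(data ^ secret)
     *     acc[i] or acc[i^1] += data   (i in 64-bit mode, i^1 in 128-bit mode)
     * This is the behaviour the vector branches above are expected to match. */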
    XXH_ALIGN(XXH_ACC_ALIGN)
    xxh_u64 *const xacc = (xxh_u64 *) acc; /* presumed aligned on 32-byte boundaries, a little
                                              hint for the auto-vectorizer */
const xxh_u8 *const xinput = (const xxh_u8 *) input; /* no alignment restriction */
const xxh_u8 *const xsecret = (const xxh_u8 *) secret; /* no alignment restriction */
size_t i;
XXH_ASSERT(((size_t) acc & (XXH_ACC_ALIGN - 1)) == 0);
for (i = 0; i < ACC_NB; i++) {
xxh_u64 const data_val = XXH_readLE64(xinput + 8 * i);
xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i * 8);
if (accWidth == XXH3_acc_64bits) {
xacc[i] += data_val;
}
else {
xacc[i ^ 1] += data_val; /* swap adjacent lanes */
}
xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
}
#endif
}