in common/checksum/crc.cpp [673:713]
inline __attribute__((always_inline))
__m128i crc64ecma_hw_big_avx512(const uint8_t*& data, size_t& nbytes, uint64_t crc) {
assert(nbytes >= 256);
__attribute__((aligned(16)))
v512 crc0 = {(long)~crc};
auto& ptr = (const v512*&)data;
auto zmm0 = _mm512_loadu_si512(ptr++); zmm0 ^= crc0;
auto zmm4 = _mm512_loadu_si512(ptr++);
nbytes -= 128;
if (nbytes < 384) {
auto rk3 = _mm512_broadcast_i32x4(*(__m128i*)_RK(3));
do { // fold 128 bytes each iteration
zmm0 = OP(zmm0, rk3, ptr++);
zmm4 = OP(zmm4, rk3, ptr++);
nbytes -= 128;
} while (nbytes >= 128);
} else { // nbytes >= 384
auto rk_1_2 = _mm512_broadcast_i32x4(*(__m128i*)&rk512[0]);
auto zmm7 = _mm512_loadu_si512(ptr++);
auto zmm8 = _mm512_loadu_si512(ptr++);
nbytes -= 128;
do { // fold 256 bytes each iteration
zmm0 = OP(zmm0, rk_1_2, ptr++);
zmm4 = OP(zmm4, rk_1_2, ptr++);
zmm7 = OP(zmm7, rk_1_2, ptr++);
zmm8 = OP(zmm8, rk_1_2, ptr++);
nbytes -= 256;
} while (nbytes >= 256);
auto rk3 = _mm512_broadcast_i32x4(*(__m128i*)_RK(3));
zmm0 = OP(zmm0, rk3, zmm7);
zmm4 = OP(zmm4, rk3, zmm8);
}
auto t = _mm512_extracti64x2_epi64(zmm4, 0x03);
auto zmm7 = v512{t[0], t[1]};
auto zmm1 = OP(zmm0, *(v512*)_RK(9), zmm7);
zmm1 = OP(zmm4, *(v512*)_RK(17), zmm1);
auto zmm8 = _mm512_shuffle_i64x2(zmm1, zmm1, 0x4e);
auto ymm8 = ((__m256i&)zmm8) ^ ((__m256i&)zmm1);
return _mm256_extracti64x2_epi64(ymm8, 0) ^
_mm256_extracti64x2_epi64(ymm8, 1) ;
}