in c++/src/BpackingAvx512.cc [1809:1922]
// Unpacks values stored with a bit width of 22 from the decoder's buffer into
// `data`, starting at index `offset`, using AVX-512 kernels for the bulk of the work.
//
// Each pass of the outer loop calls alignHeaderBoundary<true>(), runs the vector
// kernels while at least VECTOR_UNPACK_32BIT_MAX_NUM values are pending in
// numElements, and then calls alignTailerBoundary<true>(). Both helpers are defined
// outside this block and are handed the cursor/bookkeeping locals declared below.
// NOTE(review): presumably the helpers decode the values that do not fit a whole
// vector (leading/trailing remainder) and refill numElements — confirm against
// their definitions.
//
// data:   destination array of unpacked values.
// offset: index into `data` at which the first unpacked value is written.
// len:    drives the outer loop, which runs until it reaches 0. Within this block it
//         is only passed to the two align helpers, so they must be what updates it.
void UnpackAvx512::vectorUnpack22(int64_t* data, uint64_t offset, uint64_t len) {
uint32_t bitWidth = 22;
// Read cursor into the decoder's current buffer and write cursor into the output.
const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
uint64_t numElements = 0;
int64_t* dstPtr = data + offset;
// Bookkeeping shared with the align helpers; the vector kernels below only touch
// srcPtr, dstPtr, numElements, bufMoveByteLen and bufRestByteLen.
uint64_t bufMoveByteLen = 0;
uint64_t bufRestByteLen = decoder->bufLength();
bool resetBuf = false;
uint64_t startBit = 0;
uint64_t tailBitLen = 0;
uint32_t backupByteLen = 0;
while (len > 0) {
alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
resetBuf, srcPtr, dstPtr);
// Vector path: taken only when at least one full vector of values is pending.
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
// Load mask for the single-vector kernel, derived from bitWidth * 16 = 352 bits.
// NOTE(review): presumably this selects the 11 dwords holding 16 packed values —
// confirm against the ORC_VECTOR_BITS_2_DWORD / ORC_VECTOR_BIT_MASK macros.
__mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
// Per-32-bit-lane mask applied after shifting (presumably the low 22 bits).
__m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
// Lookup tables and constants for the kernels; the tables are defined outside this block.
__m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
__m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
// Isolates the low nibble of every byte.
__m512i maskmm = _mm512_set1_epi8(0x0F);
__m512i shuffleIdxPtr = _mm512_loadu_si512(shuffleIdxTable22u_0);
__m512i permutexIdxPtr[2];
permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable22u_0);
permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable22u_1);
// [0] and [1] feed the 64-bit shifts of the single-vector kernel; [2] feeds the
// 32-bit shift of the gather kernel.
__m512i shiftMaskPtr[3];
shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable22u_0);
shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable22u_1);
shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable22u_2);
__m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable22u);
// Gather kernel: runs while at least two vectors' worth of values are pending.
// NOTE(review): presumably the 2x threshold guarantees the 64-bit gathers stay
// inside the available input — confirm against gatherIdxTable22u.
while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) {
// Only zmm[0] is used in this loop.
__m512i srcmm, zmm[2];
// Fetch eight 64-bit words from srcPtr at the byte offsets in gatherIdxmm (scale 1).
srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);
// Rearrange the gathered bytes according to shuffleIdxPtr.
zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
// Clear the bits above the value in every 32-bit lane.
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
// Stage the sixteen 32-bit results in vectorBuf; they are copied out below.
_mm512_storeu_si512(vectorBuf, zmm[0]);
// 2 * bitWidth = 44 bytes = 16 * 22 bits: the input consumed by one vector.
srcPtr += 2 * bitWidth;
// Keep the decoder's buffer position and the local byte counters in step with srcPtr.
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
// Copy the staged values into the int64_t output and advance the write cursor.
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
// Single-vector kernel: handles one remaining full vector using a masked load
// instead of a gather.
if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
__m512i srcmm, zmm[2];
// Load only the dwords selected by readMask; the other lanes are zeroed.
srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);
// First nibble pass: split every byte into its low and high nibble, look each up
// in nibbleReversemm, and recombine with the two nibble positions swapped.
// NOTE(review): presumably this reverses the bit order within each byte —
// depends on the contents of nibbleReverseTable.
__m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
__m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);
// permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);
// shifting elements so they start from the start of the word
zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);
// gathering even and odd elements together
// (mask 0xAAAA takes the odd 32-bit lanes from zmm[1] and keeps the even ones from zmm[0])
zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
zmm[0] = _mm512_and_si512(zmm[0], parseMask0);
// 10 = 32 - 22: shift each masked value up within its 32-bit lane before the
// second nibble pass and the byte shuffle below.
zmm[0] = _mm512_slli_epi32(zmm[0], 10);
// Second nibble pass, same sequence as above, applied to the extracted values.
lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
highNibblemm = _mm512_srli_epi16(zmm[0], 4);
highNibblemm = _mm512_and_si512(highNibblemm, maskmm);
lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);
zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
// Byte shuffle driven by reverseMask32u.
// NOTE(review): presumably reverses the byte order inside each 32-bit lane — confirm
// against reverseMaskTable32u.
zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);
// Stage the results, then do the same cursor bookkeeping as the gather kernel.
_mm512_storeu_si512(vectorBuf, zmm[0]);
srcPtr += 2 * bitWidth;
decoder->resetBufferStart(2 * bitWidth, false, 0);
bufRestByteLen = decoder->bufLength();
bufMoveByteLen -= 2 * bitWidth;
numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
}
}
// Hand whatever the vector kernels left in numElements to the tail helper.
alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
}
}