void UnpackAvx512::vectorUnpack30()

in c++/src/BpackingAvx512.cc [2257:2378]


  void UnpackAvx512::vectorUnpack30(int64_t* data, uint64_t offset, uint64_t len) {
    uint32_t bitWidth = 30;
    const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
    uint64_t numElements = 0;
    int64_t* dstPtr = data + offset;
    uint64_t bufMoveByteLen = 0;
    uint64_t bufRestByteLen = decoder->bufLength();
    bool resetBuf = false;
    uint64_t startBit = 0;
    uint64_t tailBitLen = 0;
    uint32_t backupByteLen = 0;

    while (len > 0) {
      alignHeaderBoundary<true>(bitWidth, UNPACK_32Bit_MAX_SIZE, startBit, bufMoveByteLen,
                                bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
                                resetBuf, srcPtr, dstPtr);

      if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
        __mmask16 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_DWORD(bitWidth * 16));
        __m512i parseMask0 = _mm512_set1_epi32(ORC_VECTOR_BIT_MASK(bitWidth));
        __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
        __m512i reverseMask32u = _mm512_loadu_si512(reverseMaskTable32u);
        __m512i maskmm = _mm512_set1_epi8(0x0F);

        __m512i shuffleIdxPtr[2];
        shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable30u_0);
        shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable30u_1);

        __m512i permutexIdxPtr[2];
        permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable30u_0);
        permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable30u_1);

        __m512i shiftMaskPtr[4];
        shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable30u_0);
        shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable30u_1);
        shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable30u_2);
        shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable30u_3);

        __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable30u);

        while (numElements >= 2 * VECTOR_UNPACK_32BIT_MAX_NUM) {
          __m512i srcmm, zmm[2];

          srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1u);

          // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
          zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
          zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);

          // shifting elements so they start from the start of the word
          zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[2]);
          zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[3]);

          // gathering even and odd elements together
          zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);

          _mm512_storeu_si512(vectorBuf, zmm[0]);

          srcPtr += 2 * bitWidth;
          decoder->resetBufferStart(2 * bitWidth, false, 0);
          bufRestByteLen = decoder->bufLength();
          bufMoveByteLen -= 2 * bitWidth;
          numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
          std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
          dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
        }
        if (numElements >= VECTOR_UNPACK_32BIT_MAX_NUM) {
          __m512i srcmm, zmm[2];

          srcmm = _mm512_maskz_loadu_epi32(readMask, srcPtr);

          __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
          __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);

          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u);

          srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);

          // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
          zmm[0] = _mm512_permutexvar_epi32(permutexIdxPtr[0], srcmm);
          zmm[1] = _mm512_permutexvar_epi32(permutexIdxPtr[1], srcmm);

          // shifting elements so they start from the start of the word
          zmm[0] = _mm512_srlv_epi64(zmm[0], shiftMaskPtr[0]);
          zmm[1] = _mm512_sllv_epi64(zmm[1], shiftMaskPtr[1]);

          // gathering even and odd elements together
          zmm[0] = _mm512_mask_mov_epi32(zmm[0], 0xAAAA, zmm[1]);
          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);

          zmm[0] = _mm512_slli_epi32(zmm[0], 2u);
          lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
          highNibblemm = _mm512_srli_epi16(zmm[0], 4u);
          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);

          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4u);

          zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
          zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask32u);

          _mm512_storeu_si512(vectorBuf, zmm[0]);

          srcPtr += 2 * bitWidth;
          decoder->resetBufferStart(2 * bitWidth, false, 0);
          bufRestByteLen = decoder->bufLength();
          bufMoveByteLen -= 2 * bitWidth;
          numElements -= VECTOR_UNPACK_32BIT_MAX_NUM;
          std::copy(vectorBuf, vectorBuf + VECTOR_UNPACK_32BIT_MAX_NUM, dstPtr);
          dstPtr += VECTOR_UNPACK_32BIT_MAX_NUM;
        }
      }

      alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
                                backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
    }
  }