void UnpackAvx512::vectorUnpack15()

in c++/src/BpackingAvx512.cc [1130:1253]


  void UnpackAvx512::vectorUnpack15(int64_t* data, uint64_t offset, uint64_t len) {
    uint32_t bitWidth = 15;
    const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(decoder->getBufStart());
    uint64_t numElements = 0;
    int64_t* dstPtr = data + offset;
    uint64_t bufMoveByteLen = 0;
    uint64_t bufRestByteLen = decoder->bufLength();
    bool resetBuf = false;
    uint64_t startBit = 0;
    uint64_t tailBitLen = 0;
    uint32_t backupByteLen = 0;

    while (len > 0) {
      alignHeaderBoundary<true>(bitWidth, UNPACK_16Bit_MAX_SIZE, startBit, bufMoveByteLen,
                                bufRestByteLen, len, tailBitLen, backupByteLen, numElements,
                                resetBuf, srcPtr, dstPtr);

      if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
        uint16_t* simdPtr = reinterpret_cast<uint16_t*>(vectorBuf);
        __mmask32 readMask = ORC_VECTOR_BIT_MASK(ORC_VECTOR_BITS_2_WORD(bitWidth * 32));
        __m512i parseMask0 = _mm512_set1_epi16(ORC_VECTOR_BIT_MASK(bitWidth));
        __m512i nibbleReversemm = _mm512_loadu_si512(nibbleReverseTable);
        __m512i reverseMask16u = _mm512_loadu_si512(reverseMaskTable16u);
        __m512i maskmm = _mm512_set1_epi8(0x0F);

        __m512i shuffleIdxPtr[2];
        shuffleIdxPtr[0] = _mm512_loadu_si512(shuffleIdxTable15u_0);
        shuffleIdxPtr[1] = _mm512_loadu_si512(shuffleIdxTable15u_1);

        __m512i permutexIdxPtr[2];
        permutexIdxPtr[0] = _mm512_loadu_si512(permutexIdxTable15u_0);
        permutexIdxPtr[1] = _mm512_loadu_si512(permutexIdxTable15u_1);

        __m512i shiftMaskPtr[4];
        shiftMaskPtr[0] = _mm512_loadu_si512(shiftTable15u_0);
        shiftMaskPtr[1] = _mm512_loadu_si512(shiftTable15u_1);
        shiftMaskPtr[2] = _mm512_loadu_si512(shiftTable15u_2);
        shiftMaskPtr[3] = _mm512_loadu_si512(shiftTable15u_3);

        __m512i gatherIdxmm = _mm512_loadu_si512(gatherIdxTable15u);

        while (numElements >= 2 * VECTOR_UNPACK_16BIT_MAX_NUM) {
          __m512i srcmm, zmm[2];

          srcmm = _mm512_i64gather_epi64(gatherIdxmm, srcPtr, 1);

          // shuffling so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
          zmm[0] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[0]);
          zmm[1] = _mm512_shuffle_epi8(srcmm, shuffleIdxPtr[1]);

          // shifting elements so they start from the start of the word
          zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[2]);
          zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[3]);

          // gathering even and odd elements together
          zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);

          _mm512_storeu_si512(simdPtr, zmm[0]);

          srcPtr += 4 * bitWidth;
          decoder->resetBufferStart(4 * bitWidth, false, 0);
          bufRestByteLen = decoder->bufLength();
          bufMoveByteLen -= 4 * bitWidth;
          numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
          std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
          dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
        }
        if (numElements >= VECTOR_UNPACK_16BIT_MAX_NUM) {
          __m512i srcmm, zmm[2];

          srcmm = _mm512_maskz_loadu_epi16(readMask, srcPtr);

          __m512i lowNibblemm = _mm512_and_si512(srcmm, maskmm);
          __m512i highNibblemm = _mm512_srli_epi16(srcmm, 4);
          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);

          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);

          srcmm = _mm512_or_si512(lowNibblemm, highNibblemm);

          // permuting so in zmm[0] will be elements with even indexes and in zmm[1] - with odd ones
          zmm[0] = _mm512_permutexvar_epi16(permutexIdxPtr[0], srcmm);
          zmm[1] = _mm512_permutexvar_epi16(permutexIdxPtr[1], srcmm);

          // shifting elements so they start from the start of the word
          zmm[0] = _mm512_srlv_epi32(zmm[0], shiftMaskPtr[0]);
          zmm[1] = _mm512_sllv_epi32(zmm[1], shiftMaskPtr[1]);

          // gathering even and odd elements together
          zmm[0] = _mm512_mask_mov_epi16(zmm[0], 0xAAAAAAAA, zmm[1]);
          zmm[0] = _mm512_and_si512(zmm[0], parseMask0);

          zmm[0] = _mm512_slli_epi16(zmm[0], 1);

          lowNibblemm = _mm512_and_si512(zmm[0], maskmm);
          highNibblemm = _mm512_srli_epi16(zmm[0], 4);
          highNibblemm = _mm512_and_si512(highNibblemm, maskmm);

          lowNibblemm = _mm512_shuffle_epi8(nibbleReversemm, lowNibblemm);
          highNibblemm = _mm512_shuffle_epi8(nibbleReversemm, highNibblemm);
          lowNibblemm = _mm512_slli_epi16(lowNibblemm, 4);

          zmm[0] = _mm512_or_si512(lowNibblemm, highNibblemm);
          zmm[0] = _mm512_shuffle_epi8(zmm[0], reverseMask16u);

          _mm512_storeu_si512(simdPtr, zmm[0]);

          srcPtr += 4 * bitWidth;
          decoder->resetBufferStart(4 * bitWidth, false, 0);
          bufRestByteLen = decoder->bufLength();
          bufMoveByteLen -= 4 * bitWidth;
          numElements -= VECTOR_UNPACK_16BIT_MAX_NUM;
          std::copy(simdPtr, simdPtr + VECTOR_UNPACK_16BIT_MAX_NUM, dstPtr);
          dstPtr += VECTOR_UNPACK_16BIT_MAX_NUM;
        }
      }

      alignTailerBoundary<true>(bitWidth, 0, startBit, bufMoveByteLen, bufRestByteLen, len,
                                backupByteLen, numElements, resetBuf, srcPtr, dstPtr);
    }
  }