// aios/storage/indexlib/index/common/numeric_compress/DecompressSse4.cpp
/*
* Copyright 2014-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "indexlib/index/common/numeric_compress/DecompressSse4.h"
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include "indexlib/index/common/numeric_compress/Unpack.h"
namespace indexlib::index {
const __m128i SSE4_zero_m128i __attribute__((unused)) = _mm_set_epi32(0x0, 0x0, 0x0, 0x0);
const __m128i SSE4_set_m128i __attribute__((unused)) = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff);
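// Common decoding pattern used by the decompress_sse4_cN kernels below:
//   1. _mm_shuffle_epi8 gathers the byte(s) holding each packed value into its
//      own 32-bit lane (0xFF shuffle indexes zero the unused bytes);
//   2. _mm_and_si128 masks out the neighbouring bit fields;
//   3. _mm_mullo_epi32 multiplies each lane by a power of two so that all four
//      fields land at the same bit position (SSE4 has no per-lane variable
//      shift, so the multiply stands in for it);
//   4. _mm_srli_epi32 shifts the aligned fields down to bit 0.
// Each decompress_sse4_cN driver decodes whole blocks this way and hands the
// remaining (n mod block size) values to the corresponding unpack_* helper.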
// c0
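// Bit width 0: every decoded value is zero, so each block of 32 outputs is
// cleared directly without going through the shuffle pipeline.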
void decompress_sse4_c0(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
for (uint32_t i = 32; i <= n; i += 32, encode += 1, dest += 32) {
dest[0] = 0;
dest[1] = 0;
dest[2] = 0;
dest[3] = 0;
dest[4] = 0;
dest[5] = 0;
dest[6] = 0;
dest[7] = 0;
dest[8] = 0;
dest[9] = 0;
dest[10] = 0;
dest[11] = 0;
dest[12] = 0;
dest[13] = 0;
dest[14] = 0;
dest[15] = 0;
dest[16] = 0;
dest[17] = 0;
dest[18] = 0;
dest[19] = 0;
dest[20] = 0;
dest[21] = 0;
dest[22] = 0;
dest[23] = 0;
dest[24] = 0;
dest[25] = 0;
dest[26] = 0;
dest[27] = 0;
dest[28] = 0;
dest[29] = 0;
dest[30] = 0;
dest[31] = 0;
}
if (n & 0x1F) {
unaligned_unpack_0(dest, encode, (n & 0x1F));
}
}
// c1
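// Bit width 1: each source byte carries 8 values. decompress_sse4_c1_m2
// broadcasts one byte to all four lanes and extracts bits 0-3, then bits 4-7,
// so decompress_sse4_c1_m1 decodes 128 values from a single 16-byte block.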
const __m128i SSE4_c1_and_msk1_m128i =
_mm_set_epi8(0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01);
const __m128i SSE4_c1_mul_msk1_m128i = _mm_set_epi32(0x01, 0x02, 0x04, 0x08);
const __m128i SSE4_c1_and_msk2_m128i =
_mm_set_epi8(0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x10);
const __m128i SSE4_c1_mul_msk2_m128i = _mm_set_epi32(0x01, 0x02, 0x04, 0x08);
template <int segment>
inline void decompress_sse4_c1_m2(__m128i c1_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c1_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, 0xFF, segment, 0xFF, 0xFF, 0xFF, segment, 0xFF, 0xFF,
0xFF, segment, 0xFF, 0xFF, 0xFF, segment);
__m128i c1_shfl_rslt_m128i = _mm_shuffle_epi8(c1_load_vec_m128i, SSE4_c1_shfl_msk_m128i);
__m128i c1_and_rslt_m128i = _mm_and_si128(c1_shfl_rslt_m128i, SSE4_c1_and_msk1_m128i);
__m128i c1_mul_rslt_m128i = _mm_mullo_epi32(c1_and_rslt_m128i, SSE4_c1_mul_msk1_m128i);
__m128i c1_rslt_m128i = _mm_srli_epi32(c1_mul_rslt_m128i, 3);
_mm_storeu_si128((__m128i*)presult, c1_rslt_m128i);
presult = presult + 4;
SSE4_c1_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, 0xFF, segment, 0xFF, 0xFF, 0xFF, segment, 0xFF, 0xFF, 0xFF,
segment, 0xFF, 0xFF, 0xFF, segment);
c1_shfl_rslt_m128i = _mm_shuffle_epi8(c1_load_vec_m128i, SSE4_c1_shfl_msk_m128i);
c1_and_rslt_m128i = _mm_and_si128(c1_shfl_rslt_m128i, SSE4_c1_and_msk2_m128i);
c1_mul_rslt_m128i = _mm_mullo_epi32(c1_and_rslt_m128i, SSE4_c1_mul_msk2_m128i);
c1_rslt_m128i = _mm_srli_epi32(c1_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c1_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c1_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c1_load_rslt_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c1_m2<0>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<1>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<2>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<3>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<4>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<5>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<6>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<7>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<8>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<9>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<10>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<11>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<12>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<13>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<14>(c1_load_rslt_m128i, presult);
decompress_sse4_c1_m2<15>(c1_load_rslt_m128i, presult);
}
void decompress_sse4_c1(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c1_m1(vector_m128i, presult + index, offset);
offset += 1;
index += 128;
}
if (n & 0x7F) {
unpack_1(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c2
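// Bit width 2: each byte carries 4 values, so one 16-byte block yields 64.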
const __m128i SSE4_c2_and_msk_m128i =
_mm_set_epi8(0x00, 0x00, 0x00, 0xC0, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x03);
const __m128i SSE4_c2_mul_msk_m128i = _mm_set_epi32(0x01, 0x04, 0x10, 0x40);
template <int segment>
inline void decompress_sse4_c2_m2(__m128i c2_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c2_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, 0xFF, segment, 0xFF, 0xFF, 0xFF, segment, 0xFF, 0xFF,
0xFF, segment, 0xFF, 0xFF, 0xFF, segment);
__m128i c2_shfl_rslt_m128i = _mm_shuffle_epi8(c2_load_vec_m128i, SSE4_c2_shfl_msk_m128i);
__m128i c2_and_rslt_m128i = _mm_and_si128(c2_shfl_rslt_m128i, SSE4_c2_and_msk_m128i);
__m128i c2_mul_rslt_m128i = _mm_mullo_epi32(c2_and_rslt_m128i, SSE4_c2_mul_msk_m128i);
__m128i c2_rslt_m128i = _mm_srli_epi32(c2_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c2_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c2_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c2_load_rslt_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c2_m2<0>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<1>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<2>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<3>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<4>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<5>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<6>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<7>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<8>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<9>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<10>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<11>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<12>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<13>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<14>(c2_load_rslt_m128i, presult);
decompress_sse4_c2_m2<15>(c2_load_rslt_m128i, presult);
}
void decompress_sse4_c2(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 64 <= n) {
decompress_sse4_c2_m1(vector_m128i, presult + index, offset);
offset += 1;
index += 64;
}
if (n & 0x3F) {
unpack_2(dest + index, encode + offset * 4, (n & 0x3F));
}
}
// c3
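// Bit width 3: fields straddle 16-byte boundaries, so decompress_sse4_c3_m1
// splices adjacent loads with _mm_alignr_epi8 and decodes 128 values from
// three consecutive 16-byte blocks (48 bytes).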
const __m128i SSE4_c3_and_msk1_m128i =
_mm_set_epi8(0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x01, 0xC0, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x07);
const __m128i SSE4_c3_and_msk2_m128i =
_mm_set_epi8(0x00, 0x00, 0x00, 0xE0, 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x03, 0x80, 0x00, 0x00, 0x00, 0x70);
const __m128i SSE4_c3_mul_msk1_m128i = _mm_set_epi32(0x20, 0x01, 0x08, 0x40);
const __m128i SSE4_c3_mul_msk2_m128i = _mm_set_epi32(0x04, 0x20, 0x01, 0x08);
template <int segment>
inline void decompress_sse4_c3_m2(__m128i c3_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c3_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, 0xFF, segment + 1, 0xFF, 0xFF, segment + 1, segment, 0xFF,
0xFF, 0xFF, segment, 0xFF, 0xFF, 0xFF, segment);
__m128i c3_shfl_rslt_m128i = _mm_shuffle_epi8(c3_load_vec_m128i, SSE4_c3_shfl_msk_m128i);
__m128i c3_and_rslt_m128i = _mm_and_si128(c3_shfl_rslt_m128i, SSE4_c3_and_msk1_m128i);
__m128i c3_mul_rslt_m128i = _mm_mullo_epi32(c3_and_rslt_m128i, SSE4_c3_mul_msk1_m128i);
__m128i c3_rslt_m128i = _mm_srli_epi32(c3_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c3_rslt_m128i);
presult = presult + 4;
SSE4_c3_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, 0xFF, segment + 2, 0xFF, 0xFF, 0xFF, segment + 2, 0xFF, 0xFF,
segment + 2, segment + 1, 0xFF, 0xFF, 0xFF, segment + 1);
c3_shfl_rslt_m128i = _mm_shuffle_epi8(c3_load_vec_m128i, SSE4_c3_shfl_msk_m128i);
c3_and_rslt_m128i = _mm_and_si128(c3_shfl_rslt_m128i, SSE4_c3_and_msk2_m128i);
c3_mul_rslt_m128i = _mm_mullo_epi32(c3_and_rslt_m128i, SSE4_c3_mul_msk2_m128i);
c3_rslt_m128i = _mm_srli_epi32(c3_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c3_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c3_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c3_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i); // 16 bytes hold 42 complete 3-bit values
decompress_sse4_c3_m2<0>(c3_load_rslt1_m128i, presult); // gets 1st 8 values
decompress_sse4_c3_m2<3>(c3_load_rslt1_m128i, presult); // 2nd 8 values
decompress_sse4_c3_m2<6>(c3_load_rslt1_m128i, presult); // 3rd 8 values
decompress_sse4_c3_m2<9>(c3_load_rslt1_m128i, presult); // 4th 8 values
decompress_sse4_c3_m2<12>(c3_load_rslt1_m128i, presult); // 5th 8 values
__m128i c3_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c3_align_rslt1_m128i = _mm_alignr_epi8(c3_load_rslt2_m128i, c3_load_rslt1_m128i, 15);
decompress_sse4_c3_m2<0>(c3_align_rslt1_m128i, presult); // gets 1st 8 values
decompress_sse4_c3_m2<3>(c3_align_rslt1_m128i, presult); // 2nd 8 values
decompress_sse4_c3_m2<6>(c3_align_rslt1_m128i, presult); // 3rd 8 values
decompress_sse4_c3_m2<9>(c3_align_rslt1_m128i, presult); // 4th 8 values
decompress_sse4_c3_m2<12>(c3_align_rslt1_m128i, presult); // 5th 8 values
__m128i c3_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c3_align_rslt2_m128i = _mm_alignr_epi8(c3_load_rslt3_m128i, c3_load_rslt2_m128i, 14);
decompress_sse4_c3_m2<0>(c3_align_rslt2_m128i, presult); // gets 1st 8 values
decompress_sse4_c3_m2<3>(c3_align_rslt2_m128i, presult); // 2nd 8 values
decompress_sse4_c3_m2<6>(c3_align_rslt2_m128i, presult); // 3rd 8 values
decompress_sse4_c3_m2<9>(c3_align_rslt2_m128i, presult); // 4th 8 values
decompress_sse4_c3_m2<12>(c3_align_rslt2_m128i, presult); // 5th 8 values
decompress_sse4_c3_m2<13>(c3_load_rslt3_m128i, presult);
}
void decompress_sse4_c3(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c3_m1(vector_m128i, presult + index, offset);
offset += 3;
index += 128;
}
if (n & 0x7F) {
unpack_3(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c4
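// Bit width 4: two values per byte (low and high nibble), so one 16-byte block
// yields 32 values.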
const __m128i SSE4_c4_and_msk_m128i =
_mm_set_epi8(0x00, 0x00, 0x00, 0xF0, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0xF0, 0x00, 0x00, 0x00, 0x0F);
const __m128i SSE4_c4_mul_msk_m128i = _mm_set_epi32(0x01, 0x10, 0x01, 0x10);
template <int segment>
inline void decompress_sse4_c4_m2(__m128i c4_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c4_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, 0xFF, segment + 1, 0xFF, 0xFF, 0xFF, segment + 1, 0xFF,
0xFF, 0xFF, segment, 0xFF, 0xFF, 0xFF, segment);
__m128i c4_shfl_rslt_m128i = _mm_shuffle_epi8(c4_load_vec_m128i, SSE4_c4_shfl_msk_m128i);
__m128i c4_and_rslt_m128i = _mm_and_si128(c4_shfl_rslt_m128i, SSE4_c4_and_msk_m128i);
__m128i c4_mul_rslt_m128i = _mm_mullo_epi32(c4_and_rslt_m128i, SSE4_c4_mul_msk_m128i);
__m128i c4_rslt_m128i = _mm_srli_epi32(c4_mul_rslt_m128i, 4);
_mm_storeu_si128((__m128i*)presult, c4_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c4_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c4_load_rslt_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c4_m2<0>(c4_load_rslt_m128i, presult);
decompress_sse4_c4_m2<2>(c4_load_rslt_m128i, presult);
decompress_sse4_c4_m2<4>(c4_load_rslt_m128i, presult);
decompress_sse4_c4_m2<6>(c4_load_rslt_m128i, presult);
decompress_sse4_c4_m2<8>(c4_load_rslt_m128i, presult);
decompress_sse4_c4_m2<10>(c4_load_rslt_m128i, presult);
decompress_sse4_c4_m2<12>(c4_load_rslt_m128i, presult);
decompress_sse4_c4_m2<14>(c4_load_rslt_m128i, presult);
}
void decompress_sse4_c4(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 32 <= n) {
decompress_sse4_c4_m1(vector_m128i, presult + index, offset);
offset += 1;
index += 32;
}
if (n & 0x1F) {
unpack_4(dest + index, encode + offset * 4, (n & 0x1F));
}
}
// c5
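// Bit width 5: 128 values span five 16-byte blocks (80 bytes).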
const __m128i SSE4_c5_and_msk1_m128i =
_mm_set_epi8(0x00, 0x00, 0x0F, 0x80, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x00, 0x03, 0xE0, 0x00, 0x00, 0x00, 0x1F);
const __m128i SSE4_c5_and_msk2_m128i =
_mm_set_epi8(0x00, 0x00, 0x00, 0xF8, 0x00, 0x00, 0x07, 0xC0, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x01, 0xF0);
const __m128i SSE4_c5_mul_msk1_m128i = _mm_set_epi32(0x01, 0x20, 0x04, 0x80);
const __m128i SSE4_c5_mul_msk2_m128i = _mm_set_epi32(0x08, 0x01, 0x20, 0x04);
template <int segment>
inline void decompress_sse4_c5_m2(__m128i c5_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i SSE4_c5_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, segment + 2, segment + 1, 0xFF, 0xFF, 0xFF, segment + 1,
0xFF, 0xFF, segment + 1, segment, 0xFF, 0xFF, 0xFF, segment);
__m128i c5_shfl_rslt_m128i = _mm_shuffle_epi8(c5_load_vec_m128i, SSE4_c5_shfl_msk_m128i);
__m128i c5_and_rslt_m128i = _mm_and_si128(c5_shfl_rslt_m128i, SSE4_c5_and_msk1_m128i);
__m128i c5_mul_rslt_m128i = _mm_mullo_epi32(c5_and_rslt_m128i, SSE4_c5_mul_msk1_m128i);
__m128i c5_rslt_m128i = _mm_srli_epi32(c5_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c5_rslt_m128i);
presult = presult + 4;
// decompress second 4 values
SSE4_c5_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, 0xFF, segment + 4, 0xFF, 0xFF, segment + 4, segment + 3, 0xFF,
0xFF, 0xFF, segment + 3, 0xFF, 0xFF, segment + 3, segment + 2);
c5_shfl_rslt_m128i = _mm_shuffle_epi8(c5_load_vec_m128i, SSE4_c5_shfl_msk_m128i);
c5_and_rslt_m128i = _mm_and_si128(c5_shfl_rslt_m128i, SSE4_c5_and_msk2_m128i);
c5_mul_rslt_m128i = _mm_mullo_epi32(c5_and_rslt_m128i, SSE4_c5_mul_msk2_m128i);
c5_rslt_m128i = _mm_srli_epi32(c5_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c5_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c5_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c5_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c5_m2<0>(c5_load_rslt1_m128i, presult);
decompress_sse4_c5_m2<5>(c5_load_rslt1_m128i, presult);
decompress_sse4_c5_m2<10>(c5_load_rslt1_m128i, presult);
__m128i c5_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c5_align_rslt1_m128i = _mm_alignr_epi8(c5_load_rslt2_m128i, c5_load_rslt1_m128i, 15);
decompress_sse4_c5_m2<0>(c5_align_rslt1_m128i, presult);
decompress_sse4_c5_m2<5>(c5_align_rslt1_m128i, presult);
decompress_sse4_c5_m2<10>(c5_align_rslt1_m128i, presult);
__m128i c5_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c5_align_rslt2_m128i = _mm_alignr_epi8(c5_load_rslt3_m128i, c5_load_rslt2_m128i, 14);
decompress_sse4_c5_m2<0>(c5_align_rslt2_m128i, presult);
decompress_sse4_c5_m2<5>(c5_align_rslt2_m128i, presult);
decompress_sse4_c5_m2<10>(c5_align_rslt2_m128i, presult);
__m128i c5_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c5_align_rslt3_m128i = _mm_alignr_epi8(c5_load_rslt4_m128i, c5_load_rslt3_m128i, 13);
decompress_sse4_c5_m2<0>(c5_align_rslt3_m128i, presult);
decompress_sse4_c5_m2<5>(c5_align_rslt3_m128i, presult);
decompress_sse4_c5_m2<10>(c5_align_rslt3_m128i, presult);
__m128i c5_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c5_align_rslt4_m128i = _mm_alignr_epi8(c5_load_rslt5_m128i, c5_load_rslt4_m128i, 12);
decompress_sse4_c5_m2<0>(c5_align_rslt4_m128i, presult);
decompress_sse4_c5_m2<5>(c5_align_rslt4_m128i, presult);
decompress_sse4_c5_m2<10>(c5_align_rslt4_m128i, presult);
decompress_sse4_c5_m2<11>(c5_load_rslt5_m128i, presult);
}
void decompress_sse4_c5(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c5_m1(vector_m128i, presult + index, offset);
offset += 5;
index += 128;
}
if (n & 0x7F) {
unpack_5(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c6
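// Bit width 6: 64 values span three 16-byte blocks (48 bytes).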
const __m128i SSE4_c6_and_msk1_m128i =
_mm_set_epi8(0x00, 0x00, 0x00, 0xFC, 0x00, 0x00, 0x03, 0xF0, 0x00, 0x00, 0x0F, 0xC0, 0x00, 0x00, 0x00, 0x3F);
const __m128i SSE4_c6_mul_msk1_m128i = _mm_set_epi32(0x10, 0x04, 0x01, 0x40);
template <int segment>
inline void decompress_sse4_c6_m2(__m128i c6_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i SSE4_c6_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, 0xFF, segment + 2, 0xFF, 0xFF, segment + 2, segment + 1,
0xFF, 0xFF, segment + 1, segment, 0xFF, 0xFF, 0xFF, segment);
__m128i c6_shfl_rslt_m128i = _mm_shuffle_epi8(c6_load_vec_m128i, SSE4_c6_shfl_msk_m128i);
__m128i c6_and_rslt_m128i = _mm_and_si128(c6_shfl_rslt_m128i, SSE4_c6_and_msk1_m128i);
__m128i c6_mul_rslt_m128i = _mm_mullo_epi32(c6_and_rslt_m128i, SSE4_c6_mul_msk1_m128i);
__m128i c6_rslt_m128i = _mm_srli_epi32(c6_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c6_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c6_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c6_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c6_m2<0>(c6_load_rslt1_m128i, presult);
decompress_sse4_c6_m2<3>(c6_load_rslt1_m128i, presult);
decompress_sse4_c6_m2<6>(c6_load_rslt1_m128i, presult);
decompress_sse4_c6_m2<9>(c6_load_rslt1_m128i, presult);
decompress_sse4_c6_m2<12>(c6_load_rslt1_m128i, presult);
__m128i c6_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c6_align_rslt1_m128i = _mm_alignr_epi8(c6_load_rslt2_m128i, c6_load_rslt1_m128i, 15);
decompress_sse4_c6_m2<0>(c6_align_rslt1_m128i, presult);
decompress_sse4_c6_m2<3>(c6_align_rslt1_m128i, presult);
decompress_sse4_c6_m2<6>(c6_align_rslt1_m128i, presult);
decompress_sse4_c6_m2<9>(c6_align_rslt1_m128i, presult);
decompress_sse4_c6_m2<12>(c6_align_rslt1_m128i, presult);
__m128i c6_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c6_align_rslt2_m128i = _mm_alignr_epi8(c6_load_rslt3_m128i, c6_load_rslt2_m128i, 14);
decompress_sse4_c6_m2<0>(c6_align_rslt2_m128i, presult);
decompress_sse4_c6_m2<3>(c6_align_rslt2_m128i, presult);
decompress_sse4_c6_m2<6>(c6_align_rslt2_m128i, presult);
decompress_sse4_c6_m2<9>(c6_align_rslt2_m128i, presult);
decompress_sse4_c6_m2<12>(c6_align_rslt2_m128i, presult);
decompress_sse4_c6_m2<13>(c6_load_rslt3_m128i, presult);
}
void decompress_sse4_c6(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 64 <= n) {
decompress_sse4_c6_m1(vector_m128i, presult + index, offset);
offset += 3;
index += 64;
}
if (n & 0x3F) {
unpack_6(dest + index, encode + offset * 4, (n & 0x3F));
}
}
// c7
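// Bit width 7: 128 values span seven 16-byte blocks (112 bytes).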
const __m128i SSE4_c7_and_msk1_m128i =
_mm_set_epi8(0x00, 0x00, 0x0F, 0xE0, 0x00, 0x00, 0x1F, 0xC0, 0x00, 0x00, 0x3F, 0x80, 0x00, 0x00, 0x00, 0x7F);
const __m128i SSE4_c7_and_msk2_m128i =
_mm_set_epi8(0x00, 0x00, 0x00, 0xFE, 0x00, 0x00, 0x01, 0xFC, 0x00, 0x00, 0x03, 0xF8, 0x00, 0x00, 0x07, 0xF0);
const __m128i SSE4_c7_mul_msk1_m128i = _mm_set_epi32(0x04, 0x02, 0x01, 0x80);
const __m128i SSE4_c7_mul_msk2_m128i = _mm_set_epi32(0x08, 0x04, 0x02, 0x01);
template <int segment>
inline void decompress_sse4_c7_m2(__m128i c7_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i SSE4_c7_shfl_msk_m128i =
_mm_set_epi8(0xFF, 0xFF, segment + 3, segment + 2, 0xFF, 0xFF, segment + 2, segment + 1, 0xFF, 0xFF,
segment + 1, segment, 0xFF, 0xFF, 0xFF, segment);
__m128i c7_shfl_rslt_m128i = _mm_shuffle_epi8(c7_load_vec_m128i, SSE4_c7_shfl_msk_m128i);
__m128i c7_and_rslt_m128i = _mm_and_si128(c7_shfl_rslt_m128i, SSE4_c7_and_msk1_m128i);
__m128i c7_mul_rslt_m128i = _mm_mullo_epi32(c7_and_rslt_m128i, SSE4_c7_mul_msk1_m128i);
__m128i c7_rslt_m128i = _mm_srli_epi32(c7_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c7_rslt_m128i);
presult = presult + 4;
// decompress second 4 values
SSE4_c7_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, 0xFF, segment + 6, 0xFF, 0xFF, segment + 6, segment + 5, 0xFF,
0xFF, segment + 5, segment + 4, 0xFF, 0xFF, segment + 4, segment + 3);
c7_shfl_rslt_m128i = _mm_shuffle_epi8(c7_load_vec_m128i, SSE4_c7_shfl_msk_m128i);
c7_and_rslt_m128i = _mm_and_si128(c7_shfl_rslt_m128i, SSE4_c7_and_msk2_m128i);
c7_mul_rslt_m128i = _mm_mullo_epi32(c7_and_rslt_m128i, SSE4_c7_mul_msk2_m128i);
c7_rslt_m128i = _mm_srli_epi32(c7_mul_rslt_m128i, 4);
_mm_storeu_si128((__m128i*)presult, c7_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c7_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c7_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c7_m2<0>(c7_load_rslt1_m128i, presult);
decompress_sse4_c7_m2<7>(c7_load_rslt1_m128i, presult);
__m128i c7_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c7_align_rslt1_m128i = _mm_alignr_epi8(c7_load_rslt2_m128i, c7_load_rslt1_m128i, 14);
decompress_sse4_c7_m2<0>(c7_align_rslt1_m128i, presult);
decompress_sse4_c7_m2<7>(c7_align_rslt1_m128i, presult);
__m128i c7_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c7_align_rslt2_m128i = _mm_alignr_epi8(c7_load_rslt3_m128i, c7_load_rslt2_m128i, 12);
decompress_sse4_c7_m2<0>(c7_align_rslt2_m128i, presult);
decompress_sse4_c7_m2<7>(c7_align_rslt2_m128i, presult);
__m128i c7_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c7_align_rslt3_m128i = _mm_alignr_epi8(c7_load_rslt4_m128i, c7_load_rslt3_m128i, 10);
decompress_sse4_c7_m2<0>(c7_align_rslt3_m128i, presult);
decompress_sse4_c7_m2<7>(c7_align_rslt3_m128i, presult);
__m128i c7_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c7_align_rslt4_m128i = _mm_alignr_epi8(c7_load_rslt5_m128i, c7_load_rslt4_m128i, 8);
decompress_sse4_c7_m2<0>(c7_align_rslt4_m128i, presult);
decompress_sse4_c7_m2<7>(c7_align_rslt4_m128i, presult);
__m128i c7_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c7_align_rslt5_m128i = _mm_alignr_epi8(c7_load_rslt6_m128i, c7_load_rslt5_m128i, 6);
decompress_sse4_c7_m2<0>(c7_align_rslt5_m128i, presult);
decompress_sse4_c7_m2<7>(c7_align_rslt5_m128i, presult);
__m128i c7_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c7_align_rslt6_m128i = _mm_alignr_epi8(c7_load_rslt7_m128i, c7_load_rslt6_m128i, 4);
decompress_sse4_c7_m2<0>(c7_align_rslt6_m128i, presult);
decompress_sse4_c7_m2<7>(c7_align_rslt6_m128i, presult);
decompress_sse4_c7_m2<2>(c7_load_rslt7_m128i, presult);
decompress_sse4_c7_m2<9>(c7_load_rslt7_m128i, presult);
}
void decompress_sse4_c7(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c7_m1(vector_m128i, presult + index, offset);
offset += 7;
index += 128;
}
if (n & 0x7F) {
unpack_7(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c8
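// Bit width 8: values are byte aligned, so a single shuffle zero-extends four
// source bytes into four 32-bit lanes; no mask, multiply or shift is needed.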
template <int segment>
inline void decompress_sse4_c8_m2(__m128i c8_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c8_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, 0xFF, segment + 3, 0xFF, 0xFF, 0xFF, segment + 2, 0xFF,
0xFF, 0xFF, segment + 1, 0xFF, 0xFF, 0xFF, segment);
__m128i c8_rslt_m128i = _mm_shuffle_epi8(c8_load_vec_m128i, SSE4_c8_shfl_msk_m128i);
_mm_storeu_si128((__m128i*)presult, c8_rslt_m128i);
//__m128i c8_equal_rslt_m128i = _mm_cmpeq_epi32(c8_rslt_m128i, SSE4_zero_m128i);
// int c8_tstz_rslt_int = _mm_testz_si128(c8_equal_rslt_m128i, SSE4_set_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c8_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c8_load_rslt_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c8_m2<0>(c8_load_rslt_m128i, presult);
decompress_sse4_c8_m2<4>(c8_load_rslt_m128i, presult);
decompress_sse4_c8_m2<8>(c8_load_rslt_m128i, presult);
decompress_sse4_c8_m2<12>(c8_load_rslt_m128i, presult);
}
void decompress_sse4_c8(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 16 <= n) {
decompress_sse4_c8_m1(vector_m128i, presult + index, offset);
offset += 1;
index += 16;
}
if (n & 0xF) {
unpack_8(dest + index, encode + offset * 4, (n & 0xF));
}
}
// c9
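// Bit width 9: every value spans two bytes; 128 values occupy nine 16-byte
// blocks (144 bytes).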
const __m128i SSE4_c9_and_msk1_m128i =
_mm_set_epi8(0x00, 0x00, 0x0F, 0xF8, 0x00, 0x00, 0x07, 0xFC, 0x00, 0x00, 0x03, 0xFE, 0x00, 0x00, 0x01, 0xFF);
const __m128i SSE4_c9_and_msk2_m128i =
_mm_set_epi8(0x00, 0x00, 0xFF, 0x80, 0x00, 0x00, 0x7F, 0xC0, 0x00, 0x00, 0x3F, 0xE0, 0x00, 0x00, 0x1F, 0xF0);
const __m128i SSE4_c9_mul_msk1_m128i = _mm_set_epi32(0x01, 0x02, 0x04, 0x08);
const __m128i SSE4_c9_mul_msk2_m128i = _mm_set_epi32(0x01, 0x02, 0x04, 0x08);
template <int segment>
inline void decompress_sse4_c9_m2(__m128i c9_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i SSE4_c9_shfl_msk_m128i =
_mm_set_epi8(0xFF, 0xFF, segment + 4, segment + 3, 0xFF, 0xFF, segment + 3, segment + 2, 0xFF, 0xFF,
segment + 2, segment + 1, 0xFF, 0xFF, segment + 1, segment);
__m128i c9_shfl_rslt_m128i = _mm_shuffle_epi8(c9_load_vec_m128i, SSE4_c9_shfl_msk_m128i);
__m128i c9_and_rslt_m128i = _mm_and_si128(c9_shfl_rslt_m128i, SSE4_c9_and_msk1_m128i);
__m128i c9_mul_rslt_m128i = _mm_mullo_epi32(c9_and_rslt_m128i, SSE4_c9_mul_msk1_m128i);
__m128i c9_rslt_m128i = _mm_srli_epi32(c9_mul_rslt_m128i, 3);
_mm_storeu_si128((__m128i*)presult, c9_rslt_m128i);
presult = presult + 4;
// decompress second 4 values
SSE4_c9_shfl_msk_m128i = _mm_set_epi8(0xFF, 0xFF, segment + 8, segment + 7, 0xFF, 0xFF, segment + 7, segment + 6,
0xFF, 0xFF, segment + 6, segment + 5, 0xFF, 0xFF, segment + 5, segment + 4);
c9_shfl_rslt_m128i = _mm_shuffle_epi8(c9_load_vec_m128i, SSE4_c9_shfl_msk_m128i);
c9_and_rslt_m128i = _mm_and_si128(c9_shfl_rslt_m128i, SSE4_c9_and_msk2_m128i);
c9_mul_rslt_m128i = _mm_mullo_epi32(c9_and_rslt_m128i, SSE4_c9_mul_msk2_m128i);
c9_rslt_m128i = _mm_srli_epi32(c9_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c9_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c9_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c9_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c9_m2<0>(c9_load_rslt1_m128i, presult);
__m128i c9_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c9_align_rslt1_m128i = _mm_alignr_epi8(c9_load_rslt2_m128i, c9_load_rslt1_m128i, 9);
decompress_sse4_c9_m2<0>(c9_align_rslt1_m128i, presult);
decompress_sse4_c9_m2<2>(c9_load_rslt2_m128i, presult);
__m128i c9_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c9_align_rslt2_m128i = _mm_alignr_epi8(c9_load_rslt3_m128i, c9_load_rslt2_m128i, 11);
decompress_sse4_c9_m2<0>(c9_align_rslt2_m128i, presult);
decompress_sse4_c9_m2<4>(c9_load_rslt3_m128i, presult);
__m128i c9_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c9_align_rslt3_m128i = _mm_alignr_epi8(c9_load_rslt4_m128i, c9_load_rslt3_m128i, 13);
decompress_sse4_c9_m2<0>(c9_align_rslt3_m128i, presult);
decompress_sse4_c9_m2<6>(c9_load_rslt4_m128i, presult);
__m128i c9_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c9_align_rslt4_m128i = _mm_alignr_epi8(c9_load_rslt5_m128i, c9_load_rslt4_m128i, 15);
decompress_sse4_c9_m2<0>(c9_align_rslt4_m128i, presult);
__m128i c9_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c9_align_rslt5_m128i = _mm_alignr_epi8(c9_load_rslt6_m128i, c9_load_rslt5_m128i, 8);
decompress_sse4_c9_m2<0>(c9_align_rslt5_m128i, presult);
decompress_sse4_c9_m2<1>(c9_load_rslt6_m128i, presult);
__m128i c9_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c9_align_rslt6_m128i = _mm_alignr_epi8(c9_load_rslt7_m128i, c9_load_rslt6_m128i, 10);
decompress_sse4_c9_m2<0>(c9_align_rslt6_m128i, presult);
decompress_sse4_c9_m2<3>(c9_load_rslt7_m128i, presult);
__m128i c9_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c9_align_rslt7_m128i = _mm_alignr_epi8(c9_load_rslt8_m128i, c9_load_rslt7_m128i, 12);
decompress_sse4_c9_m2<0>(c9_align_rslt7_m128i, presult);
decompress_sse4_c9_m2<5>(c9_load_rslt8_m128i, presult);
__m128i c9_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c9_align_rslt8_m128i = _mm_alignr_epi8(c9_load_rslt9_m128i, c9_load_rslt8_m128i, 14);
decompress_sse4_c9_m2<0>(c9_align_rslt8_m128i, presult);
decompress_sse4_c9_m2<7>(c9_load_rslt9_m128i, presult);
}
void decompress_sse4_c9(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c9_m1(vector_m128i, presult + index, offset);
offset += 9;
index += 128;
}
if (n & 0x7F) {
unpack_9(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c10
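// Bit width 10: 64 values occupy five 16-byte blocks (80 bytes).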
const __m128i SSE4_c10_and_msk1_m128i =
_mm_set_epi8(0x00, 0x00, 0xFF, 0xC0, 0x00, 0x00, 0x3F, 0xF0, 0x00, 0x00, 0x0F, 0xFC, 0x00, 0x00, 0x03, 0xFF);
const __m128i SSE4_c10_mul_msk1_m128i = _mm_set_epi32(0x01, 0x04, 0x10, 0x40);
template <int segment>
inline void decompress_sse4_c10_m2(__m128i c10_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i SSE4_c10_shfl_msk_m128i =
_mm_set_epi8(0xFF, 0xFF, segment + 4, segment + 3, 0xFF, 0xFF, segment + 3, segment + 2, 0xFF, 0xFF,
segment + 2, segment + 1, 0xFF, 0xFF, segment + 1, segment);
__m128i c10_shfl_rslt_m128i = _mm_shuffle_epi8(c10_load_vec_m128i, SSE4_c10_shfl_msk_m128i);
__m128i c10_and_rslt_m128i = _mm_and_si128(c10_shfl_rslt_m128i, SSE4_c10_and_msk1_m128i);
__m128i c10_mul_rslt_m128i = _mm_mullo_epi32(c10_and_rslt_m128i, SSE4_c10_mul_msk1_m128i);
__m128i c10_rslt_m128i = _mm_srli_epi32(c10_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c10_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c10_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c10_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c10_m2<0>(c10_load_rslt1_m128i, presult);
decompress_sse4_c10_m2<5>(c10_load_rslt1_m128i, presult);
decompress_sse4_c10_m2<10>(c10_load_rslt1_m128i, presult);
__m128i c10_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c10_align_rslt1_m128i = _mm_alignr_epi8(c10_load_rslt2_m128i, c10_load_rslt1_m128i, 15);
decompress_sse4_c10_m2<0>(c10_align_rslt1_m128i, presult);
decompress_sse4_c10_m2<5>(c10_align_rslt1_m128i, presult);
decompress_sse4_c10_m2<10>(c10_align_rslt1_m128i, presult);
__m128i c10_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c10_align_rslt2_m128i = _mm_alignr_epi8(c10_load_rslt3_m128i, c10_load_rslt2_m128i, 14);
decompress_sse4_c10_m2<0>(c10_align_rslt2_m128i, presult);
decompress_sse4_c10_m2<5>(c10_align_rslt2_m128i, presult);
decompress_sse4_c10_m2<10>(c10_align_rslt2_m128i, presult);
__m128i c10_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c10_align_rslt3_m128i = _mm_alignr_epi8(c10_load_rslt4_m128i, c10_load_rslt3_m128i, 13);
decompress_sse4_c10_m2<0>(c10_align_rslt3_m128i, presult);
decompress_sse4_c10_m2<5>(c10_align_rslt3_m128i, presult);
decompress_sse4_c10_m2<10>(c10_align_rslt3_m128i, presult);
__m128i c10_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c10_align_rslt4_m128i = _mm_alignr_epi8(c10_load_rslt5_m128i, c10_load_rslt4_m128i, 12);
decompress_sse4_c10_m2<0>(c10_align_rslt4_m128i, presult);
decompress_sse4_c10_m2<5>(c10_align_rslt4_m128i, presult);
decompress_sse4_c10_m2<10>(c10_align_rslt4_m128i, presult);
decompress_sse4_c10_m2<11>(c10_load_rslt5_m128i, presult);
}
void decompress_sse4_c10(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 64 <= n) {
decompress_sse4_c10_m1(vector_m128i, presult + index, offset);
offset += 5;
index += 64;
}
if (n & 0x3F) {
unpack_10(dest + index, encode + offset * 4, (n & 0x3F));
}
}
// c11
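// Bit width 11: 128 values occupy eleven 16-byte blocks (176 bytes).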
const __m128i SSE4_c11_and_msk1_m128i =
_mm_set_epi8(0x00, 0x00, 0x0F, 0xFE, 0x00, 0x01, 0xFF, 0xC0, 0x00, 0x00, 0x3F, 0xF8, 0x00, 0x00, 0x07, 0xFF);
const __m128i SSE4_c11_and_msk2_m128i =
_mm_set_epi8(0x00, 0x00, 0xFF, 0xE0, 0x00, 0x00, 0x1F, 0xFC, 0x00, 0x03, 0xFF, 0x80, 0x00, 0x00, 0x7F, 0xF0);
const __m128i SSE4_c11_mul_msk1_m128i = _mm_set_epi32(0x20, 0x01, 0x08, 0x40);
const __m128i SSE4_c11_mul_msk2_m128i = _mm_set_epi32(0x04, 0x20, 0x01, 0x08);
////////////////////////////////////////////////
// FUNCTIONS
////////////////////////////////////////////////
template <int segment>
inline void decompress_sse4_c11_m2(__m128i c11_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i SSE4_c11_shfl_msk_m128i =
_mm_set_epi8(0xFF, 0xFF, segment + 5, segment + 4, 0xFF, segment + 4, segment + 3, segment + 2, 0xFF, 0xFF,
segment + 2, segment + 1, 0xFF, 0xFF, segment + 1, segment);
__m128i c11_shfl_rslt_m128i = _mm_shuffle_epi8(c11_load_vec_m128i, SSE4_c11_shfl_msk_m128i);
__m128i c11_and_rslt_m128i = _mm_and_si128(c11_shfl_rslt_m128i, SSE4_c11_and_msk1_m128i);
__m128i c11_mul_rslt_m128i = _mm_mullo_epi32(c11_and_rslt_m128i, SSE4_c11_mul_msk1_m128i);
__m128i c11_rslt_m128i = _mm_srli_epi32(c11_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c11_rslt_m128i);
presult = presult + 4;
// decompress second 4 values
SSE4_c11_shfl_msk_m128i =
_mm_set_epi8(0xFF, 0xFF, segment + 10, segment + 9, 0xFF, 0xFF, segment + 9, segment + 8, 0xFF, segment + 8,
segment + 7, segment + 6, 0xFF, 0xFF, segment + 6, segment + 5);
c11_shfl_rslt_m128i = _mm_shuffle_epi8(c11_load_vec_m128i, SSE4_c11_shfl_msk_m128i);
c11_and_rslt_m128i = _mm_and_si128(c11_shfl_rslt_m128i, SSE4_c11_and_msk2_m128i);
c11_mul_rslt_m128i = _mm_mullo_epi32(c11_and_rslt_m128i, SSE4_c11_mul_msk2_m128i);
c11_rslt_m128i = _mm_srli_epi32(c11_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c11_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c11_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c11_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c11_m2<0>(c11_load_rslt1_m128i, presult);
__m128i c11_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c11_align_rslt1_m128i = _mm_alignr_epi8(c11_load_rslt2_m128i, c11_load_rslt1_m128i, 11);
decompress_sse4_c11_m2<0>(c11_align_rslt1_m128i, presult);
__m128i c11_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c11_align_rslt2_m128i = _mm_alignr_epi8(c11_load_rslt3_m128i, c11_load_rslt2_m128i, 6);
decompress_sse4_c11_m2<0>(c11_align_rslt2_m128i, presult);
decompress_sse4_c11_m2<1>(c11_load_rslt3_m128i, presult);
__m128i c11_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c11_align_rslt3_m128i = _mm_alignr_epi8(c11_load_rslt4_m128i, c11_load_rslt3_m128i, 12);
decompress_sse4_c11_m2<0>(c11_align_rslt3_m128i, presult);
__m128i c11_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c11_align_rslt4_m128i = _mm_alignr_epi8(c11_load_rslt5_m128i, c11_load_rslt4_m128i, 7);
decompress_sse4_c11_m2<0>(c11_align_rslt4_m128i, presult);
decompress_sse4_c11_m2<2>(c11_load_rslt5_m128i, presult);
__m128i c11_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c11_align_rslt5_m128i = _mm_alignr_epi8(c11_load_rslt6_m128i, c11_load_rslt5_m128i, 13);
decompress_sse4_c11_m2<0>(c11_align_rslt5_m128i, presult);
__m128i c11_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c11_align_rslt6_m128i = _mm_alignr_epi8(c11_load_rslt7_m128i, c11_load_rslt6_m128i, 8);
decompress_sse4_c11_m2<0>(c11_align_rslt6_m128i, presult);
decompress_sse4_c11_m2<3>(c11_load_rslt7_m128i, presult);
__m128i c11_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c11_align_rslt7_m128i = _mm_alignr_epi8(c11_load_rslt8_m128i, c11_load_rslt7_m128i, 14);
decompress_sse4_c11_m2<0>(c11_align_rslt7_m128i, presult);
__m128i c11_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c11_align_rslt8_m128i = _mm_alignr_epi8(c11_load_rslt9_m128i, c11_load_rslt8_m128i, 9);
decompress_sse4_c11_m2<0>(c11_align_rslt8_m128i, presult);
decompress_sse4_c11_m2<4>(c11_load_rslt9_m128i, presult);
__m128i c11_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c11_align_rslt9_m128i = _mm_alignr_epi8(c11_load_rslt10_m128i, c11_load_rslt9_m128i, 15);
decompress_sse4_c11_m2<0>(c11_align_rslt9_m128i, presult);
__m128i c11_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c11_align_rslt10_m128i = _mm_alignr_epi8(c11_load_rslt11_m128i, c11_load_rslt10_m128i, 10);
decompress_sse4_c11_m2<0>(c11_align_rslt10_m128i, presult);
decompress_sse4_c11_m2<5>(c11_load_rslt11_m128i, presult);
}
void decompress_sse4_c11(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c11_m1(vector_m128i, presult + index, offset);
offset += 11;
index += 128;
}
if (n & 0x7F) {
unpack_11(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c12
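// Bit width 12: 32 values occupy three 16-byte blocks (48 bytes).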
const __m128i SSE4_c12_and_msk1_m128i =
_mm_set_epi8(0x00, 0x00, 0xFF, 0xF0, 0x00, 0x00, 0x0F, 0xFF, 0x00, 0x00, 0xFF, 0xF0, 0x00, 0x00, 0x0F, 0xFF);
const __m128i SSE4_c12_mul_msk1_m128i = _mm_set_epi32(0x01, 0x10, 0x01, 0x10);
template <int segment>
inline void decompress_sse4_c12_m2(__m128i c12_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c12_shfl_msk_m128i =
_mm_set_epi8(0xFF, 0xFF, segment + 5, segment + 4, 0xFF, 0xFF, segment + 4, segment + 3, 0xFF, 0xFF,
segment + 2, segment + 1, 0xFF, 0xFF, segment + 1, segment);
__m128i c12_shfl_rslt_m128i = _mm_shuffle_epi8(c12_load_vec_m128i, SSE4_c12_shfl_msk_m128i);
__m128i c12_and_rslt_m128i = _mm_and_si128(c12_shfl_rslt_m128i, SSE4_c12_and_msk1_m128i);
__m128i c12_mul_rslt_m128i = _mm_mullo_epi32(c12_and_rslt_m128i, SSE4_c12_mul_msk1_m128i);
__m128i c12_rslt_m128i = _mm_srli_epi32(c12_mul_rslt_m128i, 4);
_mm_storeu_si128((__m128i*)presult, c12_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c12_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c12_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c12_m2<0>(c12_load_rslt1_m128i, presult);
decompress_sse4_c12_m2<6>(c12_load_rslt1_m128i, presult);
__m128i c12_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c12_align_rslt1_m128i = _mm_alignr_epi8(c12_load_rslt2_m128i, c12_load_rslt1_m128i, 12);
decompress_sse4_c12_m2<0>(c12_align_rslt1_m128i, presult);
decompress_sse4_c12_m2<6>(c12_align_rslt1_m128i, presult);
decompress_sse4_c12_m2<8>(c12_load_rslt2_m128i, presult);
__m128i c12_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c12_align_rslt2_m128i = _mm_alignr_epi8(c12_load_rslt3_m128i, c12_load_rslt2_m128i, 14);
decompress_sse4_c12_m2<0>(c12_align_rslt2_m128i, presult);
decompress_sse4_c12_m2<6>(c12_align_rslt2_m128i, presult);
decompress_sse4_c12_m2<10>(c12_load_rslt3_m128i, presult);
}
void decompress_sse4_c12(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 32 <= n) {
decompress_sse4_c12_m1(vector_m128i, presult + index, offset);
offset += 3;
index += 32;
}
if (n & 0x1F) {
unpack_12(dest + index, encode + offset * 4, (n & 0x1F));
}
}
// c13
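// Bit width 13: 128 values occupy thirteen 16-byte blocks (208 bytes).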
const __m128i SSE4_c13_and_msk1_m128i =
_mm_set_epi8(0x00, 0x0F, 0xFF, 0x80, 0x00, 0x00, 0x7F, 0xFC, 0x00, 0x03, 0xFF, 0xE0, 0x00, 0x00, 0x1F, 0xFF);
const __m128i SSE4_c13_and_msk2_m128i =
_mm_set_epi8(0x00, 0x00, 0xFF, 0xF8, 0x00, 0x07, 0xFF, 0xC0, 0x00, 0x00, 0x3F, 0xFE, 0x00, 0x01, 0xFF, 0xF0);
const __m128i SSE4_c13_mul_msk1_m128i = _mm_set_epi32(0x01, 0x20, 0x04, 0x80);
const __m128i SSE4_c13_mul_msk2_m128i = _mm_set_epi32(0x08, 0x01, 0x20, 0x04);
template <int segment>
inline void decompress_sse4_c13_m2(__m128i c13_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i SSE4_c13_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 6, segment + 5, segment + 4, 0xFF, 0xFF, segment + 4, segment + 3, 0xFF,
segment + 3, segment + 2, segment + 1, 0xFF, 0xFF, segment + 1, segment);
__m128i c13_shfl_rslt_m128i = _mm_shuffle_epi8(c13_load_vec_m128i, SSE4_c13_shfl_msk_m128i);
__m128i c13_and_rslt_m128i = _mm_and_si128(c13_shfl_rslt_m128i, SSE4_c13_and_msk1_m128i);
__m128i c13_mul_rslt_m128i = _mm_mullo_epi32(c13_and_rslt_m128i, SSE4_c13_mul_msk1_m128i);
__m128i c13_rslt_m128i = _mm_srli_epi32(c13_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c13_rslt_m128i);
presult = presult + 4;
// decompress second 4 values
SSE4_c13_shfl_msk_m128i =
_mm_set_epi8(0xFF, 0xFF, segment + 12, segment + 11, 0xFF, segment + 11, segment + 10, segment + 9, 0xFF, 0xFF,
segment + 9, segment + 8, 0xFF, segment + 8, segment + 7, segment + 6);
c13_shfl_rslt_m128i = _mm_shuffle_epi8(c13_load_vec_m128i, SSE4_c13_shfl_msk_m128i);
c13_and_rslt_m128i = _mm_and_si128(c13_shfl_rslt_m128i, SSE4_c13_and_msk2_m128i);
c13_mul_rslt_m128i = _mm_mullo_epi32(c13_and_rslt_m128i, SSE4_c13_mul_msk2_m128i);
c13_rslt_m128i = _mm_srli_epi32(c13_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c13_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c13_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c13_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c13_m2<0>(c13_load_rslt1_m128i, presult);
__m128i c13_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c13_align_rslt1_m128i = _mm_alignr_epi8(c13_load_rslt2_m128i, c13_load_rslt1_m128i, 13);
decompress_sse4_c13_m2<0>(c13_align_rslt1_m128i, presult);
__m128i c13_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c13_align_rslt2_m128i = _mm_alignr_epi8(c13_load_rslt3_m128i, c13_load_rslt2_m128i, 10);
decompress_sse4_c13_m2<0>(c13_align_rslt2_m128i, presult);
__m128i c13_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c13_align_rslt3_m128i = _mm_alignr_epi8(c13_load_rslt4_m128i, c13_load_rslt3_m128i, 7);
decompress_sse4_c13_m2<0>(c13_align_rslt3_m128i, presult);
__m128i c13_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c13_align_rslt4_m128i = _mm_alignr_epi8(c13_load_rslt5_m128i, c13_load_rslt4_m128i, 4);
decompress_sse4_c13_m2<0>(c13_align_rslt4_m128i, presult);
decompress_sse4_c13_m2<1>(c13_load_rslt5_m128i, presult);
__m128i c13_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c13_align_rslt5_m128i = _mm_alignr_epi8(c13_load_rslt6_m128i, c13_load_rslt5_m128i, 14);
decompress_sse4_c13_m2<0>(c13_align_rslt5_m128i, presult);
__m128i c13_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c13_align_rslt6_m128i = _mm_alignr_epi8(c13_load_rslt7_m128i, c13_load_rslt6_m128i, 11);
decompress_sse4_c13_m2<0>(c13_align_rslt6_m128i, presult);
__m128i c13_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c13_align_rslt7_m128i = _mm_alignr_epi8(c13_load_rslt8_m128i, c13_load_rslt7_m128i, 8);
decompress_sse4_c13_m2<0>(c13_align_rslt7_m128i, presult);
__m128i c13_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c13_align_rslt8_m128i = _mm_alignr_epi8(c13_load_rslt9_m128i, c13_load_rslt8_m128i, 5);
decompress_sse4_c13_m2<0>(c13_align_rslt8_m128i, presult);
decompress_sse4_c13_m2<2>(c13_load_rslt9_m128i, presult);
__m128i c13_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c13_align_rslt9_m128i = _mm_alignr_epi8(c13_load_rslt10_m128i, c13_load_rslt9_m128i, 15);
decompress_sse4_c13_m2<0>(c13_align_rslt9_m128i, presult);
__m128i c13_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c13_align_rslt10_m128i = _mm_alignr_epi8(c13_load_rslt11_m128i, c13_load_rslt10_m128i, 12);
decompress_sse4_c13_m2<0>(c13_align_rslt10_m128i, presult);
__m128i c13_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c13_align_rslt11_m128i = _mm_alignr_epi8(c13_load_rslt12_m128i, c13_load_rslt11_m128i, 9);
decompress_sse4_c13_m2<0>(c13_align_rslt11_m128i, presult);
__m128i c13_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c13_align_rslt12_m128i = _mm_alignr_epi8(c13_load_rslt13_m128i, c13_load_rslt12_m128i, 6);
decompress_sse4_c13_m2<0>(c13_align_rslt12_m128i, presult);
decompress_sse4_c13_m2<3>(c13_load_rslt13_m128i, presult);
}
void decompress_sse4_c13(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c13_m1(vector_m128i, presult + index, offset);
offset += 13;
index += 128;
}
if (n & 0x7F) {
unpack_13(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c14
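// Bit width 14: 64 values occupy seven 16-byte blocks (112 bytes).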
const __m128i SSE4_c14_and_msk1_m128i =
_mm_set_epi8(0x00, 0x00, 0xFF, 0xFC, 0x00, 0x03, 0xFF, 0xF0, 0x00, 0x0F, 0xFF, 0xC0, 0x00, 0x00, 0x3F, 0xFF);
const __m128i SSE4_c14_mul_msk1_m128i = _mm_set_epi32(0x10, 0x04, 0x01, 0x40);
template <int segment>
inline void decompress_sse4_c14_m2(__m128i c14_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i SSE4_c14_shfl_msk_m128i =
_mm_set_epi8(0xFF, 0xFF, segment + 6, segment + 5, 0xFF, segment + 5, segment + 4, segment + 3, 0xFF,
segment + 3, segment + 2, segment + 1, 0xFF, 0xFF, segment + 1, segment);
__m128i c14_shfl_rslt_m128i = _mm_shuffle_epi8(c14_load_vec_m128i, SSE4_c14_shfl_msk_m128i);
__m128i c14_and_rslt_m128i = _mm_and_si128(c14_shfl_rslt_m128i, SSE4_c14_and_msk1_m128i);
__m128i c14_mul_rslt_m128i = _mm_mullo_epi32(c14_and_rslt_m128i, SSE4_c14_mul_msk1_m128i);
__m128i c14_rslt_m128i = _mm_srli_epi32(c14_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c14_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c14_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c14_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c14_m2<0>(c14_load_rslt1_m128i, presult);
decompress_sse4_c14_m2<7>(c14_load_rslt1_m128i, presult);
__m128i c14_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c14_align_rslt1_m128i = _mm_alignr_epi8(c14_load_rslt2_m128i, c14_load_rslt1_m128i, 14);
decompress_sse4_c14_m2<0>(c14_align_rslt1_m128i, presult);
decompress_sse4_c14_m2<7>(c14_align_rslt1_m128i, presult);
__m128i c14_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c14_align_rslt2_m128i = _mm_alignr_epi8(c14_load_rslt3_m128i, c14_load_rslt2_m128i, 12);
decompress_sse4_c14_m2<0>(c14_align_rslt2_m128i, presult);
decompress_sse4_c14_m2<7>(c14_align_rslt2_m128i, presult);
__m128i c14_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c14_align_rslt3_m128i = _mm_alignr_epi8(c14_load_rslt4_m128i, c14_load_rslt3_m128i, 10);
decompress_sse4_c14_m2<0>(c14_align_rslt3_m128i, presult);
decompress_sse4_c14_m2<7>(c14_align_rslt3_m128i, presult);
__m128i c14_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c14_align_rslt4_m128i = _mm_alignr_epi8(c14_load_rslt5_m128i, c14_load_rslt4_m128i, 8);
decompress_sse4_c14_m2<0>(c14_align_rslt4_m128i, presult);
decompress_sse4_c14_m2<7>(c14_align_rslt4_m128i, presult);
__m128i c14_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c14_align_rslt5_m128i = _mm_alignr_epi8(c14_load_rslt6_m128i, c14_load_rslt5_m128i, 6);
decompress_sse4_c14_m2<0>(c14_align_rslt5_m128i, presult);
decompress_sse4_c14_m2<7>(c14_align_rslt5_m128i, presult);
__m128i c14_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c14_align_rslt6_m128i = _mm_alignr_epi8(c14_load_rslt7_m128i, c14_load_rslt6_m128i, 4);
decompress_sse4_c14_m2<0>(c14_align_rslt6_m128i, presult);
decompress_sse4_c14_m2<7>(c14_align_rslt6_m128i, presult);
decompress_sse4_c14_m2<2>(c14_load_rslt7_m128i, presult);
decompress_sse4_c14_m2<9>(c14_load_rslt7_m128i, presult);
}
void decompress_sse4_c14(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 64 <= n) {
decompress_sse4_c14_m1(vector_m128i, presult + index, offset);
offset += 7;
index += 64;
}
if (n & 0x3F) {
unpack_14(dest + index, encode + offset * 4, (n & 0x3F));
}
}
// c15
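// Bit width 15: 128 values occupy fifteen 16-byte blocks (240 bytes).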
const __m128i SSE4_c15_and_msk1_m128i =
_mm_set_epi8(0x00, 0x0F, 0xFF, 0xE0, 0x00, 0x1F, 0xFF, 0xC0, 0x00, 0x3F, 0xFF, 0x80, 0x00, 0x00, 0x7F, 0xFF);
const __m128i SSE4_c15_and_msk2_m128i =
_mm_set_epi8(0x00, 0x00, 0xFF, 0xFE, 0x00, 0x01, 0xFF, 0xFC, 0x00, 0x03, 0xFF, 0xF8, 0x00, 0x07, 0xFF, 0xF0);
const __m128i SSE4_c15_mul_msk1_m128i = _mm_set_epi32(0x04, 0x02, 0x01, 0x80);
const __m128i SSE4_c15_mul_msk2_m128i = _mm_set_epi32(0x08, 0x04, 0x02, 0x01);
template <int segment>
inline void decompress_sse4_c15_m2(__m128i c15_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i SSE4_c15_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 7, segment + 6, segment + 5, 0xFF, segment + 5, segment + 4, segment + 3, 0xFF,
segment + 3, segment + 2, segment + 1, 0xFF, 0xFF, segment + 1, segment);
__m128i c15_shfl_rslt_m128i = _mm_shuffle_epi8(c15_load_vec_m128i, SSE4_c15_shfl_msk_m128i);
__m128i c15_and_rslt_m128i = _mm_and_si128(c15_shfl_rslt_m128i, SSE4_c15_and_msk1_m128i);
__m128i c15_mul_rslt_m128i = _mm_mullo_epi32(c15_and_rslt_m128i, SSE4_c15_mul_msk1_m128i);
__m128i c15_rslt_m128i = _mm_srli_epi32(c15_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c15_rslt_m128i);
presult = presult + 4;
// decompress second 4 values
SSE4_c15_shfl_msk_m128i =
_mm_set_epi8(0xFF, 0xFF, segment + 14, segment + 13, 0xFF, segment + 13, segment + 12, segment + 11, 0xFF,
segment + 11, segment + 10, segment + 9, 0xFF, segment + 9, segment + 8, segment + 7);
c15_shfl_rslt_m128i = _mm_shuffle_epi8(c15_load_vec_m128i, SSE4_c15_shfl_msk_m128i);
c15_and_rslt_m128i = _mm_and_si128(c15_shfl_rslt_m128i, SSE4_c15_and_msk2_m128i);
c15_mul_rslt_m128i = _mm_mullo_epi32(c15_and_rslt_m128i, SSE4_c15_mul_msk2_m128i);
c15_rslt_m128i = _mm_srli_epi32(c15_mul_rslt_m128i, 4);
_mm_storeu_si128((__m128i*)presult, c15_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c15_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c15_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c15_m2<0>(c15_load_rslt1_m128i, presult);
__m128i c15_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c15_align_rslt1_m128i = _mm_alignr_epi8(c15_load_rslt2_m128i, c15_load_rslt1_m128i, 15);
decompress_sse4_c15_m2<0>(c15_align_rslt1_m128i, presult);
__m128i c15_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c15_align_rslt2_m128i = _mm_alignr_epi8(c15_load_rslt3_m128i, c15_load_rslt2_m128i, 14);
decompress_sse4_c15_m2<0>(c15_align_rslt2_m128i, presult);
__m128i c15_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c15_align_rslt3_m128i = _mm_alignr_epi8(c15_load_rslt4_m128i, c15_load_rslt3_m128i, 13);
decompress_sse4_c15_m2<0>(c15_align_rslt3_m128i, presult);
__m128i c15_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c15_align_rslt4_m128i = _mm_alignr_epi8(c15_load_rslt5_m128i, c15_load_rslt4_m128i, 12);
decompress_sse4_c15_m2<0>(c15_align_rslt4_m128i, presult);
__m128i c15_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c15_align_rslt5_m128i = _mm_alignr_epi8(c15_load_rslt6_m128i, c15_load_rslt5_m128i, 11);
decompress_sse4_c15_m2<0>(c15_align_rslt5_m128i, presult);
__m128i c15_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c15_align_rslt6_m128i = _mm_alignr_epi8(c15_load_rslt7_m128i, c15_load_rslt6_m128i, 10);
decompress_sse4_c15_m2<0>(c15_align_rslt6_m128i, presult);
__m128i c15_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c15_align_rslt7_m128i = _mm_alignr_epi8(c15_load_rslt8_m128i, c15_load_rslt7_m128i, 9);
decompress_sse4_c15_m2<0>(c15_align_rslt7_m128i, presult);
__m128i c15_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c15_align_rslt8_m128i = _mm_alignr_epi8(c15_load_rslt9_m128i, c15_load_rslt8_m128i, 8);
decompress_sse4_c15_m2<0>(c15_align_rslt8_m128i, presult);
__m128i c15_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c15_align_rslt9_m128i = _mm_alignr_epi8(c15_load_rslt10_m128i, c15_load_rslt9_m128i, 7);
decompress_sse4_c15_m2<0>(c15_align_rslt9_m128i, presult);
__m128i c15_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c15_align_rslt10_m128i = _mm_alignr_epi8(c15_load_rslt11_m128i, c15_load_rslt10_m128i, 6);
decompress_sse4_c15_m2<0>(c15_align_rslt10_m128i, presult);
__m128i c15_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c15_align_rslt11_m128i = _mm_alignr_epi8(c15_load_rslt12_m128i, c15_load_rslt11_m128i, 5);
decompress_sse4_c15_m2<0>(c15_align_rslt11_m128i, presult);
__m128i c15_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c15_align_rslt12_m128i = _mm_alignr_epi8(c15_load_rslt13_m128i, c15_load_rslt12_m128i, 4);
decompress_sse4_c15_m2<0>(c15_align_rslt12_m128i, presult);
__m128i c15_load_rslt14_m128i = _mm_loadu_si128(its_vector_m128i + 13);
__m128i c15_align_rslt13_m128i = _mm_alignr_epi8(c15_load_rslt14_m128i, c15_load_rslt13_m128i, 3);
decompress_sse4_c15_m2<0>(c15_align_rslt13_m128i, presult);
__m128i c15_load_rslt15_m128i = _mm_loadu_si128(its_vector_m128i + 14);
__m128i c15_align_rslt14_m128i = _mm_alignr_epi8(c15_load_rslt15_m128i, c15_load_rslt14_m128i, 2);
decompress_sse4_c15_m2<0>(c15_align_rslt14_m128i, presult);
decompress_sse4_c15_m2<1>(c15_load_rslt15_m128i, presult);
}
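// The commented-out block below is an alternative c15 implementation: when
// built with MAVX2 it uses the AVX2 per-lane shift _mm_srlv_epi32 instead of
// the multiply-and-shift trick used above.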
/*
const __m128i SSE4_c15_and_msk1_m128i = _mm_set_epi8 (0x00, 0x0F, 0xFF, 0xE0,
0x00, 0x1F, 0xFF, 0xC0,
0x00, 0x3F, 0xFF, 0x80,
0x00, 0x00, 0x7F, 0xFF);
const __m128i SSE4_c15_and_msk2_m128i = _mm_set_epi8 (0x00, 0x00, 0xFF, 0xFE,
0x00, 0x01, 0xFF, 0xFC,
0x00, 0x03, 0xFF, 0xF8,
0x00, 0x07, 0xFF, 0xF0);
#ifdef MAVX2
const __m128i SSE4_c15_srlv_off1 = _mm_set_epi32(5, 6, 7, 0);
const __m128i SSE4_c15_srlv_off2 = _mm_set_epi32(1, 2, 3, 4);
#else
const __m128i SSE4_c15_mul_msk1_m128i = _mm_set_epi32(0x04, 0x02, 0x01, 0x80);
const __m128i SSE4_c15_mul_msk2_m128i = _mm_set_epi32(0x08, 0x04, 0x02, 0x01);
#endif
////////////////////////////////////////////////
// FUNCTIONS
////////////////////////////////////////////////
const __m128i SSE4_c15_shfl_msk_m2_1 = _mm_set_epi8(0xFF, 7, 6, 5,
0xFF, 5, 4, 3,
0xFF, 3, 2, 1,
0xFF, 0xFF, 1, 0);
const __m128i SSE4_c15_shfl_msk_m2_2 = _mm_set_epi8(0xFF, 0xFF, 14, 13,
0xFF, 13, 12, 11,
0xFF, 11, 10, 9,
0xFF, 9, 8, 7);
template <int segment>
inline void decompress_sse4_c15_m2(__m128i c15_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i c15_shfl_rslt_m128i = _mm_shuffle_epi8(c15_load_vec_m128i, SSE4_c15_shfl_msk_m2_1);
__m128i c15_and_rslt_m128i = _mm_and_si128(c15_shfl_rslt_m128i, SSE4_c15_and_msk1_m128i);
#ifdef MAVX2
__m128i c15_rslt_m128i = _mm_srlv_epi32(c15_and_rslt_m128i, SSE4_c15_srlv_off1);
#else
__m128i c15_mul_rslt_m128i = _mm_mullo_epi32(c15_and_rslt_m128i, SSE4_c15_mul_msk1_m128i);
__m128i c15_rslt_m128i = _mm_srli_epi32(c15_mul_rslt_m128i, 7);
#endif
_mm_storeu_si128((__m128i*)presult, c15_rslt_m128i);
presult = presult + 4;
// decompress second 4 values
c15_shfl_rslt_m128i = _mm_shuffle_epi8(c15_load_vec_m128i, SSE4_c15_shfl_msk_m2_2);
c15_and_rslt_m128i = _mm_and_si128(c15_shfl_rslt_m128i, SSE4_c15_and_msk2_m128i);
#ifdef MAVX2
c15_rslt_m128i = _mm_srlv_epi32(c15_and_rslt_m128i, SSE4_c15_srlv_off2);
#else
c15_mul_rslt_m128i = _mm_mullo_epi32(c15_and_rslt_m128i, SSE4_c15_mul_msk2_m128i);
c15_rslt_m128i = _mm_srli_epi32(c15_mul_rslt_m128i, 4);
#endif
_mm_storeu_si128((__m128i*)presult, c15_rslt_m128i);
presult = presult + 4;
}
const __m128i SSE4_c15_shfl_msk_m2_last_1 = _mm_set_epi8(0xFF, 8, 7, 6,
0xFF, 6, 5, 4,
0xFF, 4, 3, 2,
0xFF, 0xFF, 2, 1);
const __m128i SSE4_c15_shfl_msk_m2_last_2 = _mm_set_epi8(0xFF, 0xFF, 15, 14,
0xFF, 14, 13, 12,
0xFF, 12, 11, 10,
0xFF, 10, 9, 8);
template <int segment>
inline void decompress_sse4_c15_m2_last(__m128i c15_load_vec_m128i, unsigned int*& presult)
{
// decompress first 4 values
__m128i c15_shfl_rslt_m128i = _mm_shuffle_epi8(c15_load_vec_m128i, SSE4_c15_shfl_msk_m2_last_1);
__m128i c15_and_rslt_m128i = _mm_and_si128(c15_shfl_rslt_m128i, SSE4_c15_and_msk1_m128i);
#ifdef MAVX2
__m128i c15_rslt_m128i = _mm_srlv_epi32(c15_and_rslt_m128i, SSE4_c15_srlv_off1);
#else
__m128i c15_mul_rslt_m128i = _mm_mullo_epi32(c15_and_rslt_m128i, SSE4_c15_mul_msk1_m128i);
__m128i c15_rslt_m128i = _mm_srli_epi32(c15_mul_rslt_m128i, 7);
#endif
_mm_storeu_si128((__m128i*)presult, c15_rslt_m128i);
presult = presult + 4;
c15_shfl_rslt_m128i = _mm_shuffle_epi8(c15_load_vec_m128i, SSE4_c15_shfl_msk_m2_last_2);
c15_and_rslt_m128i = _mm_and_si128(c15_shfl_rslt_m128i, SSE4_c15_and_msk2_m128i);
#ifdef MAVX2
c15_rslt_m128i = _mm_srlv_epi32(c15_and_rslt_m128i, SSE4_c15_srlv_off2);
#else
c15_mul_rslt_m128i = _mm_mullo_epi32(c15_and_rslt_m128i, SSE4_c15_mul_msk2_m128i);
c15_rslt_m128i = _mm_srli_epi32(c15_mul_rslt_m128i, 4);
#endif
_mm_storeu_si128((__m128i*)presult, c15_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c15_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c15_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c15_m2<0>(c15_load_rslt1_m128i, presult);
__m128i c15_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c15_align_rslt1_m128i = _mm_alignr_epi8(c15_load_rslt2_m128i, c15_load_rslt1_m128i, 15);
decompress_sse4_c15_m2<0>(c15_align_rslt1_m128i, presult);
__m128i c15_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c15_align_rslt2_m128i = _mm_alignr_epi8(c15_load_rslt3_m128i, c15_load_rslt2_m128i, 14);
decompress_sse4_c15_m2<0>(c15_align_rslt2_m128i, presult);
__m128i c15_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c15_align_rslt3_m128i = _mm_alignr_epi8(c15_load_rslt4_m128i, c15_load_rslt3_m128i, 13);
decompress_sse4_c15_m2<0>(c15_align_rslt3_m128i, presult);
__m128i c15_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c15_align_rslt4_m128i = _mm_alignr_epi8(c15_load_rslt5_m128i, c15_load_rslt4_m128i, 12);
decompress_sse4_c15_m2<0>(c15_align_rslt4_m128i, presult);
__m128i c15_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c15_align_rslt5_m128i = _mm_alignr_epi8(c15_load_rslt6_m128i, c15_load_rslt5_m128i, 11);
decompress_sse4_c15_m2<0>(c15_align_rslt5_m128i, presult);
__m128i c15_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c15_align_rslt6_m128i = _mm_alignr_epi8(c15_load_rslt7_m128i, c15_load_rslt6_m128i, 10);
decompress_sse4_c15_m2<0>(c15_align_rslt6_m128i, presult);
__m128i c15_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c15_align_rslt7_m128i = _mm_alignr_epi8(c15_load_rslt8_m128i, c15_load_rslt7_m128i, 9);
decompress_sse4_c15_m2<0>(c15_align_rslt7_m128i, presult);
__m128i c15_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c15_align_rslt8_m128i = _mm_alignr_epi8(c15_load_rslt9_m128i, c15_load_rslt8_m128i, 8);
decompress_sse4_c15_m2<0>(c15_align_rslt8_m128i, presult);
__m128i c15_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c15_align_rslt9_m128i = _mm_alignr_epi8(c15_load_rslt10_m128i, c15_load_rslt9_m128i, 7);
decompress_sse4_c15_m2<0>(c15_align_rslt9_m128i, presult);
__m128i c15_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c15_align_rslt10_m128i = _mm_alignr_epi8(c15_load_rslt11_m128i, c15_load_rslt10_m128i, 6);
decompress_sse4_c15_m2<0>(c15_align_rslt10_m128i, presult);
__m128i c15_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c15_align_rslt11_m128i = _mm_alignr_epi8(c15_load_rslt12_m128i, c15_load_rslt11_m128i, 5);
decompress_sse4_c15_m2<0>(c15_align_rslt11_m128i, presult);
__m128i c15_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c15_align_rslt12_m128i = _mm_alignr_epi8(c15_load_rslt13_m128i, c15_load_rslt12_m128i, 4);
decompress_sse4_c15_m2<0>(c15_align_rslt12_m128i, presult);
__m128i c15_load_rslt14_m128i = _mm_loadu_si128(its_vector_m128i + 13);
__m128i c15_align_rslt13_m128i = _mm_alignr_epi8(c15_load_rslt14_m128i, c15_load_rslt13_m128i, 3);
decompress_sse4_c15_m2<0>(c15_align_rslt13_m128i, presult);
__m128i c15_load_rslt15_m128i = _mm_loadu_si128(its_vector_m128i + 14);
__m128i c15_align_rslt14_m128i = _mm_alignr_epi8(c15_load_rslt15_m128i, c15_load_rslt14_m128i, 2);
decompress_sse4_c15_m2<0>(c15_align_rslt14_m128i, presult);
decompress_sse4_c15_m2_last<1>(c15_load_rslt15_m128i, presult);
}
*/
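// The commented-out block above preserves an earlier c15 variant (with a dedicated _m2_last
// step and an AVX2 _mm_srlv_epi32 path); the active implementation relies on the c15 helpers
// defined earlier in this file.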
void decompress_sse4_c15(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c15_m1(vector_m128i, presult + index, offset);
offset += 15;
index += 128;
}
if (n & 0x7F) {
unpack_15(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c16
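// 16-bit decode: a single pshufb zero-extends four 16-bit values at a time into 32-bit lanes,
// so no mask/multiply/shift is needed; each 16-byte block yields 8 values.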
template <int segment>
inline void decompress_sse4_c16_m2(__m128i c16_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c16_shfl_msk_m128i =
_mm_set_epi8(0xFF, 0xFF, segment + 7, segment + 6, 0xFF, 0xFF, segment + 5, segment + 4, 0xFF, 0xFF,
segment + 3, segment + 2, 0xFF, 0xFF, segment + 1, segment);
__m128i c16_rslt_m128i = _mm_shuffle_epi8(c16_load_vec_m128i, SSE4_c16_shfl_msk_m128i);
_mm_storeu_si128((__m128i*)presult, c16_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c16_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c16_load_rslt_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c16_m2<0>(c16_load_rslt_m128i, presult);
decompress_sse4_c16_m2<8>(c16_load_rslt_m128i, presult);
}
void decompress_sse4_c16(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 8 <= n) {
decompress_sse4_c16_m1(vector_m128i, presult + index, offset);
offset += 1;
index += 8;
}
if (n & 0x7) {
unpack_16(dest + index, encode + offset * 4, (n & 0x7));
}
}
// c17
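// 17-bit decode. Eight values span 17 bytes, so each _m1 call walks 17 consecutive 16-byte
// blocks and emits 128 values; _mm_alignr_epi8 realigns groups that straddle a block boundary.
// _m2_s1 handles a 4-value group that starts on a byte boundary, _m2_s2 one that starts at
// bit 4 of its first byte. Per group: pshufb gathers each value's bytes into a 32-bit lane,
// the AND mask isolates the payload bits, _mm_mullo_epi32 by a per-lane power of two stands
// in for the variable left shift SSE4.1 lacks, and a constant _mm_srli_epi32 right-aligns the
// result. Per lane this computes ((x & mask) * (1u << (s - b))) >> s, which equals
// (x >> b) & ((1u << 17) - 1) for a value at bit offset b and common shift s.
// The same scheme is reused for the other widths below that are not a multiple of 8 bits.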
const __m128i SSE4_c17_and_msk1_m128i =
_mm_set_epi8(0x00, 0x0F, 0xFF, 0xF8, 0x00, 0x07, 0xFF, 0xFC, 0x00, 0x03, 0xFF, 0xFE, 0x00, 0x01, 0xFF, 0xFF);
const __m128i SSE4_c17_and_msk2_m128i =
_mm_set_epi8(0x00, 0xFF, 0xFF, 0x80, 0x00, 0x7F, 0xFF, 0xC0, 0x00, 0x3F, 0xFF, 0xE0, 0x00, 0x1F, 0xFF, 0xF0);
const __m128i SSE4_c17_mul_msk1_m128i = _mm_set_epi32(0x01, 0x02, 0x04, 0x08);
const __m128i SSE4_c17_mul_msk2_m128i = _mm_set_epi32(0x01, 0x02, 0x04, 0x08);
template <int segment>
inline void decompress_sse4_c17_m2_s1(__m128i c17_load_vec_m128i, unsigned int*& presult)
{
// decompress a 4-value group that starts on a byte boundary
__m128i SSE4_c17_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 8, segment + 7, segment + 6, 0xFF, segment + 6, segment + 5, segment + 4, 0xFF,
segment + 4, segment + 3, segment + 2, 0xFF, segment + 2, segment + 1, segment);
__m128i c17_shfl_rslt_m128i = _mm_shuffle_epi8(c17_load_vec_m128i, SSE4_c17_shfl_msk_m128i);
__m128i c17_and_rslt_m128i = _mm_and_si128(c17_shfl_rslt_m128i, SSE4_c17_and_msk1_m128i);
__m128i c17_mul_rslt_m128i = _mm_mullo_epi32(c17_and_rslt_m128i, SSE4_c17_mul_msk1_m128i);
__m128i c17_rslt_m128i = _mm_srli_epi32(c17_mul_rslt_m128i, 3);
_mm_storeu_si128((__m128i*)presult, c17_rslt_m128i);
//__m128i c17_equal_rslt_m128i = _mm_cmpeq_epi32(c17_rslt_m128i, SSE4_zero_m128i);
// int c17_tstz_rslt_int = _mm_testz_si128(c17_equal_rslt_m128i, SSE4_set_m128i);
presult = presult + 4;
}
template <int segment>
inline void decompress_sse4_c17_m2_s2(__m128i c17_load_vec_m128i, unsigned int*& presult)
{
// decompress a 4-value group that starts at bit 4 of its first byte
__m128i SSE4_c17_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 8, segment + 7, segment + 6, 0xFF, segment + 6, segment + 5, segment + 4, 0xFF,
segment + 4, segment + 3, segment + 2, 0xFF, segment + 2, segment + 1, segment);
__m128i c17_shfl_rslt_m128i = _mm_shuffle_epi8(c17_load_vec_m128i, SSE4_c17_shfl_msk_m128i);
__m128i c17_and_rslt_m128i = _mm_and_si128(c17_shfl_rslt_m128i, SSE4_c17_and_msk2_m128i);
__m128i c17_mul_rslt_m128i = _mm_mullo_epi32(c17_and_rslt_m128i, SSE4_c17_mul_msk2_m128i);
__m128i c17_rslt_m128i = _mm_srli_epi32(c17_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c17_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c17_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c17_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c17_m2_s1<0>(c17_load_rslt1_m128i, presult);
__m128i c17_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c17_align_rslt1_m128i = _mm_alignr_epi8(c17_load_rslt2_m128i, c17_load_rslt1_m128i, 8);
decompress_sse4_c17_m2_s2<0>(c17_align_rslt1_m128i, presult);
decompress_sse4_c17_m2_s1<1>(c17_load_rslt2_m128i, presult);
__m128i c17_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c17_align_rslt2_m128i = _mm_alignr_epi8(c17_load_rslt3_m128i, c17_load_rslt2_m128i, 9);
decompress_sse4_c17_m2_s2<0>(c17_align_rslt2_m128i, presult);
decompress_sse4_c17_m2_s1<2>(c17_load_rslt3_m128i, presult);
__m128i c17_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c17_align_rslt3_m128i = _mm_alignr_epi8(c17_load_rslt4_m128i, c17_load_rslt3_m128i, 10);
decompress_sse4_c17_m2_s2<0>(c17_align_rslt3_m128i, presult);
decompress_sse4_c17_m2_s1<3>(c17_load_rslt4_m128i, presult);
__m128i c17_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c17_align_rslt4_m128i = _mm_alignr_epi8(c17_load_rslt5_m128i, c17_load_rslt4_m128i, 11);
decompress_sse4_c17_m2_s2<0>(c17_align_rslt4_m128i, presult);
decompress_sse4_c17_m2_s1<4>(c17_load_rslt5_m128i, presult);
__m128i c17_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c17_align_rslt5_m128i = _mm_alignr_epi8(c17_load_rslt6_m128i, c17_load_rslt5_m128i, 12);
decompress_sse4_c17_m2_s2<0>(c17_align_rslt5_m128i, presult);
decompress_sse4_c17_m2_s1<5>(c17_load_rslt6_m128i, presult);
__m128i c17_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c17_align_rslt6_m128i = _mm_alignr_epi8(c17_load_rslt7_m128i, c17_load_rslt6_m128i, 13);
decompress_sse4_c17_m2_s2<0>(c17_align_rslt6_m128i, presult);
decompress_sse4_c17_m2_s1<6>(c17_load_rslt7_m128i, presult);
__m128i c17_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c17_align_rslt7_m128i = _mm_alignr_epi8(c17_load_rslt8_m128i, c17_load_rslt7_m128i, 14);
decompress_sse4_c17_m2_s2<0>(c17_align_rslt7_m128i, presult);
decompress_sse4_c17_m2_s1<7>(c17_load_rslt8_m128i, presult);
__m128i c17_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c17_align_rslt8_m128i = _mm_alignr_epi8(c17_load_rslt9_m128i, c17_load_rslt8_m128i, 15);
decompress_sse4_c17_m2_s2<0>(c17_align_rslt8_m128i, presult);
__m128i c17_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c17_align_rslt9_m128i = _mm_alignr_epi8(c17_load_rslt10_m128i, c17_load_rslt9_m128i, 8);
decompress_sse4_c17_m2_s1<0>(c17_align_rslt9_m128i, presult);
decompress_sse4_c17_m2_s2<0>(c17_load_rslt10_m128i, presult);
__m128i c17_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c17_align_rslt10_m128i = _mm_alignr_epi8(c17_load_rslt11_m128i, c17_load_rslt10_m128i, 9);
decompress_sse4_c17_m2_s1<0>(c17_align_rslt10_m128i, presult);
decompress_sse4_c17_m2_s2<1>(c17_load_rslt11_m128i, presult);
__m128i c17_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c17_align_rslt11_m128i = _mm_alignr_epi8(c17_load_rslt12_m128i, c17_load_rslt11_m128i, 10);
decompress_sse4_c17_m2_s1<0>(c17_align_rslt11_m128i, presult);
decompress_sse4_c17_m2_s2<2>(c17_load_rslt12_m128i, presult);
__m128i c17_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c17_align_rslt12_m128i = _mm_alignr_epi8(c17_load_rslt13_m128i, c17_load_rslt12_m128i, 11);
decompress_sse4_c17_m2_s1<0>(c17_align_rslt12_m128i, presult);
decompress_sse4_c17_m2_s2<3>(c17_load_rslt13_m128i, presult);
__m128i c17_load_rslt14_m128i = _mm_loadu_si128(its_vector_m128i + 13);
__m128i c17_align_rslt13_m128i = _mm_alignr_epi8(c17_load_rslt14_m128i, c17_load_rslt13_m128i, 12);
decompress_sse4_c17_m2_s1<0>(c17_align_rslt13_m128i, presult);
decompress_sse4_c17_m2_s2<4>(c17_load_rslt14_m128i, presult);
__m128i c17_load_rslt15_m128i = _mm_loadu_si128(its_vector_m128i + 14);
__m128i c17_align_rslt14_m128i = _mm_alignr_epi8(c17_load_rslt15_m128i, c17_load_rslt14_m128i, 13);
decompress_sse4_c17_m2_s1<0>(c17_align_rslt14_m128i, presult);
decompress_sse4_c17_m2_s2<5>(c17_load_rslt15_m128i, presult);
__m128i c17_load_rslt16_m128i = _mm_loadu_si128(its_vector_m128i + 15);
__m128i c17_align_rslt15_m128i = _mm_alignr_epi8(c17_load_rslt16_m128i, c17_load_rslt15_m128i, 14);
decompress_sse4_c17_m2_s1<0>(c17_align_rslt15_m128i, presult);
decompress_sse4_c17_m2_s2<6>(c17_load_rslt16_m128i, presult);
__m128i c17_load_rslt17_m128i = _mm_loadu_si128(its_vector_m128i + 16);
__m128i c17_align_rslt16_m128i = _mm_alignr_epi8(c17_load_rslt17_m128i, c17_load_rslt16_m128i, 15);
decompress_sse4_c17_m2_s1<0>(c17_align_rslt16_m128i, presult);
decompress_sse4_c17_m2_s2<7>(c17_load_rslt17_m128i, presult);
}
void decompress_sse4_c17(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c17_m1(vector_m128i, presult + index, offset);
offset += 17;
index += 128;
}
if (n & 0x7F) {
unpack_17(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c18
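// 18-bit decode, single phase: every 4-value group spans exactly 9 bytes and starts on a
// byte boundary; each _m1 call decodes 64 values from 9 blocks.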
const __m128i SSE4_c18_and_msk1_m128i =
_mm_set_epi8(0x00, 0xFF, 0xFF, 0xC0, 0x00, 0x3F, 0xFF, 0xF0, 0x00, 0x0F, 0xFF, 0xFC, 0x00, 0x03, 0xFF, 0xFF);
const __m128i SSE4_c18_mul_msk1_m128i = _mm_set_epi32(0x01, 0x04, 0x10, 0x40);
template <int segment>
inline void decompress_sse4_c18_m2(__m128i c18_load_vec_m128i, unsigned int*& presult)
{
// decompress 4 values
__m128i SSE4_c18_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 8, segment + 7, segment + 6, 0xFF, segment + 6, segment + 5, segment + 4, 0xFF,
segment + 4, segment + 3, segment + 2, 0xFF, segment + 2, segment + 1, segment);
__m128i c18_shfl_rslt_m128i = _mm_shuffle_epi8(c18_load_vec_m128i, SSE4_c18_shfl_msk_m128i);
__m128i c18_and_rslt_m128i = _mm_and_si128(c18_shfl_rslt_m128i, SSE4_c18_and_msk1_m128i);
__m128i c18_mul_rslt_m128i = _mm_mullo_epi32(c18_and_rslt_m128i, SSE4_c18_mul_msk1_m128i);
__m128i c18_rslt_m128i = _mm_srli_epi32(c18_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c18_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c18_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c18_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c18_m2<0>(c18_load_rslt1_m128i, presult);
__m128i c18_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c18_align_rslt1_m128i = _mm_alignr_epi8(c18_load_rslt2_m128i, c18_load_rslt1_m128i, 9);
decompress_sse4_c18_m2<0>(c18_align_rslt1_m128i, presult);
decompress_sse4_c18_m2<2>(c18_load_rslt2_m128i, presult);
__m128i c18_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c18_align_rslt2_m128i = _mm_alignr_epi8(c18_load_rslt3_m128i, c18_load_rslt2_m128i, 11);
decompress_sse4_c18_m2<0>(c18_align_rslt2_m128i, presult);
decompress_sse4_c18_m2<4>(c18_load_rslt3_m128i, presult);
__m128i c18_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c18_align_rslt3_m128i = _mm_alignr_epi8(c18_load_rslt4_m128i, c18_load_rslt3_m128i, 13);
decompress_sse4_c18_m2<0>(c18_align_rslt3_m128i, presult);
decompress_sse4_c18_m2<6>(c18_load_rslt4_m128i, presult);
__m128i c18_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c18_align_rslt4_m128i = _mm_alignr_epi8(c18_load_rslt5_m128i, c18_load_rslt4_m128i, 15);
decompress_sse4_c18_m2<0>(c18_align_rslt4_m128i, presult);
__m128i c18_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c18_align_rslt5_m128i = _mm_alignr_epi8(c18_load_rslt6_m128i, c18_load_rslt5_m128i, 8);
decompress_sse4_c18_m2<0>(c18_align_rslt5_m128i, presult);
decompress_sse4_c18_m2<1>(c18_load_rslt6_m128i, presult);
__m128i c18_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c18_align_rslt6_m128i = _mm_alignr_epi8(c18_load_rslt7_m128i, c18_load_rslt6_m128i, 10);
decompress_sse4_c18_m2<0>(c18_align_rslt6_m128i, presult);
decompress_sse4_c18_m2<3>(c18_load_rslt7_m128i, presult);
__m128i c18_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c18_align_rslt7_m128i = _mm_alignr_epi8(c18_load_rslt8_m128i, c18_load_rslt7_m128i, 12);
decompress_sse4_c18_m2<0>(c18_align_rslt7_m128i, presult);
decompress_sse4_c18_m2<5>(c18_load_rslt8_m128i, presult);
__m128i c18_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c18_align_rslt8_m128i = _mm_alignr_epi8(c18_load_rslt9_m128i, c18_load_rslt8_m128i, 14);
decompress_sse4_c18_m2<0>(c18_align_rslt8_m128i, presult);
decompress_sse4_c18_m2<7>(c18_load_rslt9_m128i, presult);
}
void decompress_sse4_c18(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 64 <= n) {
decompress_sse4_c18_m1(vector_m128i, presult + index, offset);
offset += 9;
index += 64;
}
if (n & 0x3F) {
unpack_18(dest + index, encode + offset * 4, (n & 0x3F));
}
}
// c19
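// 19-bit decode: 8 values span 19 bytes; each _m1 call decodes 128 values from 19 blocks,
// mixing the byte-aligned (_s1) and bit-4-aligned (_s2) group phases as the bit offset dictates.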
const __m128i SSE4_c19_and_msk1_m128i =
_mm_set_epi8(0x00, 0x0F, 0xFF, 0xFE, 0x01, 0xFF, 0xFF, 0xC0, 0x00, 0x3F, 0xFF, 0xF8, 0x00, 0x07, 0xFF, 0xFF);
const __m128i SSE4_c19_and_msk2_m128i =
_mm_set_epi8(0x00, 0xFF, 0xFF, 0xE0, 0x00, 0x1F, 0xFF, 0xFC, 0x03, 0xFF, 0xFF, 0x80, 0x00, 0x7F, 0xFF, 0xF0);
const __m128i SSE4_c19_mul_msk1_m128i = _mm_set_epi32(0x20, 0x01, 0x08, 0x40);
const __m128i SSE4_c19_mul_msk2_m128i = _mm_set_epi32(0x04, 0x20, 0x01, 0x08);
////////////////////////////////////////////////
// FUNCTIONS
////////////////////////////////////////////////
template <int segment>
inline void decompress_sse4_c19_m2_s1(__m128i c19_load_vec_m128i, unsigned int*& presult)
{
// decompress a 4-value group that starts on a byte boundary
__m128i SSE4_c19_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 9, segment + 8, segment + 7, segment + 7, segment + 6, segment + 5, segment + 4,
0xFF, segment + 4, segment + 3, segment + 2, 0xFF, segment + 2, segment + 1, segment);
__m128i c19_shfl_rslt_m128i = _mm_shuffle_epi8(c19_load_vec_m128i, SSE4_c19_shfl_msk_m128i);
__m128i c19_and_rslt_m128i = _mm_and_si128(c19_shfl_rslt_m128i, SSE4_c19_and_msk1_m128i);
__m128i c19_mul_rslt_m128i = _mm_mullo_epi32(c19_and_rslt_m128i, SSE4_c19_mul_msk1_m128i);
__m128i c19_rslt_m128i = _mm_srli_epi32(c19_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c19_rslt_m128i);
presult = presult + 4;
}
template <int segment>
inline void decompress_sse4_c19_m2_s2(__m128i c19_load_vec_m128i, unsigned int*& presult)
{
// decompress a 4-value group that starts at bit 4 of its first byte
__m128i SSE4_c19_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 9, segment + 8, segment + 7, 0xFF, segment + 7, segment + 6, segment + 5,
segment + 5, segment + 4, segment + 3, segment + 2, 0xFF, segment + 2, segment + 1, segment);
__m128i c19_shfl_rslt_m128i = _mm_shuffle_epi8(c19_load_vec_m128i, SSE4_c19_shfl_msk_m128i);
__m128i c19_and_rslt_m128i = _mm_and_si128(c19_shfl_rslt_m128i, SSE4_c19_and_msk2_m128i);
__m128i c19_mul_rslt_m128i = _mm_mullo_epi32(c19_and_rslt_m128i, SSE4_c19_mul_msk2_m128i);
__m128i c19_rslt_m128i = _mm_srli_epi32(c19_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c19_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c19_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c19_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c19_m2_s1<0>(c19_load_rslt1_m128i, presult);
__m128i c19_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c19_align_rslt1_m128i = _mm_alignr_epi8(c19_load_rslt2_m128i, c19_load_rslt1_m128i, 9);
decompress_sse4_c19_m2_s2<0>(c19_align_rslt1_m128i, presult);
decompress_sse4_c19_m2_s1<3>(c19_load_rslt2_m128i, presult);
__m128i c19_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c19_align_rslt2_m128i = _mm_alignr_epi8(c19_load_rslt3_m128i, c19_load_rslt2_m128i, 12);
decompress_sse4_c19_m2_s2<0>(c19_align_rslt2_m128i, presult);
decompress_sse4_c19_m2_s1<6>(c19_load_rslt3_m128i, presult);
__m128i c19_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c19_align_rslt3_m128i = _mm_alignr_epi8(c19_load_rslt4_m128i, c19_load_rslt3_m128i, 15);
decompress_sse4_c19_m2_s2<0>(c19_align_rslt3_m128i, presult);
__m128i c19_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c19_align_rslt4_m128i = _mm_alignr_epi8(c19_load_rslt5_m128i, c19_load_rslt4_m128i, 9);
decompress_sse4_c19_m2_s1<0>(c19_align_rslt4_m128i, presult);
decompress_sse4_c19_m2_s2<2>(c19_load_rslt5_m128i, presult);
__m128i c19_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c19_align_rslt5_m128i = _mm_alignr_epi8(c19_load_rslt6_m128i, c19_load_rslt5_m128i, 12);
decompress_sse4_c19_m2_s1<0>(c19_align_rslt5_m128i, presult);
decompress_sse4_c19_m2_s2<5>(c19_load_rslt6_m128i, presult);
__m128i c19_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c19_align_rslt6_m128i = _mm_alignr_epi8(c19_load_rslt7_m128i, c19_load_rslt6_m128i, 15);
decompress_sse4_c19_m2_s1<0>(c19_align_rslt6_m128i, presult);
__m128i c19_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c19_align_rslt7_m128i = _mm_alignr_epi8(c19_load_rslt8_m128i, c19_load_rslt7_m128i, 8);
decompress_sse4_c19_m2_s2<0>(c19_align_rslt7_m128i, presult);
decompress_sse4_c19_m2_s1<2>(c19_load_rslt8_m128i, presult);
__m128i c19_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c19_align_rslt8_m128i = _mm_alignr_epi8(c19_load_rslt9_m128i, c19_load_rslt8_m128i, 11);
decompress_sse4_c19_m2_s2<0>(c19_align_rslt8_m128i, presult);
decompress_sse4_c19_m2_s1<5>(c19_load_rslt9_m128i, presult);
__m128i c19_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c19_align_rslt9_m128i = _mm_alignr_epi8(c19_load_rslt10_m128i, c19_load_rslt9_m128i, 14);
decompress_sse4_c19_m2_s2<0>(c19_align_rslt9_m128i, presult);
__m128i c19_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c19_align_rslt10_m128i = _mm_alignr_epi8(c19_load_rslt11_m128i, c19_load_rslt10_m128i, 8);
decompress_sse4_c19_m2_s1<0>(c19_align_rslt10_m128i, presult);
decompress_sse4_c19_m2_s2<1>(c19_load_rslt11_m128i, presult);
__m128i c19_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c19_align_rslt11_m128i = _mm_alignr_epi8(c19_load_rslt12_m128i, c19_load_rslt11_m128i, 11);
decompress_sse4_c19_m2_s1<0>(c19_align_rslt11_m128i, presult);
decompress_sse4_c19_m2_s2<4>(c19_load_rslt12_m128i, presult);
__m128i c19_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c19_align_rslt12_m128i = _mm_alignr_epi8(c19_load_rslt13_m128i, c19_load_rslt12_m128i, 14);
decompress_sse4_c19_m2_s1<0>(c19_align_rslt12_m128i, presult);
__m128i c19_load_rslt14_m128i = _mm_loadu_si128(its_vector_m128i + 13);
__m128i c19_align_rslt13_m128i = _mm_alignr_epi8(c19_load_rslt14_m128i, c19_load_rslt13_m128i, 7);
decompress_sse4_c19_m2_s2<0>(c19_align_rslt13_m128i, presult);
decompress_sse4_c19_m2_s1<1>(c19_load_rslt14_m128i, presult);
__m128i c19_load_rslt15_m128i = _mm_loadu_si128(its_vector_m128i + 14);
__m128i c19_align_rslt14_m128i = _mm_alignr_epi8(c19_load_rslt15_m128i, c19_load_rslt14_m128i, 10);
decompress_sse4_c19_m2_s2<0>(c19_align_rslt14_m128i, presult);
decompress_sse4_c19_m2_s1<4>(c19_load_rslt15_m128i, presult);
__m128i c19_load_rslt16_m128i = _mm_loadu_si128(its_vector_m128i + 15);
__m128i c19_align_rslt15_m128i = _mm_alignr_epi8(c19_load_rslt16_m128i, c19_load_rslt15_m128i, 13);
decompress_sse4_c19_m2_s2<0>(c19_align_rslt15_m128i, presult);
__m128i c19_load_rslt17_m128i = _mm_loadu_si128(its_vector_m128i + 16);
__m128i c19_align_rslt16_m128i = _mm_alignr_epi8(c19_load_rslt17_m128i, c19_load_rslt16_m128i, 7);
decompress_sse4_c19_m2_s1<0>(c19_align_rslt16_m128i, presult);
decompress_sse4_c19_m2_s2<0>(c19_load_rslt17_m128i, presult);
__m128i c19_load_rslt18_m128i = _mm_loadu_si128(its_vector_m128i + 17);
__m128i c19_align_rslt17_m128i = _mm_alignr_epi8(c19_load_rslt18_m128i, c19_load_rslt17_m128i, 10);
decompress_sse4_c19_m2_s1<0>(c19_align_rslt17_m128i, presult);
decompress_sse4_c19_m2_s2<3>(c19_load_rslt18_m128i, presult);
__m128i c19_load_rslt19_m128i = _mm_loadu_si128(its_vector_m128i + 18);
__m128i c19_align_rslt18_m128i = _mm_alignr_epi8(c19_load_rslt19_m128i, c19_load_rslt18_m128i, 13);
decompress_sse4_c19_m2_s1<0>(c19_align_rslt18_m128i, presult);
decompress_sse4_c19_m2_s2<6>(c19_load_rslt19_m128i, presult);
}
void decompress_sse4_c19(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c19_m1(vector_m128i, presult + index, offset);
offset += 19;
index += 128;
}
if (n & 0x7F) {
unpack_19(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c20
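// 20-bit decode, single phase: every 4-value group spans exactly 10 bytes; each _m1 call
// decodes 32 values from 5 blocks.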
const __m128i SSE4_c20_and_msk1_m128i =
_mm_set_epi8(0x00, 0xFF, 0xFF, 0xF0, 0x00, 0x0F, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xF0, 0x00, 0x0F, 0xFF, 0xFF);
const __m128i SSE4_c20_mul_msk1_m128i = _mm_set_epi32(0x01, 0x10, 0x01, 0x10);
////////////////////////////////////////////////
// FUNCTIONS
////////////////////////////////////////////////
template <int segment>
inline void decompress_sse4_c20_m2(__m128i c20_load_vec_m128i, unsigned int*& presult)
{
// decompress 4 values
__m128i SSE4_c20_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 9, segment + 8, segment + 7, 0xFF, segment + 7, segment + 6, segment + 5, 0xFF,
segment + 4, segment + 3, segment + 2, 0xFF, segment + 2, segment + 1, segment);
__m128i c20_shfl_rslt_m128i = _mm_shuffle_epi8(c20_load_vec_m128i, SSE4_c20_shfl_msk_m128i);
__m128i c20_and_rslt_m128i = _mm_and_si128(c20_shfl_rslt_m128i, SSE4_c20_and_msk1_m128i);
__m128i c20_mul_rslt_m128i = _mm_mullo_epi32(c20_and_rslt_m128i, SSE4_c20_mul_msk1_m128i);
__m128i c20_rslt_m128i = _mm_srli_epi32(c20_mul_rslt_m128i, 4);
_mm_storeu_si128((__m128i*)presult, c20_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c20_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c20_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c20_m2<0>(c20_load_rslt1_m128i, presult);
__m128i c20_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c20_align_rslt1_m128i = _mm_alignr_epi8(c20_load_rslt2_m128i, c20_load_rslt1_m128i, 10);
decompress_sse4_c20_m2<0>(c20_align_rslt1_m128i, presult);
decompress_sse4_c20_m2<4>(c20_load_rslt2_m128i, presult);
__m128i c20_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c20_align_rslt2_m128i = _mm_alignr_epi8(c20_load_rslt3_m128i, c20_load_rslt2_m128i, 14);
decompress_sse4_c20_m2<0>(c20_align_rslt2_m128i, presult);
__m128i c20_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c20_align_rslt3_m128i = _mm_alignr_epi8(c20_load_rslt4_m128i, c20_load_rslt3_m128i, 8);
decompress_sse4_c20_m2<0>(c20_align_rslt3_m128i, presult);
decompress_sse4_c20_m2<2>(c20_load_rslt4_m128i, presult);
__m128i c20_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c20_align_rslt4_m128i = _mm_alignr_epi8(c20_load_rslt5_m128i, c20_load_rslt4_m128i, 12);
decompress_sse4_c20_m2<0>(c20_align_rslt4_m128i, presult);
decompress_sse4_c20_m2<6>(c20_load_rslt5_m128i, presult);
}
void decompress_sse4_c20(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 32 <= n) {
decompress_sse4_c20_m1(vector_m128i, presult + index, offset);
offset += 5;
index += 32;
}
if (n & 0x1F) {
unpack_20(dest + index, encode + offset * 4, (n & 0x1F));
}
}
// c21
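// 21-bit decode: 8 values span 21 bytes; each _m1 call decodes 128 values from 21 blocks
// using the byte-aligned (_s1) and bit-4-aligned (_s2) phases.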
const __m128i SSE4_c21_and_msk1_m128i =
_mm_set_epi8(0x0F, 0xFF, 0xFF, 0x80, 0x00, 0x7F, 0xFF, 0xFC, 0x03, 0xFF, 0xFF, 0xE0, 0x00, 0x1F, 0xFF, 0xFF);
const __m128i SSE4_c21_and_msk2_m128i =
_mm_set_epi8(0x00, 0xFF, 0xFF, 0xF8, 0x07, 0xFF, 0xFF, 0xC0, 0x00, 0x3F, 0xFF, 0xFE, 0x01, 0xFF, 0xFF, 0xF0);
const __m128i SSE4_c21_mul_msk1_m128i = _mm_set_epi32(0x01, 0x20, 0x04, 0x80);
const __m128i SSE4_c21_mul_msk2_m128i = _mm_set_epi32(0x08, 0x01, 0x20, 0x04);
////////////////////////////////////////////////
// FUNCTIONS
////////////////////////////////////////////////
template <int segment>
inline void decompress_sse4_c21_m2_s1(__m128i c21_load_vec_m128i, unsigned int*& presult)
{
// decompress a 4-value group that starts on a byte boundary
__m128i SSE4_c21_shfl_msk_m128i =
_mm_set_epi8(segment + 10, segment + 9, segment + 8, segment + 7, 0xFF, segment + 7, segment + 6, segment + 5,
segment + 5, segment + 4, segment + 3, segment + 2, 0xFF, segment + 2, segment + 1, segment);
__m128i c21_shfl_rslt_m128i = _mm_shuffle_epi8(c21_load_vec_m128i, SSE4_c21_shfl_msk_m128i);
__m128i c21_and_rslt_m128i = _mm_and_si128(c21_shfl_rslt_m128i, SSE4_c21_and_msk1_m128i);
__m128i c21_mul_rslt_m128i = _mm_mullo_epi32(c21_and_rslt_m128i, SSE4_c21_mul_msk1_m128i);
__m128i c21_rslt_m128i = _mm_srli_epi32(c21_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c21_rslt_m128i);
presult = presult + 4;
}
template <int segment>
inline void decompress_sse4_c21_m2_s2(__m128i c21_load_vec_m128i, unsigned int*& presult)
{
// decompress a 4-value group that starts at bit 4 of its first byte
__m128i SSE4_c21_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 10, segment + 9, segment + 8, segment + 8, segment + 7, segment + 6, segment + 5,
0xFF, segment + 5, segment + 4, segment + 3, segment + 3, segment + 2, segment + 1, segment);
__m128i c21_shfl_rslt_m128i = _mm_shuffle_epi8(c21_load_vec_m128i, SSE4_c21_shfl_msk_m128i);
__m128i c21_and_rslt_m128i = _mm_and_si128(c21_shfl_rslt_m128i, SSE4_c21_and_msk2_m128i);
__m128i c21_mul_rslt_m128i = _mm_mullo_epi32(c21_and_rslt_m128i, SSE4_c21_mul_msk2_m128i);
__m128i c21_rslt_m128i = _mm_srli_epi32(c21_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c21_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c21_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c21_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c21_m2_s1<0>(c21_load_rslt1_m128i, presult);
__m128i c21_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c21_align_rslt1_m128i = _mm_alignr_epi8(c21_load_rslt2_m128i, c21_load_rslt1_m128i, 10);
decompress_sse4_c21_m2_s2<0>(c21_align_rslt1_m128i, presult);
decompress_sse4_c21_m2_s1<5>(c21_load_rslt2_m128i, presult);
__m128i c21_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c21_align_rslt2_m128i = _mm_alignr_epi8(c21_load_rslt3_m128i, c21_load_rslt2_m128i, 15);
decompress_sse4_c21_m2_s2<0>(c21_align_rslt2_m128i, presult);
__m128i c21_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c21_align_rslt3_m128i = _mm_alignr_epi8(c21_load_rslt4_m128i, c21_load_rslt3_m128i, 10);
decompress_sse4_c21_m2_s1<0>(c21_align_rslt3_m128i, presult);
decompress_sse4_c21_m2_s2<4>(c21_load_rslt4_m128i, presult);
__m128i c21_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c21_align_rslt4_m128i = _mm_alignr_epi8(c21_load_rslt5_m128i, c21_load_rslt4_m128i, 15);
decompress_sse4_c21_m2_s1<0>(c21_align_rslt4_m128i, presult);
__m128i c21_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c21_align_rslt5_m128i = _mm_alignr_epi8(c21_load_rslt6_m128i, c21_load_rslt5_m128i, 9);
decompress_sse4_c21_m2_s2<0>(c21_align_rslt5_m128i, presult);
decompress_sse4_c21_m2_s1<4>(c21_load_rslt6_m128i, presult);
__m128i c21_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c21_align_rslt6_m128i = _mm_alignr_epi8(c21_load_rslt7_m128i, c21_load_rslt6_m128i, 14);
decompress_sse4_c21_m2_s2<0>(c21_align_rslt6_m128i, presult);
__m128i c21_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c21_align_rslt7_m128i = _mm_alignr_epi8(c21_load_rslt8_m128i, c21_load_rslt7_m128i, 9);
decompress_sse4_c21_m2_s1<0>(c21_align_rslt7_m128i, presult);
decompress_sse4_c21_m2_s2<3>(c21_load_rslt8_m128i, presult);
__m128i c21_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c21_align_rslt8_m128i = _mm_alignr_epi8(c21_load_rslt9_m128i, c21_load_rslt8_m128i, 14);
decompress_sse4_c21_m2_s1<0>(c21_align_rslt8_m128i, presult);
__m128i c21_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c21_align_rslt9_m128i = _mm_alignr_epi8(c21_load_rslt10_m128i, c21_load_rslt9_m128i, 8);
decompress_sse4_c21_m2_s2<0>(c21_align_rslt9_m128i, presult);
decompress_sse4_c21_m2_s1<3>(c21_load_rslt10_m128i, presult);
__m128i c21_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c21_align_rslt10_m128i = _mm_alignr_epi8(c21_load_rslt11_m128i, c21_load_rslt10_m128i, 13);
decompress_sse4_c21_m2_s2<0>(c21_align_rslt10_m128i, presult);
__m128i c21_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c21_align_rslt11_m128i = _mm_alignr_epi8(c21_load_rslt12_m128i, c21_load_rslt11_m128i, 8);
decompress_sse4_c21_m2_s1<0>(c21_align_rslt11_m128i, presult);
decompress_sse4_c21_m2_s2<2>(c21_load_rslt12_m128i, presult);
__m128i c21_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c21_align_rslt12_m128i = _mm_alignr_epi8(c21_load_rslt13_m128i, c21_load_rslt12_m128i, 13);
decompress_sse4_c21_m2_s1<0>(c21_align_rslt12_m128i, presult);
__m128i c21_load_rslt14_m128i = _mm_loadu_si128(its_vector_m128i + 13);
__m128i c21_align_rslt13_m128i = _mm_alignr_epi8(c21_load_rslt14_m128i, c21_load_rslt13_m128i, 7);
decompress_sse4_c21_m2_s2<0>(c21_align_rslt13_m128i, presult);
decompress_sse4_c21_m2_s1<2>(c21_load_rslt14_m128i, presult);
__m128i c21_load_rslt15_m128i = _mm_loadu_si128(its_vector_m128i + 14);
__m128i c21_align_rslt14_m128i = _mm_alignr_epi8(c21_load_rslt15_m128i, c21_load_rslt14_m128i, 12);
decompress_sse4_c21_m2_s2<0>(c21_align_rslt14_m128i, presult);
__m128i c21_load_rslt16_m128i = _mm_loadu_si128(its_vector_m128i + 15);
__m128i c21_align_rslt15_m128i = _mm_alignr_epi8(c21_load_rslt16_m128i, c21_load_rslt15_m128i, 7);
decompress_sse4_c21_m2_s1<0>(c21_align_rslt15_m128i, presult);
decompress_sse4_c21_m2_s2<1>(c21_load_rslt16_m128i, presult);
__m128i c21_load_rslt17_m128i = _mm_loadu_si128(its_vector_m128i + 16);
__m128i c21_align_rslt16_m128i = _mm_alignr_epi8(c21_load_rslt17_m128i, c21_load_rslt16_m128i, 12);
decompress_sse4_c21_m2_s1<0>(c21_align_rslt16_m128i, presult);
__m128i c21_load_rslt18_m128i = _mm_loadu_si128(its_vector_m128i + 17);
__m128i c21_align_rslt17_m128i = _mm_alignr_epi8(c21_load_rslt18_m128i, c21_load_rslt17_m128i, 6);
decompress_sse4_c21_m2_s2<0>(c21_align_rslt17_m128i, presult);
decompress_sse4_c21_m2_s1<1>(c21_load_rslt18_m128i, presult);
__m128i c21_load_rslt19_m128i = _mm_loadu_si128(its_vector_m128i + 18);
__m128i c21_align_rslt18_m128i = _mm_alignr_epi8(c21_load_rslt19_m128i, c21_load_rslt18_m128i, 11);
decompress_sse4_c21_m2_s2<0>(c21_align_rslt18_m128i, presult);
__m128i c21_load_rslt20_m128i = _mm_loadu_si128(its_vector_m128i + 19);
__m128i c21_align_rslt19_m128i = _mm_alignr_epi8(c21_load_rslt20_m128i, c21_load_rslt19_m128i, 6);
decompress_sse4_c21_m2_s1<0>(c21_align_rslt19_m128i, presult);
decompress_sse4_c21_m2_s2<0>(c21_load_rslt20_m128i, presult);
__m128i c21_load_rslt21_m128i = _mm_loadu_si128(its_vector_m128i + 20);
__m128i c21_align_rslt20_m128i = _mm_alignr_epi8(c21_load_rslt21_m128i, c21_load_rslt20_m128i, 11);
decompress_sse4_c21_m2_s1<0>(c21_align_rslt20_m128i, presult);
decompress_sse4_c21_m2_s2<5>(c21_load_rslt21_m128i, presult);
}
void decompress_sse4_c21(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c21_m1(vector_m128i, presult + index, offset);
offset += 21;
index += 128;
}
if (n & 0x7F) {
unpack_21(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c22
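// 22-bit decode, single phase: every 4-value group spans exactly 11 bytes; each _m1 call
// decodes 64 values from 11 blocks.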
const __m128i SSE4_c22_and_msk1_m128i =
_mm_set_epi8(0x00, 0xFF, 0xFF, 0xFC, 0x03, 0xFF, 0xFF, 0xF0, 0x0F, 0xFF, 0xFF, 0xC0, 0x00, 0x3F, 0xFF, 0xFF);
const __m128i SSE4_c22_mul_msk1_m128i = _mm_set_epi32(0x10, 0x04, 0x01, 0x40);
////////////////////////////////////////////////
// FUNCTIONS
////////////////////////////////////////////////
template <int segment>
inline void decompress_sse4_c22_m2(__m128i c22_load_vec_m128i, unsigned int*& presult)
{
// decompress 4 values
__m128i SSE4_c22_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 10, segment + 9, segment + 8, segment + 8, segment + 7, segment + 6, segment + 5,
segment + 5, segment + 4, segment + 3, segment + 2, 0xFF, segment + 2, segment + 1, segment);
__m128i c22_shfl_rslt_m128i = _mm_shuffle_epi8(c22_load_vec_m128i, SSE4_c22_shfl_msk_m128i);
__m128i c22_and_rslt_m128i = _mm_and_si128(c22_shfl_rslt_m128i, SSE4_c22_and_msk1_m128i);
__m128i c22_mul_rslt_m128i = _mm_mullo_epi32(c22_and_rslt_m128i, SSE4_c22_mul_msk1_m128i);
__m128i c22_rslt_m128i = _mm_srli_epi32(c22_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c22_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c22_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c22_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c22_m2<0>(c22_load_rslt1_m128i, presult);
__m128i c22_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c22_align_rslt1_m128i = _mm_alignr_epi8(c22_load_rslt2_m128i, c22_load_rslt1_m128i, 11);
decompress_sse4_c22_m2<0>(c22_align_rslt1_m128i, presult);
__m128i c22_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c22_align_rslt2_m128i = _mm_alignr_epi8(c22_load_rslt3_m128i, c22_load_rslt2_m128i, 6);
decompress_sse4_c22_m2<0>(c22_align_rslt2_m128i, presult);
decompress_sse4_c22_m2<1>(c22_load_rslt3_m128i, presult);
__m128i c22_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c22_align_rslt3_m128i = _mm_alignr_epi8(c22_load_rslt4_m128i, c22_load_rslt3_m128i, 12);
decompress_sse4_c22_m2<0>(c22_align_rslt3_m128i, presult);
__m128i c22_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c22_align_rslt4_m128i = _mm_alignr_epi8(c22_load_rslt5_m128i, c22_load_rslt4_m128i, 7);
decompress_sse4_c22_m2<0>(c22_align_rslt4_m128i, presult);
decompress_sse4_c22_m2<2>(c22_load_rslt5_m128i, presult);
__m128i c22_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c22_align_rslt5_m128i = _mm_alignr_epi8(c22_load_rslt6_m128i, c22_load_rslt5_m128i, 13);
decompress_sse4_c22_m2<0>(c22_align_rslt5_m128i, presult);
__m128i c22_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c22_align_rslt6_m128i = _mm_alignr_epi8(c22_load_rslt7_m128i, c22_load_rslt6_m128i, 8);
decompress_sse4_c22_m2<0>(c22_align_rslt6_m128i, presult);
decompress_sse4_c22_m2<3>(c22_load_rslt7_m128i, presult);
__m128i c22_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c22_align_rslt7_m128i = _mm_alignr_epi8(c22_load_rslt8_m128i, c22_load_rslt7_m128i, 14);
decompress_sse4_c22_m2<0>(c22_align_rslt7_m128i, presult);
__m128i c22_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c22_align_rslt8_m128i = _mm_alignr_epi8(c22_load_rslt9_m128i, c22_load_rslt8_m128i, 9);
decompress_sse4_c22_m2<0>(c22_align_rslt8_m128i, presult);
decompress_sse4_c22_m2<4>(c22_load_rslt9_m128i, presult);
__m128i c22_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c22_align_rslt9_m128i = _mm_alignr_epi8(c22_load_rslt10_m128i, c22_load_rslt9_m128i, 15);
decompress_sse4_c22_m2<0>(c22_align_rslt9_m128i, presult);
__m128i c22_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c22_align_rslt10_m128i = _mm_alignr_epi8(c22_load_rslt11_m128i, c22_load_rslt10_m128i, 10);
decompress_sse4_c22_m2<0>(c22_align_rslt10_m128i, presult);
decompress_sse4_c22_m2<5>(c22_load_rslt11_m128i, presult);
}
void decompress_sse4_c22(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 64 <= n) {
decompress_sse4_c22_m1(vector_m128i, presult + index, offset);
offset += 11;
index += 64;
}
if (n & 0x3F) {
unpack_22(dest + index, encode + offset * 4, (n & 0x3F));
}
}
// c23
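// 23-bit decode: 8 values span 23 bytes; each _m1 call decodes 128 values from 23 blocks.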
const __m128i SSE4_c23_and_msk1_m128i =
_mm_set_epi8(0x0F, 0xFF, 0xFF, 0xE0, 0x1F, 0xFF, 0xFF, 0xC0, 0x3F, 0xFF, 0xFF, 0x80, 0x00, 0x7F, 0xFF, 0xFF);
const __m128i SSE4_c23_and_msk2_m128i =
_mm_set_epi8(0xFF, 0xFF, 0xFF, 0xFE, 0x01, 0xFF, 0xFF, 0xFC, 0x03, 0xFF, 0xFF, 0xF8, 0x07, 0xFF, 0xFF, 0xF0);
const __m128i SSE4_c23_mul_msk1_m128i = _mm_set_epi32(0x04, 0x02, 0x01, 0x80);
const __m128i SSE4_c23_mul_msk2_m128i = _mm_set_epi32(0x08, 0x04, 0x02, 0x01);
////////////////////////////////////////////////
// FUNCTIONS
////////////////////////////////////////////////
template <int segment>
inline void decompress_sse4_c23_m2_s1(__m128i c23_load_vec_m128i, unsigned int*& presult)
{
// decompress a 4-value group that starts on a byte boundary
__m128i SSE4_c23_shfl_msk_m128i = _mm_set_epi8(segment + 11, segment + 10, segment + 9, segment + 8, segment + 8,
segment + 7, segment + 6, segment + 5, segment + 5, segment + 4,
segment + 3, segment + 2, 0xFF, segment + 2, segment + 1, segment);
__m128i c23_shfl_rslt_m128i = _mm_shuffle_epi8(c23_load_vec_m128i, SSE4_c23_shfl_msk_m128i);
__m128i c23_and_rslt_m128i = _mm_and_si128(c23_shfl_rslt_m128i, SSE4_c23_and_msk1_m128i);
__m128i c23_mul_rslt_m128i = _mm_mullo_epi32(c23_and_rslt_m128i, SSE4_c23_mul_msk1_m128i);
__m128i c23_rslt_m128i = _mm_srli_epi32(c23_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c23_rslt_m128i);
presult = presult + 4;
}
template <int segment>
inline void decompress_sse4_c23_m2_s2(__m128i c23_load_vec_m128i, unsigned int*& presult)
{
// decompress a 4-value group that starts at bit 4 of its first byte
__m128i SSE4_c23_shfl_msk_m128i = _mm_set_epi8(
0xFF, segment + 11, segment + 10, segment + 9, segment + 9, segment + 8, segment + 7, segment + 6, segment + 6,
segment + 5, segment + 4, segment + 3, segment + 3, segment + 2, segment + 1, segment);
__m128i c23_shfl_rslt_m128i = _mm_shuffle_epi8(c23_load_vec_m128i, SSE4_c23_shfl_msk_m128i);
__m128i c23_and_rslt_m128i = _mm_and_si128(c23_shfl_rslt_m128i, SSE4_c23_and_msk2_m128i);
__m128i c23_mul_rslt_m128i = _mm_mullo_epi32(c23_and_rslt_m128i, SSE4_c23_mul_msk2_m128i);
__m128i c23_rslt_m128i = _mm_srli_epi32(c23_mul_rslt_m128i, 4);
_mm_storeu_si128((__m128i*)presult, c23_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c23_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c23_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c23_m2_s1<0>(c23_load_rslt1_m128i, presult);
__m128i c23_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c23_align_rslt1_m128i = _mm_alignr_epi8(c23_load_rslt2_m128i, c23_load_rslt1_m128i, 11);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt1_m128i, presult);
__m128i c23_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c23_align_rslt2_m128i = _mm_alignr_epi8(c23_load_rslt3_m128i, c23_load_rslt2_m128i, 7);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt2_m128i, presult);
decompress_sse4_c23_m2_s2<2>(c23_load_rslt3_m128i, presult);
__m128i c23_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c23_align_rslt3_m128i = _mm_alignr_epi8(c23_load_rslt4_m128i, c23_load_rslt3_m128i, 14);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt3_m128i, presult);
__m128i c23_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c23_align_rslt4_m128i = _mm_alignr_epi8(c23_load_rslt5_m128i, c23_load_rslt4_m128i, 9);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt4_m128i, presult);
__m128i c23_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c23_align_rslt5_m128i = _mm_alignr_epi8(c23_load_rslt6_m128i, c23_load_rslt5_m128i, 5);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt5_m128i, presult);
decompress_sse4_c23_m2_s2<0>(c23_load_rslt6_m128i, presult);
__m128i c23_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c23_align_rslt6_m128i = _mm_alignr_epi8(c23_load_rslt7_m128i, c23_load_rslt6_m128i, 12);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt6_m128i, presult);
__m128i c23_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c23_align_rslt7_m128i = _mm_alignr_epi8(c23_load_rslt8_m128i, c23_load_rslt7_m128i, 7);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt7_m128i, presult);
decompress_sse4_c23_m2_s1<3>(c23_load_rslt8_m128i, presult);
__m128i c23_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c23_align_rslt8_m128i = _mm_alignr_epi8(c23_load_rslt9_m128i, c23_load_rslt8_m128i, 14);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt8_m128i, presult);
__m128i c23_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c23_align_rslt9_m128i = _mm_alignr_epi8(c23_load_rslt10_m128i, c23_load_rslt9_m128i, 10);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt9_m128i, presult);
__m128i c23_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c23_align_rslt10_m128i = _mm_alignr_epi8(c23_load_rslt11_m128i, c23_load_rslt10_m128i, 5);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt10_m128i, presult);
decompress_sse4_c23_m2_s1<1>(c23_load_rslt11_m128i, presult);
__m128i c23_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c23_align_rslt11_m128i = _mm_alignr_epi8(c23_load_rslt12_m128i, c23_load_rslt11_m128i, 12);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt11_m128i, presult);
__m128i c23_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c23_align_rslt12_m128i = _mm_alignr_epi8(c23_load_rslt13_m128i, c23_load_rslt12_m128i, 8);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt12_m128i, presult);
decompress_sse4_c23_m2_s2<3>(c23_load_rslt13_m128i, presult);
__m128i c23_load_rslt14_m128i = _mm_loadu_si128(its_vector_m128i + 13);
__m128i c23_align_rslt13_m128i = _mm_alignr_epi8(c23_load_rslt14_m128i, c23_load_rslt13_m128i, 15);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt13_m128i, presult);
__m128i c23_load_rslt15_m128i = _mm_loadu_si128(its_vector_m128i + 14);
__m128i c23_align_rslt14_m128i = _mm_alignr_epi8(c23_load_rslt15_m128i, c23_load_rslt14_m128i, 10);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt14_m128i, presult);
__m128i c23_load_rslt16_m128i = _mm_loadu_si128(its_vector_m128i + 15);
__m128i c23_align_rslt15_m128i = _mm_alignr_epi8(c23_load_rslt16_m128i, c23_load_rslt15_m128i, 6);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt15_m128i, presult);
decompress_sse4_c23_m2_s2<1>(c23_load_rslt16_m128i, presult);
__m128i c23_load_rslt17_m128i = _mm_loadu_si128(its_vector_m128i + 16);
__m128i c23_align_rslt16_m128i = _mm_alignr_epi8(c23_load_rslt17_m128i, c23_load_rslt16_m128i, 13);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt16_m128i, presult);
__m128i c23_load_rslt18_m128i = _mm_loadu_si128(its_vector_m128i + 17);
__m128i c23_align_rslt17_m128i = _mm_alignr_epi8(c23_load_rslt18_m128i, c23_load_rslt17_m128i, 8);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt17_m128i, presult);
decompress_sse4_c23_m2_s1<4>(c23_load_rslt18_m128i, presult);
__m128i c23_load_rslt19_m128i = _mm_loadu_si128(its_vector_m128i + 18);
__m128i c23_align_rslt18_m128i = _mm_alignr_epi8(c23_load_rslt19_m128i, c23_load_rslt18_m128i, 15);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt18_m128i, presult);
__m128i c23_load_rslt20_m128i = _mm_loadu_si128(its_vector_m128i + 19);
__m128i c23_align_rslt19_m128i = _mm_alignr_epi8(c23_load_rslt20_m128i, c23_load_rslt19_m128i, 11);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt19_m128i, presult);
__m128i c23_load_rslt21_m128i = _mm_loadu_si128(its_vector_m128i + 20);
__m128i c23_align_rslt20_m128i = _mm_alignr_epi8(c23_load_rslt21_m128i, c23_load_rslt20_m128i, 6);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt20_m128i, presult);
decompress_sse4_c23_m2_s1<2>(c23_load_rslt21_m128i, presult);
__m128i c23_load_rslt22_m128i = _mm_loadu_si128(its_vector_m128i + 21);
__m128i c23_align_rslt21_m128i = _mm_alignr_epi8(c23_load_rslt22_m128i, c23_load_rslt21_m128i, 13);
decompress_sse4_c23_m2_s2<0>(c23_align_rslt21_m128i, presult);
__m128i c23_load_rslt23_m128i = _mm_loadu_si128(its_vector_m128i + 22);
__m128i c23_align_rslt22_m128i = _mm_alignr_epi8(c23_load_rslt23_m128i, c23_load_rslt22_m128i, 9);
decompress_sse4_c23_m2_s1<0>(c23_align_rslt22_m128i, presult);
decompress_sse4_c23_m2_s2<4>(c23_load_rslt23_m128i, presult);
}
void decompress_sse4_c23(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c23_m1(vector_m128i, presult + index, offset);
offset += 23;
index += 128;
}
if (n & 0x7F) {
unpack_23(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c24
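// 24-bit decode: each value occupies exactly 3 bytes, so a single pshufb zero-extends 4
// values at a time with no mask or shift; each _m1 call decodes 16 values from 3 blocks.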
template <int segment>
inline void decompress_sse4_c24_m2(__m128i c24_load_vec_m128i, unsigned int*& presult)
{
// decompress 4 values
__m128i SSE4_c24_shfl_msk_m128i =
_mm_set_epi8(0xFF, segment + 11, segment + 10, segment + 9, 0xFF, segment + 8, segment + 7, segment + 6, 0xFF,
segment + 5, segment + 4, segment + 3, 0xFF, segment + 2, segment + 1, segment);
__m128i c24_rslt_m128i = _mm_shuffle_epi8(c24_load_vec_m128i, SSE4_c24_shfl_msk_m128i);
_mm_storeu_si128((__m128i*)presult, c24_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c24_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c24_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c24_m2<0>(c24_load_rslt1_m128i, presult);
__m128i c24_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c24_align_rslt1_m128i = _mm_alignr_epi8(c24_load_rslt2_m128i, c24_load_rslt1_m128i, 12);
decompress_sse4_c24_m2<0>(c24_align_rslt1_m128i, presult);
__m128i c24_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c24_align_rslt2_m128i = _mm_alignr_epi8(c24_load_rslt3_m128i, c24_load_rslt2_m128i, 8);
decompress_sse4_c24_m2<0>(c24_align_rslt2_m128i, presult);
decompress_sse4_c24_m2<4>(c24_load_rslt3_m128i, presult);
}
void decompress_sse4_c24(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 16 <= n) {
decompress_sse4_c24_m1(vector_m128i, presult + index, offset);
offset += 3;
index += 16;
}
if (n & 0xF) {
unpack_24(dest + index, encode + offset * 4, (n & 0xF));
}
}
// c25
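// 25-bit decode: 8 values span 25 bytes; each _m1 call decodes 128 values from 25 blocks.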
const __m128i SSE4_c25_and_msk1_m128i =
_mm_set_epi8(0x0F, 0xFF, 0xFF, 0xF8, 0x07, 0xFF, 0xFF, 0xFC, 0x03, 0xFF, 0xFF, 0xFE, 0x01, 0xFF, 0xFF, 0xFF);
const __m128i SSE4_c25_and_msk2_m128i =
_mm_set_epi8(0xFF, 0xFF, 0xFF, 0x80, 0x7F, 0xFF, 0xFF, 0xC0, 0x3F, 0xFF, 0xFF, 0xE0, 0x1F, 0xFF, 0xFF, 0xF0);
const __m128i SSE4_c25_mul_msk1_m128i = _mm_set_epi32(0x01, 0x02, 0x04, 0x08);
const __m128i SSE4_c25_mul_msk2_m128i = _mm_set_epi32(0x01, 0x02, 0x04, 0x08);
////////////////////////////////////////////////
// FUNCTIONS
////////////////////////////////////////////////
template <int segment>
inline void decompress_sse4_c25_m2_s1(__m128i c25_load_vec_m128i, unsigned int*& presult)
{
// decompress a 4-value group that starts on a byte boundary
__m128i SSE4_c25_shfl_msk_m128i = _mm_set_epi8(
segment + 12, segment + 11, segment + 10, segment + 9, segment + 9, segment + 8, segment + 7, segment + 6,
segment + 6, segment + 5, segment + 4, segment + 3, segment + 3, segment + 2, segment + 1, segment);
__m128i c25_shfl_rslt_m128i = _mm_shuffle_epi8(c25_load_vec_m128i, SSE4_c25_shfl_msk_m128i);
__m128i c25_and_rslt_m128i = _mm_and_si128(c25_shfl_rslt_m128i, SSE4_c25_and_msk1_m128i);
__m128i c25_mul_rslt_m128i = _mm_mullo_epi32(c25_and_rslt_m128i, SSE4_c25_mul_msk1_m128i);
__m128i c25_rslt_m128i = _mm_srli_epi32(c25_mul_rslt_m128i, 3);
_mm_storeu_si128((__m128i*)presult, c25_rslt_m128i);
presult = presult + 4;
}
template <int segment>
inline void decompress_sse4_c25_m2_s2(__m128i c25_load_vec_m128i, unsigned int*& presult)
{
// decompress a 4-value group that starts at bit 4 of its first byte
__m128i SSE4_c25_shfl_msk_m128i = _mm_set_epi8(
segment + 12, segment + 11, segment + 10, segment + 9, segment + 9, segment + 8, segment + 7, segment + 6,
segment + 6, segment + 5, segment + 4, segment + 3, segment + 3, segment + 2, segment + 1, segment);
__m128i c25_shfl_rslt_m128i = _mm_shuffle_epi8(c25_load_vec_m128i, SSE4_c25_shfl_msk_m128i);
__m128i c25_and_rslt_m128i = _mm_and_si128(c25_shfl_rslt_m128i, SSE4_c25_and_msk2_m128i);
__m128i c25_mul_rslt_m128i = _mm_mullo_epi32(c25_and_rslt_m128i, SSE4_c25_mul_msk2_m128i);
__m128i c25_rslt_m128i = _mm_srli_epi32(c25_mul_rslt_m128i, 7);
_mm_storeu_si128((__m128i*)presult, c25_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c25_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c25_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c25_m2_s1<0>(c25_load_rslt1_m128i, presult);
__m128i c25_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c25_align_rslt1_m128i = _mm_alignr_epi8(c25_load_rslt2_m128i, c25_load_rslt1_m128i, 12);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt1_m128i, presult);
__m128i c25_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c25_align_rslt2_m128i = _mm_alignr_epi8(c25_load_rslt3_m128i, c25_load_rslt2_m128i, 9);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt2_m128i, presult);
__m128i c25_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c25_align_rslt3_m128i = _mm_alignr_epi8(c25_load_rslt4_m128i, c25_load_rslt3_m128i, 5);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt3_m128i, presult);
decompress_sse4_c25_m2_s1<2>(c25_load_rslt4_m128i, presult);
__m128i c25_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c25_align_rslt4_m128i = _mm_alignr_epi8(c25_load_rslt5_m128i, c25_load_rslt4_m128i, 14);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt4_m128i, presult);
__m128i c25_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c25_align_rslt5_m128i = _mm_alignr_epi8(c25_load_rslt6_m128i, c25_load_rslt5_m128i, 11);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt5_m128i, presult);
__m128i c25_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c25_align_rslt6_m128i = _mm_alignr_epi8(c25_load_rslt7_m128i, c25_load_rslt6_m128i, 7);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt6_m128i, presult);
__m128i c25_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c25_align_rslt7_m128i = _mm_alignr_epi8(c25_load_rslt8_m128i, c25_load_rslt7_m128i, 4);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt7_m128i, presult);
decompress_sse4_c25_m2_s2<0>(c25_load_rslt8_m128i, presult);
__m128i c25_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c25_align_rslt8_m128i = _mm_alignr_epi8(c25_load_rslt9_m128i, c25_load_rslt8_m128i, 13);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt8_m128i, presult);
__m128i c25_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c25_align_rslt9_m128i = _mm_alignr_epi8(c25_load_rslt10_m128i, c25_load_rslt9_m128i, 9);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt9_m128i, presult);
__m128i c25_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c25_align_rslt10_m128i = _mm_alignr_epi8(c25_load_rslt11_m128i, c25_load_rslt10_m128i, 6);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt10_m128i, presult);
decompress_sse4_c25_m2_s2<2>(c25_load_rslt11_m128i, presult);
__m128i c25_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c25_align_rslt11_m128i = _mm_alignr_epi8(c25_load_rslt12_m128i, c25_load_rslt11_m128i, 15);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt11_m128i, presult);
__m128i c25_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c25_align_rslt12_m128i = _mm_alignr_epi8(c25_load_rslt13_m128i, c25_load_rslt12_m128i, 11);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt12_m128i, presult);
__m128i c25_load_rslt14_m128i = _mm_loadu_si128(its_vector_m128i + 13);
__m128i c25_align_rslt13_m128i = _mm_alignr_epi8(c25_load_rslt14_m128i, c25_load_rslt13_m128i, 8);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt13_m128i, presult);
__m128i c25_load_rslt15_m128i = _mm_loadu_si128(its_vector_m128i + 14);
__m128i c25_align_rslt14_m128i = _mm_alignr_epi8(c25_load_rslt15_m128i, c25_load_rslt14_m128i, 4);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt14_m128i, presult);
decompress_sse4_c25_m2_s1<1>(c25_load_rslt15_m128i, presult);
__m128i c25_load_rslt16_m128i = _mm_loadu_si128(its_vector_m128i + 15);
__m128i c25_align_rslt15_m128i = _mm_alignr_epi8(c25_load_rslt16_m128i, c25_load_rslt15_m128i, 13);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt15_m128i, presult);
__m128i c25_load_rslt17_m128i = _mm_loadu_si128(its_vector_m128i + 16);
__m128i c25_align_rslt16_m128i = _mm_alignr_epi8(c25_load_rslt17_m128i, c25_load_rslt16_m128i, 10);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt16_m128i, presult);
__m128i c25_load_rslt18_m128i = _mm_loadu_si128(its_vector_m128i + 17);
__m128i c25_align_rslt17_m128i = _mm_alignr_epi8(c25_load_rslt18_m128i, c25_load_rslt17_m128i, 6);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt17_m128i, presult);
decompress_sse4_c25_m2_s1<3>(c25_load_rslt18_m128i, presult);
__m128i c25_load_rslt19_m128i = _mm_loadu_si128(its_vector_m128i + 18);
__m128i c25_align_rslt18_m128i = _mm_alignr_epi8(c25_load_rslt19_m128i, c25_load_rslt18_m128i, 15);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt18_m128i, presult);
__m128i c25_load_rslt20_m128i = _mm_loadu_si128(its_vector_m128i + 19);
__m128i c25_align_rslt19_m128i = _mm_alignr_epi8(c25_load_rslt20_m128i, c25_load_rslt19_m128i, 12);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt19_m128i, presult);
__m128i c25_load_rslt21_m128i = _mm_loadu_si128(its_vector_m128i + 20);
__m128i c25_align_rslt20_m128i = _mm_alignr_epi8(c25_load_rslt21_m128i, c25_load_rslt20_m128i, 8);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt20_m128i, presult);
__m128i c25_load_rslt22_m128i = _mm_loadu_si128(its_vector_m128i + 21);
__m128i c25_align_rslt21_m128i = _mm_alignr_epi8(c25_load_rslt22_m128i, c25_load_rslt21_m128i, 5);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt21_m128i, presult);
decompress_sse4_c25_m2_s2<1>(c25_load_rslt22_m128i, presult);
__m128i c25_load_rslt23_m128i = _mm_loadu_si128(its_vector_m128i + 22);
__m128i c25_align_rslt22_m128i = _mm_alignr_epi8(c25_load_rslt23_m128i, c25_load_rslt22_m128i, 14);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt22_m128i, presult);
__m128i c25_load_rslt24_m128i = _mm_loadu_si128(its_vector_m128i + 23);
__m128i c25_align_rslt23_m128i = _mm_alignr_epi8(c25_load_rslt24_m128i, c25_load_rslt23_m128i, 10);
decompress_sse4_c25_m2_s2<0>(c25_align_rslt23_m128i, presult);
__m128i c25_load_rslt25_m128i = _mm_loadu_si128(its_vector_m128i + 24);
__m128i c25_align_rslt24_m128i = _mm_alignr_epi8(c25_load_rslt25_m128i, c25_load_rslt24_m128i, 7);
decompress_sse4_c25_m2_s1<0>(c25_align_rslt24_m128i, presult);
decompress_sse4_c25_m2_s2<3>(c25_load_rslt25_m128i, presult);
}
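// Driver: decodes full blocks of 128 values with the SIMD path; `offset` counts
// 128-bit vectors consumed, so the scalar tail starts at encode + offset * 4
// (4 uint32_t words per vector) and handles the remaining n % 128 values.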
void decompress_sse4_c25(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c25_m1(vector_m128i, presult + index, offset);
offset += 25;
index += 128;
}
if (n & 0x7F) {
unpack_25(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c26
const __m128i SSE4_c26_and_msk1_m128i =
_mm_set_epi8(0xFF, 0xFF, 0xFF, 0xC0, 0x3F, 0xFF, 0xFF, 0xF0, 0x0F, 0xFF, 0xFF, 0xFC, 0x03, 0xFF, 0xFF, 0xFF);
const __m128i SSE4_c26_mul_msk1_m128i = _mm_set_epi32(0x01, 0x04, 0x10, 0x40);
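// Decode pipeline shared by the cNN helpers in this file: _mm_shuffle_epi8
// gathers, for each of the four output lanes, the source bytes that contain
// that lane's packed value; _mm_and_si128 masks off neighbouring fields;
// _mm_mullo_epi32 applies a per-lane left shift (multiplying by a power of
// two) so that a single common _mm_srli_epi32 drops every field to bit 0.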
template <int segment>
inline void decompress_sse4_c26_m2(__m128i c26_load_vec_m128i, unsigned int*& presult)
{
    // decompress one group of 4 values starting at source byte offset segment
__m128i SSE4_c26_shfl_msk_m128i = _mm_set_epi8(
segment + 12, segment + 11, segment + 10, segment + 9, segment + 9, segment + 8, segment + 7, segment + 6,
segment + 6, segment + 5, segment + 4, segment + 3, segment + 3, segment + 2, segment + 1, segment);
__m128i c26_shfl_rslt_m128i = _mm_shuffle_epi8(c26_load_vec_m128i, SSE4_c26_shfl_msk_m128i);
__m128i c26_and_rslt_m128i = _mm_and_si128(c26_shfl_rslt_m128i, SSE4_c26_and_msk1_m128i);
__m128i c26_mul_rslt_m128i = _mm_mullo_epi32(c26_and_rslt_m128i, SSE4_c26_mul_msk1_m128i);
__m128i c26_rslt_m128i = _mm_srli_epi32(c26_mul_rslt_m128i, 6);
_mm_storeu_si128((__m128i*)presult, c26_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c26_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c26_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c26_m2<0>(c26_load_rslt1_m128i, presult);
__m128i c26_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c26_align_rslt1_m128i = _mm_alignr_epi8(c26_load_rslt2_m128i, c26_load_rslt1_m128i, 13);
decompress_sse4_c26_m2<0>(c26_align_rslt1_m128i, presult);
__m128i c26_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c26_align_rslt2_m128i = _mm_alignr_epi8(c26_load_rslt3_m128i, c26_load_rslt2_m128i, 10);
decompress_sse4_c26_m2<0>(c26_align_rslt2_m128i, presult);
__m128i c26_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c26_align_rslt3_m128i = _mm_alignr_epi8(c26_load_rslt4_m128i, c26_load_rslt3_m128i, 7);
decompress_sse4_c26_m2<0>(c26_align_rslt3_m128i, presult);
__m128i c26_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c26_align_rslt4_m128i = _mm_alignr_epi8(c26_load_rslt5_m128i, c26_load_rslt4_m128i, 4);
decompress_sse4_c26_m2<0>(c26_align_rslt4_m128i, presult);
decompress_sse4_c26_m2<1>(c26_load_rslt5_m128i, presult);
__m128i c26_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c26_align_rslt5_m128i = _mm_alignr_epi8(c26_load_rslt6_m128i, c26_load_rslt5_m128i, 14);
decompress_sse4_c26_m2<0>(c26_align_rslt5_m128i, presult);
__m128i c26_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c26_align_rslt6_m128i = _mm_alignr_epi8(c26_load_rslt7_m128i, c26_load_rslt6_m128i, 11);
decompress_sse4_c26_m2<0>(c26_align_rslt6_m128i, presult);
__m128i c26_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c26_align_rslt7_m128i = _mm_alignr_epi8(c26_load_rslt8_m128i, c26_load_rslt7_m128i, 8);
decompress_sse4_c26_m2<0>(c26_align_rslt7_m128i, presult);
__m128i c26_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c26_align_rslt8_m128i = _mm_alignr_epi8(c26_load_rslt9_m128i, c26_load_rslt8_m128i, 5);
decompress_sse4_c26_m2<0>(c26_align_rslt8_m128i, presult);
decompress_sse4_c26_m2<2>(c26_load_rslt9_m128i, presult);
__m128i c26_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c26_align_rslt9_m128i = _mm_alignr_epi8(c26_load_rslt10_m128i, c26_load_rslt9_m128i, 15);
decompress_sse4_c26_m2<0>(c26_align_rslt9_m128i, presult);
__m128i c26_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c26_align_rslt10_m128i = _mm_alignr_epi8(c26_load_rslt11_m128i, c26_load_rslt10_m128i, 12);
decompress_sse4_c26_m2<0>(c26_align_rslt10_m128i, presult);
__m128i c26_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c26_align_rslt11_m128i = _mm_alignr_epi8(c26_load_rslt12_m128i, c26_load_rslt11_m128i, 9);
decompress_sse4_c26_m2<0>(c26_align_rslt11_m128i, presult);
__m128i c26_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c26_align_rslt12_m128i = _mm_alignr_epi8(c26_load_rslt13_m128i, c26_load_rslt12_m128i, 6);
decompress_sse4_c26_m2<0>(c26_align_rslt12_m128i, presult);
decompress_sse4_c26_m2<3>(c26_load_rslt13_m128i, presult);
}
void decompress_sse4_c26(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 64 <= n) {
decompress_sse4_c26_m1(vector_m128i, presult + index, offset);
offset += 13;
index += 64;
}
if (n & 0x3F) {
unpack_26(dest + index, encode + offset * 4, (n & 0x3F));
}
}
// c27
const __m128i SSE4_c27_and_msk1_m128i =
_mm_set_epi8(0x0F, 0xFF, 0xFF, 0xFE, 0x07, 0xFF, 0xFF, 0xFF, 0x3F, 0xFF, 0xFF, 0xF8, 0x07, 0xFF, 0xFF, 0xFF);
const __m128i SSE4_c27_and_msk2_m128i =
_mm_set_epi8(0xFF, 0xFF, 0xFF, 0xE0, 0x1F, 0xFF, 0xFF, 0xFC, 0x07, 0xFF, 0xFF, 0xFF, 0x7F, 0xFF, 0xFF, 0xF0);
const __m128i SSE4_c27_mul_msk1_m128i = _mm_set_epi32(0x04, 0x08, 0x01, 0x08);
const __m128i SSE4_c27_mul_msk2_m128i = _mm_set_epi32(0x01, 0x08, 0x20, 0x02);
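// At 27 bits a packed value can straddle five source bytes, which is more than
// one 32-bit lane can hold after the byte shuffle. The helpers below therefore
// add a 64-bit shift (_mm_srli_epi64 / _mm_slli_epi64) and an _mm_blend_epi16
// to pre-align the affected lanes before the usual mask / multiply / shift
// sequence.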
template <int segment>
inline void decompress_sse4_c27_m2_s1(__m128i c27_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c27_shfl_msk_m128i = _mm_set_epi8(
segment + 13, segment + 12, segment + 11, segment + 10, segment + 9, segment + 8, segment + 7, segment + 6,
segment + 6, segment + 5, segment + 4, segment + 3, segment + 3, segment + 2, segment + 1, segment);
__m128i c27_shfl_rslt_m128i = _mm_shuffle_epi8(c27_load_vec_m128i, SSE4_c27_shfl_msk_m128i);
__m128i c27_shift_rslt_m128i = _mm_srli_epi64(c27_shfl_rslt_m128i, 6);
__m128i c27_blend_rslt_m128i = _mm_blend_epi16(c27_shfl_rslt_m128i, c27_shift_rslt_m128i, 0x30);
__m128i c27_and_rslt_m128i = _mm_and_si128(c27_blend_rslt_m128i, SSE4_c27_and_msk1_m128i);
__m128i c27_mul_rslt_m128i = _mm_mullo_epi32(c27_and_rslt_m128i, SSE4_c27_mul_msk1_m128i);
__m128i c27_rslt_m128i = _mm_srli_epi32(c27_mul_rslt_m128i, 3);
_mm_storeu_si128((__m128i*)presult, c27_rslt_m128i);
presult = presult + 4;
}
template <int segment>
inline void decompress_sse4_c27_m2_s2(__m128i c27_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c27_shfl_msk_m128i = _mm_set_epi8(
segment + 13, segment + 12, segment + 11, segment + 10, segment + 10, segment + 9, segment + 8, segment + 7,
segment + 7, segment + 6, segment + 5, segment + 4, segment + 3, segment + 2, segment + 1, segment);
__m128i c27_shfl_rslt_m128i = _mm_shuffle_epi8(c27_load_vec_m128i, SSE4_c27_shfl_msk_m128i);
__m128i c27_shift_rslt_m128i = _mm_slli_epi64(c27_shfl_rslt_m128i, 1);
__m128i c27_blend_rslt_m128i = _mm_blend_epi16(c27_shfl_rslt_m128i, c27_shift_rslt_m128i, 0x0C);
__m128i c27_and_rslt_m128i = _mm_and_si128(c27_blend_rslt_m128i, SSE4_c27_and_msk2_m128i);
__m128i c27_mul_rslt_m128i = _mm_mullo_epi32(c27_and_rslt_m128i, SSE4_c27_mul_msk2_m128i);
__m128i c27_rslt_m128i = _mm_srli_epi32(c27_mul_rslt_m128i, 5);
_mm_storeu_si128((__m128i*)presult, c27_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c27_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c27_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c27_m2_s1<0>(c27_load_rslt1_m128i, presult);
__m128i c27_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c27_align_rslt1_m128i = _mm_alignr_epi8(c27_load_rslt2_m128i, c27_load_rslt1_m128i, 13);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt1_m128i, presult);
__m128i c27_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c27_align_rslt2_m128i = _mm_alignr_epi8(c27_load_rslt3_m128i, c27_load_rslt2_m128i, 11);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt2_m128i, presult);
__m128i c27_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c27_align_rslt3_m128i = _mm_alignr_epi8(c27_load_rslt4_m128i, c27_load_rslt3_m128i, 8);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt3_m128i, presult);
__m128i c27_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c27_align_rslt4_m128i = _mm_alignr_epi8(c27_load_rslt5_m128i, c27_load_rslt4_m128i, 6);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt4_m128i, presult);
__m128i c27_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c27_align_rslt5_m128i = _mm_alignr_epi8(c27_load_rslt6_m128i, c27_load_rslt5_m128i, 3);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt5_m128i, presult);
decompress_sse4_c27_m2_s1<1>(c27_load_rslt6_m128i, presult);
__m128i c27_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c27_align_rslt6_m128i = _mm_alignr_epi8(c27_load_rslt7_m128i, c27_load_rslt6_m128i, 14);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt6_m128i, presult);
__m128i c27_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c27_align_rslt7_m128i = _mm_alignr_epi8(c27_load_rslt8_m128i, c27_load_rslt7_m128i, 12);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt7_m128i, presult);
__m128i c27_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c27_align_rslt8_m128i = _mm_alignr_epi8(c27_load_rslt9_m128i, c27_load_rslt8_m128i, 9);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt8_m128i, presult);
__m128i c27_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c27_align_rslt9_m128i = _mm_alignr_epi8(c27_load_rslt10_m128i, c27_load_rslt9_m128i, 7);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt9_m128i, presult);
__m128i c27_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c27_align_rslt10_m128i = _mm_alignr_epi8(c27_load_rslt11_m128i, c27_load_rslt10_m128i, 4);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt10_m128i, presult);
decompress_sse4_c27_m2_s1<2>(c27_load_rslt11_m128i, presult);
__m128i c27_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c27_align_rslt11_m128i = _mm_alignr_epi8(c27_load_rslt12_m128i, c27_load_rslt11_m128i, 15);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt11_m128i, presult);
__m128i c27_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c27_align_rslt12_m128i = _mm_alignr_epi8(c27_load_rslt13_m128i, c27_load_rslt12_m128i, 13);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt12_m128i, presult);
__m128i c27_load_rslt14_m128i = _mm_loadu_si128(its_vector_m128i + 13);
__m128i c27_align_rslt13_m128i = _mm_alignr_epi8(c27_load_rslt14_m128i, c27_load_rslt13_m128i, 10);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt13_m128i, presult);
__m128i c27_load_rslt15_m128i = _mm_loadu_si128(its_vector_m128i + 14);
__m128i c27_align_rslt14_m128i = _mm_alignr_epi8(c27_load_rslt15_m128i, c27_load_rslt14_m128i, 8);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt14_m128i, presult);
__m128i c27_load_rslt16_m128i = _mm_loadu_si128(its_vector_m128i + 15);
__m128i c27_align_rslt15_m128i = _mm_alignr_epi8(c27_load_rslt16_m128i, c27_load_rslt15_m128i, 5);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt15_m128i, presult);
__m128i c27_load_rslt17_m128i = _mm_loadu_si128(its_vector_m128i + 16);
__m128i c27_align_rslt16_m128i = _mm_alignr_epi8(c27_load_rslt17_m128i, c27_load_rslt16_m128i, 3);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt16_m128i, presult);
decompress_sse4_c27_m2_s2<0>(c27_load_rslt17_m128i, presult);
__m128i c27_load_rslt18_m128i = _mm_loadu_si128(its_vector_m128i + 17);
__m128i c27_align_rslt17_m128i = _mm_alignr_epi8(c27_load_rslt18_m128i, c27_load_rslt17_m128i, 14);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt17_m128i, presult);
__m128i c27_load_rslt19_m128i = _mm_loadu_si128(its_vector_m128i + 18);
__m128i c27_align_rslt18_m128i = _mm_alignr_epi8(c27_load_rslt19_m128i, c27_load_rslt18_m128i, 11);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt18_m128i, presult);
__m128i c27_load_rslt20_m128i = _mm_loadu_si128(its_vector_m128i + 19);
__m128i c27_align_rslt19_m128i = _mm_alignr_epi8(c27_load_rslt20_m128i, c27_load_rslt19_m128i, 9);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt19_m128i, presult);
__m128i c27_load_rslt21_m128i = _mm_loadu_si128(its_vector_m128i + 20);
__m128i c27_align_rslt20_m128i = _mm_alignr_epi8(c27_load_rslt21_m128i, c27_load_rslt20_m128i, 6);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt20_m128i, presult);
__m128i c27_load_rslt22_m128i = _mm_loadu_si128(its_vector_m128i + 21);
__m128i c27_align_rslt21_m128i = _mm_alignr_epi8(c27_load_rslt22_m128i, c27_load_rslt21_m128i, 4);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt21_m128i, presult);
decompress_sse4_c27_m2_s2<1>(c27_load_rslt22_m128i, presult);
__m128i c27_load_rslt23_m128i = _mm_loadu_si128(its_vector_m128i + 22);
__m128i c27_align_rslt22_m128i = _mm_alignr_epi8(c27_load_rslt23_m128i, c27_load_rslt22_m128i, 15);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt22_m128i, presult);
__m128i c27_load_rslt24_m128i = _mm_loadu_si128(its_vector_m128i + 23);
__m128i c27_align_rslt23_m128i = _mm_alignr_epi8(c27_load_rslt24_m128i, c27_load_rslt23_m128i, 12);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt23_m128i, presult);
__m128i c27_load_rslt25_m128i = _mm_loadu_si128(its_vector_m128i + 24);
__m128i c27_align_rslt24_m128i = _mm_alignr_epi8(c27_load_rslt25_m128i, c27_load_rslt24_m128i, 10);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt24_m128i, presult);
__m128i c27_load_rslt26_m128i = _mm_loadu_si128(its_vector_m128i + 25);
__m128i c27_align_rslt25_m128i = _mm_alignr_epi8(c27_load_rslt26_m128i, c27_load_rslt25_m128i, 7);
decompress_sse4_c27_m2_s2<0>(c27_align_rslt25_m128i, presult);
__m128i c27_load_rslt27_m128i = _mm_loadu_si128(its_vector_m128i + 26);
__m128i c27_align_rslt26_m128i = _mm_alignr_epi8(c27_load_rslt27_m128i, c27_load_rslt26_m128i, 5);
decompress_sse4_c27_m2_s1<0>(c27_align_rslt26_m128i, presult);
decompress_sse4_c27_m2_s2<2>(c27_load_rslt27_m128i, presult);
}
void decompress_sse4_c27(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c27_m1(vector_m128i, presult + index, offset);
offset += 27;
index += 128;
}
if (n & 0x7F) {
unpack_27(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c28
const __m128i SSE4_c28_and_msk1_m128i =
_mm_set_epi8(0xFF, 0xFF, 0xFF, 0xF0, 0x0F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xF0, 0x0F, 0xFF, 0xFF, 0xFF);
const __m128i SSE4_c28_mul_msk1_m128i = _mm_set_epi32(0x01, 0x10, 0x01, 0x10);
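// 28-bit values pack four to exactly 14 bytes, so every group starts on a byte
// boundary and each value fits inside four source bytes; a single shuffle /
// mask / multiply / shift step per group is enough. decompress_sse4_c28_m1
// decodes 32 values (8 groups) from 7 loaded vectors.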
template <int segment>
inline void decompress_sse4_c28_m2(__m128i c28_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c28_shfl_msk_m128i = _mm_set_epi8(
segment + 13, segment + 12, segment + 11, segment + 10, segment + 10, segment + 9, segment + 8, segment + 7,
segment + 6, segment + 5, segment + 4, segment + 3, segment + 3, segment + 2, segment + 1, segment);
__m128i c28_shfl_rslt_m128i = _mm_shuffle_epi8(c28_load_vec_m128i, SSE4_c28_shfl_msk_m128i);
__m128i c28_and_rslt_m128i = _mm_and_si128(c28_shfl_rslt_m128i, SSE4_c28_and_msk1_m128i);
__m128i c28_mul_rslt_m128i = _mm_mullo_epi32(c28_and_rslt_m128i, SSE4_c28_mul_msk1_m128i);
__m128i c28_rslt_m128i = _mm_srli_epi32(c28_mul_rslt_m128i, 4);
_mm_storeu_si128((__m128i*)presult, c28_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c28_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c28_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c28_m2<0>(c28_load_rslt1_m128i, presult);
__m128i c28_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c28_align_rslt1_m128i = _mm_alignr_epi8(c28_load_rslt2_m128i, c28_load_rslt1_m128i, 14);
decompress_sse4_c28_m2<0>(c28_align_rslt1_m128i, presult);
__m128i c28_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c28_align_rslt2_m128i = _mm_alignr_epi8(c28_load_rslt3_m128i, c28_load_rslt2_m128i, 12);
decompress_sse4_c28_m2<0>(c28_align_rslt2_m128i, presult);
__m128i c28_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c28_align_rslt3_m128i = _mm_alignr_epi8(c28_load_rslt4_m128i, c28_load_rslt3_m128i, 10);
decompress_sse4_c28_m2<0>(c28_align_rslt3_m128i, presult);
__m128i c28_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c28_align_rslt4_m128i = _mm_alignr_epi8(c28_load_rslt5_m128i, c28_load_rslt4_m128i, 8);
decompress_sse4_c28_m2<0>(c28_align_rslt4_m128i, presult);
__m128i c28_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c28_align_rslt5_m128i = _mm_alignr_epi8(c28_load_rslt6_m128i, c28_load_rslt5_m128i, 6);
decompress_sse4_c28_m2<0>(c28_align_rslt5_m128i, presult);
__m128i c28_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c28_align_rslt6_m128i = _mm_alignr_epi8(c28_load_rslt7_m128i, c28_load_rslt6_m128i, 4);
decompress_sse4_c28_m2<0>(c28_align_rslt6_m128i, presult);
decompress_sse4_c28_m2<2>(c28_load_rslt7_m128i, presult);
}
void decompress_sse4_c28(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 32 <= n) {
decompress_sse4_c28_m1(vector_m128i, presult + index, offset);
offset += 7;
index += 32;
}
if (n & 0x1F) {
unpack_28(dest + index, encode + offset * 4, (n & 0x1F));
}
}
// c29
const __m128i SSE4_c29_and_msk1_m128i =
_mm_set_epi8(0xFF, 0xFF, 0xFF, 0xF8, 0x7F, 0xFF, 0xFF, 0xFC, 0x3F, 0xFF, 0xFF, 0xFF, 0x1F, 0xFF, 0xFF, 0xFF);
const __m128i SSE4_c29_and_msk2_m128i =
_mm_set_epi8(0xFF, 0xFF, 0xFF, 0xF8, 0x7F, 0xFF, 0xFF, 0xFC, 0x3F, 0xFF, 0xFF, 0xFE, 0x1F, 0xFF, 0xFF, 0xFF);
const __m128i SSE4_c29_mul_msk1_m128i = _mm_set_epi32(0x01, 0x02, 0x04, 0x08);
const __m128i SSE4_c29_mul_msk2_m128i = _mm_set_epi32(0x01, 0x02, 0x04, 0x08);
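// Four 29-bit values span 14.5 bytes, so groups alternate between byte-aligned
// starts (_s1, which pre-aligns with a left 64-bit shift) and half-byte-offset
// starts (_s2, which pre-aligns with a right 64-bit shift), mirroring the
// half-byte alternation used for c25.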
template <int segment>
inline void decompress_sse4_c29_m2_s1(__m128i c29_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c29_shfl_msk_m128i = _mm_set_epi8(
segment + 14, segment + 13, segment + 12, segment + 11, segment + 10, segment + 9, segment + 8, segment + 7,
segment + 7, segment + 6, segment + 5, segment + 4, segment + 3, segment + 2, segment + 1, segment);
__m128i c29_shfl_rslt_m128i = _mm_shuffle_epi8(c29_load_vec_m128i, SSE4_c29_shfl_msk_m128i);
__m128i c29_shift_rslt_m128i = _mm_slli_epi64(c29_shfl_rslt_m128i, 4);
__m128i c29_blend_rslt_m128i = _mm_blend_epi16(c29_shfl_rslt_m128i, c29_shift_rslt_m128i, 0xCC);
__m128i c29_and_rslt_m128i = _mm_and_si128(c29_blend_rslt_m128i, SSE4_c29_and_msk1_m128i);
__m128i c29_mul_rslt_m128i = _mm_mullo_epi32(c29_and_rslt_m128i, SSE4_c29_mul_msk1_m128i);
__m128i c29_rslt_m128i = _mm_srli_epi32(c29_mul_rslt_m128i, 3);
_mm_storeu_si128((__m128i*)presult, c29_rslt_m128i);
presult = presult + 4;
}
template <int segment>
inline void decompress_sse4_c29_m2_s2(__m128i c29_load_vec_m128i, unsigned int*& presult)
{
__m128i SSE4_c29_shfl_msk_m128i = _mm_set_epi8(
segment + 14, segment + 13, segment + 12, segment + 11, segment + 10, segment + 9, segment + 8, segment + 7,
segment + 7, segment + 6, segment + 5, segment + 4, segment + 3, segment + 2, segment + 1, segment);
__m128i c29_shfl_rslt_m128i = _mm_shuffle_epi8(c29_load_vec_m128i, SSE4_c29_shfl_msk_m128i);
__m128i c29_shift_rslt_m128i = _mm_srli_epi64(c29_shfl_rslt_m128i, 4);
__m128i c29_blend_rslt_m128i = _mm_blend_epi16(c29_shfl_rslt_m128i, c29_shift_rslt_m128i, 0x33);
__m128i c29_and_rslt_m128i = _mm_and_si128(c29_blend_rslt_m128i, SSE4_c29_and_msk2_m128i);
__m128i c29_mul_rslt_m128i = _mm_mullo_epi32(c29_and_rslt_m128i, SSE4_c29_mul_msk2_m128i);
__m128i c29_rslt_m128i = _mm_srli_epi32(c29_mul_rslt_m128i, 3);
_mm_storeu_si128((__m128i*)presult, c29_rslt_m128i);
presult = presult + 4;
}
inline void decompress_sse4_c29_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c29_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c29_m2_s1<0>(c29_load_rslt1_m128i, presult);
__m128i c29_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c29_align_rslt1_m128i = _mm_alignr_epi8(c29_load_rslt2_m128i, c29_load_rslt1_m128i, 14);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt1_m128i, presult);
__m128i c29_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c29_align_rslt2_m128i = _mm_alignr_epi8(c29_load_rslt3_m128i, c29_load_rslt2_m128i, 13);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt2_m128i, presult);
__m128i c29_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c29_align_rslt3_m128i = _mm_alignr_epi8(c29_load_rslt4_m128i, c29_load_rslt3_m128i, 11);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt3_m128i, presult);
__m128i c29_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c29_align_rslt4_m128i = _mm_alignr_epi8(c29_load_rslt5_m128i, c29_load_rslt4_m128i, 10);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt4_m128i, presult);
__m128i c29_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c29_align_rslt5_m128i = _mm_alignr_epi8(c29_load_rslt6_m128i, c29_load_rslt5_m128i, 8);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt5_m128i, presult);
__m128i c29_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c29_align_rslt6_m128i = _mm_alignr_epi8(c29_load_rslt7_m128i, c29_load_rslt6_m128i, 7);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt6_m128i, presult);
__m128i c29_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c29_align_rslt7_m128i = _mm_alignr_epi8(c29_load_rslt8_m128i, c29_load_rslt7_m128i, 5);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt7_m128i, presult);
__m128i c29_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c29_align_rslt8_m128i = _mm_alignr_epi8(c29_load_rslt9_m128i, c29_load_rslt8_m128i, 4);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt8_m128i, presult);
__m128i c29_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c29_align_rslt9_m128i = _mm_alignr_epi8(c29_load_rslt10_m128i, c29_load_rslt9_m128i, 2);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt9_m128i, presult);
decompress_sse4_c29_m2_s1<1>(c29_load_rslt10_m128i, presult);
__m128i c29_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c29_align_rslt10_m128i = _mm_alignr_epi8(c29_load_rslt11_m128i, c29_load_rslt10_m128i, 15);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt10_m128i, presult);
__m128i c29_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c29_align_rslt11_m128i = _mm_alignr_epi8(c29_load_rslt12_m128i, c29_load_rslt11_m128i, 14);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt11_m128i, presult);
__m128i c29_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c29_align_rslt12_m128i = _mm_alignr_epi8(c29_load_rslt13_m128i, c29_load_rslt12_m128i, 12);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt12_m128i, presult);
__m128i c29_load_rslt14_m128i = _mm_loadu_si128(its_vector_m128i + 13);
__m128i c29_align_rslt13_m128i = _mm_alignr_epi8(c29_load_rslt14_m128i, c29_load_rslt13_m128i, 11);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt13_m128i, presult);
__m128i c29_load_rslt15_m128i = _mm_loadu_si128(its_vector_m128i + 14);
__m128i c29_align_rslt14_m128i = _mm_alignr_epi8(c29_load_rslt15_m128i, c29_load_rslt14_m128i, 9);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt14_m128i, presult);
__m128i c29_load_rslt16_m128i = _mm_loadu_si128(its_vector_m128i + 15);
__m128i c29_align_rslt15_m128i = _mm_alignr_epi8(c29_load_rslt16_m128i, c29_load_rslt15_m128i, 8);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt15_m128i, presult);
__m128i c29_load_rslt17_m128i = _mm_loadu_si128(its_vector_m128i + 16);
__m128i c29_align_rslt16_m128i = _mm_alignr_epi8(c29_load_rslt17_m128i, c29_load_rslt16_m128i, 6);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt16_m128i, presult);
__m128i c29_load_rslt18_m128i = _mm_loadu_si128(its_vector_m128i + 17);
__m128i c29_align_rslt17_m128i = _mm_alignr_epi8(c29_load_rslt18_m128i, c29_load_rslt17_m128i, 5);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt17_m128i, presult);
__m128i c29_load_rslt19_m128i = _mm_loadu_si128(its_vector_m128i + 18);
__m128i c29_align_rslt18_m128i = _mm_alignr_epi8(c29_load_rslt19_m128i, c29_load_rslt18_m128i, 3);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt18_m128i, presult);
__m128i c29_load_rslt20_m128i = _mm_loadu_si128(its_vector_m128i + 19);
__m128i c29_align_rslt19_m128i = _mm_alignr_epi8(c29_load_rslt20_m128i, c29_load_rslt19_m128i, 2);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt19_m128i, presult);
decompress_sse4_c29_m2_s2<0>(c29_load_rslt20_m128i, presult);
__m128i c29_load_rslt21_m128i = _mm_loadu_si128(its_vector_m128i + 20);
__m128i c29_align_rslt20_m128i = _mm_alignr_epi8(c29_load_rslt21_m128i, c29_load_rslt20_m128i, 15);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt20_m128i, presult);
__m128i c29_load_rslt22_m128i = _mm_loadu_si128(its_vector_m128i + 21);
__m128i c29_align_rslt21_m128i = _mm_alignr_epi8(c29_load_rslt22_m128i, c29_load_rslt21_m128i, 13);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt21_m128i, presult);
__m128i c29_load_rslt23_m128i = _mm_loadu_si128(its_vector_m128i + 22);
__m128i c29_align_rslt22_m128i = _mm_alignr_epi8(c29_load_rslt23_m128i, c29_load_rslt22_m128i, 12);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt22_m128i, presult);
__m128i c29_load_rslt24_m128i = _mm_loadu_si128(its_vector_m128i + 23);
__m128i c29_align_rslt23_m128i = _mm_alignr_epi8(c29_load_rslt24_m128i, c29_load_rslt23_m128i, 10);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt23_m128i, presult);
__m128i c29_load_rslt25_m128i = _mm_loadu_si128(its_vector_m128i + 24);
__m128i c29_align_rslt24_m128i = _mm_alignr_epi8(c29_load_rslt25_m128i, c29_load_rslt24_m128i, 9);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt24_m128i, presult);
__m128i c29_load_rslt26_m128i = _mm_loadu_si128(its_vector_m128i + 25);
__m128i c29_align_rslt25_m128i = _mm_alignr_epi8(c29_load_rslt26_m128i, c29_load_rslt25_m128i, 7);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt25_m128i, presult);
__m128i c29_load_rslt27_m128i = _mm_loadu_si128(its_vector_m128i + 26);
__m128i c29_align_rslt26_m128i = _mm_alignr_epi8(c29_load_rslt27_m128i, c29_load_rslt26_m128i, 6);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt26_m128i, presult);
__m128i c29_load_rslt28_m128i = _mm_loadu_si128(its_vector_m128i + 27);
__m128i c29_align_rslt27_m128i = _mm_alignr_epi8(c29_load_rslt28_m128i, c29_load_rslt27_m128i, 4);
decompress_sse4_c29_m2_s2<0>(c29_align_rslt27_m128i, presult);
__m128i c29_load_rslt29_m128i = _mm_loadu_si128(its_vector_m128i + 28);
__m128i c29_align_rslt28_m128i = _mm_alignr_epi8(c29_load_rslt29_m128i, c29_load_rslt28_m128i, 3);
decompress_sse4_c29_m2_s1<0>(c29_align_rslt28_m128i, presult);
decompress_sse4_c29_m2_s2<1>(c29_load_rslt29_m128i, presult);
}
void decompress_sse4_c29(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 128 <= n) {
decompress_sse4_c29_m1(vector_m128i, presult + index, offset);
offset += 29;
index += 128;
}
if (n & 0x7F) {
unpack_29(dest + index, encode + offset * 4, (n & 0x7F));
}
}
// c30
const __m128i SSE4_c30_and_msk1_m128i =
_mm_set_epi8(0xFF, 0xFF, 0xFF, 0xFC, 0x3F, 0xFF, 0xFF, 0xFF, 0x3F, 0xFF, 0xFF, 0xFF, 0x3F, 0xFF, 0xFF, 0xFF);
const __m128i SSE4_c30_mul_msk1_m128i = _mm_set_epi32(0x01, 0x04, 0x04, 0x04);
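// At 30 bits two of the four lanes straddle five source bytes, so
// decompress_sse4_c30_m2 needs two 64-bit shifts and two blends before the
// mask / multiply / shift step. Note that, unlike the other m2 helpers, it
// takes presult by value, so the caller advances the pointer after each call.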
template <int segment>
inline void decompress_sse4_c30_m2(__m128i c30_load_vec_m128i, unsigned int* presult)
{
__m128i SSE4_c30_shfl_msk_m128i = _mm_set_epi8(
segment + 14, segment + 13, segment + 12, segment + 11, segment + 10, segment + 9, segment + 8, segment + 7,
segment + 7, segment + 6, segment + 5, segment + 4, segment + 3, segment + 2, segment + 1, segment);
__m128i c30_shfl_rslt_m128i = _mm_shuffle_epi8(c30_load_vec_m128i, SSE4_c30_shfl_msk_m128i);
__m128i c30_shift_rslt_t1_m128i = _mm_slli_epi64(c30_shfl_rslt_m128i, 2);
__m128i c30_shift_rslt_t2_m128i = _mm_srli_epi64(c30_shfl_rslt_m128i, 4);
__m128i c30_blend_rslt1_m128i = _mm_blend_epi16(c30_shfl_rslt_m128i, c30_shift_rslt_t1_m128i, 0x0C);
__m128i c30_blend_rslt2_m128i = _mm_blend_epi16(c30_blend_rslt1_m128i, c30_shift_rslt_t2_m128i, 0x30);
__m128i c30_and_rslt_m128i = _mm_and_si128(c30_blend_rslt2_m128i, SSE4_c30_and_msk1_m128i);
__m128i c30_mul_rslt_m128i = _mm_mullo_epi32(c30_and_rslt_m128i, SSE4_c30_mul_msk1_m128i);
__m128i c30_rslt_m128i = _mm_srli_epi32(c30_mul_rslt_m128i, 2);
_mm_storeu_si128((__m128i*)presult, c30_rslt_m128i);
}
inline void decompress_sse4_c30_m1(__m128i* vector_m128i, unsigned int* presult, unsigned int offset)
{
__m128i* its_vector_m128i = vector_m128i + offset;
__m128i c30_load_rslt1_m128i = _mm_loadu_si128(its_vector_m128i);
decompress_sse4_c30_m2<0>(c30_load_rslt1_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt2_m128i = _mm_loadu_si128(its_vector_m128i + 1);
__m128i c30_align_rslt1_m128i = _mm_alignr_epi8(c30_load_rslt2_m128i, c30_load_rslt1_m128i, 15);
decompress_sse4_c30_m2<0>(c30_align_rslt1_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt3_m128i = _mm_loadu_si128(its_vector_m128i + 2);
__m128i c30_align_rslt2_m128i = _mm_alignr_epi8(c30_load_rslt3_m128i, c30_load_rslt2_m128i, 14);
decompress_sse4_c30_m2<0>(c30_align_rslt2_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt4_m128i = _mm_loadu_si128(its_vector_m128i + 3);
__m128i c30_align_rslt3_m128i = _mm_alignr_epi8(c30_load_rslt4_m128i, c30_load_rslt3_m128i, 13);
decompress_sse4_c30_m2<0>(c30_align_rslt3_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt5_m128i = _mm_loadu_si128(its_vector_m128i + 4);
__m128i c30_align_rslt4_m128i = _mm_alignr_epi8(c30_load_rslt5_m128i, c30_load_rslt4_m128i, 12);
decompress_sse4_c30_m2<0>(c30_align_rslt4_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt6_m128i = _mm_loadu_si128(its_vector_m128i + 5);
__m128i c30_align_rslt5_m128i = _mm_alignr_epi8(c30_load_rslt6_m128i, c30_load_rslt5_m128i, 11);
decompress_sse4_c30_m2<0>(c30_align_rslt5_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt7_m128i = _mm_loadu_si128(its_vector_m128i + 6);
__m128i c30_align_rslt6_m128i = _mm_alignr_epi8(c30_load_rslt7_m128i, c30_load_rslt6_m128i, 10);
decompress_sse4_c30_m2<0>(c30_align_rslt6_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt8_m128i = _mm_loadu_si128(its_vector_m128i + 7);
__m128i c30_align_rslt7_m128i = _mm_alignr_epi8(c30_load_rslt8_m128i, c30_load_rslt7_m128i, 9);
decompress_sse4_c30_m2<0>(c30_align_rslt7_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt9_m128i = _mm_loadu_si128(its_vector_m128i + 8);
__m128i c30_align_rslt8_m128i = _mm_alignr_epi8(c30_load_rslt9_m128i, c30_load_rslt8_m128i, 8);
decompress_sse4_c30_m2<0>(c30_align_rslt8_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt10_m128i = _mm_loadu_si128(its_vector_m128i + 9);
__m128i c30_align_rslt9_m128i = _mm_alignr_epi8(c30_load_rslt10_m128i, c30_load_rslt9_m128i, 7);
decompress_sse4_c30_m2<0>(c30_align_rslt9_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt11_m128i = _mm_loadu_si128(its_vector_m128i + 10);
__m128i c30_align_rslt10_m128i = _mm_alignr_epi8(c30_load_rslt11_m128i, c30_load_rslt10_m128i, 6);
decompress_sse4_c30_m2<0>(c30_align_rslt10_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt12_m128i = _mm_loadu_si128(its_vector_m128i + 11);
__m128i c30_align_rslt11_m128i = _mm_alignr_epi8(c30_load_rslt12_m128i, c30_load_rslt11_m128i, 5);
decompress_sse4_c30_m2<0>(c30_align_rslt11_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt13_m128i = _mm_loadu_si128(its_vector_m128i + 12);
__m128i c30_align_rslt12_m128i = _mm_alignr_epi8(c30_load_rslt13_m128i, c30_load_rslt12_m128i, 4);
decompress_sse4_c30_m2<0>(c30_align_rslt12_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt14_m128i = _mm_loadu_si128(its_vector_m128i + 13);
__m128i c30_align_rslt13_m128i = _mm_alignr_epi8(c30_load_rslt14_m128i, c30_load_rslt13_m128i, 3);
decompress_sse4_c30_m2<0>(c30_align_rslt13_m128i, presult);
presult = presult + 4;
__m128i c30_load_rslt15_m128i = _mm_loadu_si128(its_vector_m128i + 14);
__m128i c30_align_rslt14_m128i = _mm_alignr_epi8(c30_load_rslt15_m128i, c30_load_rslt14_m128i, 2);
decompress_sse4_c30_m2<0>(c30_align_rslt14_m128i, presult);
presult = presult + 4;
decompress_sse4_c30_m2<1>(c30_load_rslt15_m128i, presult);
presult = presult + 4;
}
void decompress_sse4_c30(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
unsigned int index = 0, offset = 0;
__m128i* vector_m128i = (__m128i*)encode;
unsigned int* presult = (unsigned int*)dest;
while (index + 64 <= n) {
decompress_sse4_c30_m1(vector_m128i, presult + index, offset);
offset += 15;
index += 64;
}
if (n & 0x3F) {
unpack_30(dest + index, encode + offset * 4, (n & 0x3F));
}
}
// c31
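// The 31- and 32-bit widths are handled with plain scalar loops; each output
// word is assembled from at most two adjacent input words, so (presumably) the
// byte-shuffle SIMD path was not considered worthwhile here.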
void decompress_sse4_c31(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
for (uint32_t i = 32; i <= n; i += 32, dest += 32, encode += 31) {
dest[0] = encode[0] & 0x7FFFFFFF;
dest[1] = ((encode[0] >> 31) | (encode[1] << 1)) & 0x7FFFFFFF;
dest[2] = ((encode[1] >> 30) | (encode[2] << 2)) & 0x7FFFFFFF;
dest[3] = ((encode[2] >> 29) | (encode[3] << 3)) & 0x7FFFFFFF;
dest[4] = ((encode[3] >> 28) | (encode[4] << 4)) & 0x7FFFFFFF;
dest[5] = ((encode[4] >> 27) | (encode[5] << 5)) & 0x7FFFFFFF;
dest[6] = ((encode[5] >> 26) | (encode[6] << 6)) & 0x7FFFFFFF;
dest[7] = ((encode[6] >> 25) | (encode[7] << 7)) & 0x7FFFFFFF;
dest[8] = ((encode[7] >> 24) | (encode[8] << 8)) & 0x7FFFFFFF;
dest[9] = ((encode[8] >> 23) | (encode[9] << 9)) & 0x7FFFFFFF;
dest[10] = ((encode[9] >> 22) | (encode[10] << 10)) & 0x7FFFFFFF;
dest[11] = ((encode[10] >> 21) | (encode[11] << 11)) & 0x7FFFFFFF;
dest[12] = ((encode[11] >> 20) | (encode[12] << 12)) & 0x7FFFFFFF;
dest[13] = ((encode[12] >> 19) | (encode[13] << 13)) & 0x7FFFFFFF;
dest[14] = ((encode[13] >> 18) | (encode[14] << 14)) & 0x7FFFFFFF;
dest[15] = ((encode[14] >> 17) | (encode[15] << 15)) & 0x7FFFFFFF;
dest[16] = ((encode[15] >> 16) | (encode[16] << 16)) & 0x7FFFFFFF;
dest[17] = ((encode[16] >> 15) | (encode[17] << 17)) & 0x7FFFFFFF;
dest[18] = ((encode[17] >> 14) | (encode[18] << 18)) & 0x7FFFFFFF;
dest[19] = ((encode[18] >> 13) | (encode[19] << 19)) & 0x7FFFFFFF;
dest[20] = ((encode[19] >> 12) | (encode[20] << 20)) & 0x7FFFFFFF;
dest[21] = ((encode[20] >> 11) | (encode[21] << 21)) & 0x7FFFFFFF;
dest[22] = ((encode[21] >> 10) | (encode[22] << 22)) & 0x7FFFFFFF;
dest[23] = ((encode[22] >> 9) | (encode[23] << 23)) & 0x7FFFFFFF;
dest[24] = ((encode[23] >> 8) | (encode[24] << 24)) & 0x7FFFFFFF;
dest[25] = ((encode[24] >> 7) | (encode[25] << 25)) & 0x7FFFFFFF;
dest[26] = ((encode[25] >> 6) | (encode[26] << 26)) & 0x7FFFFFFF;
dest[27] = ((encode[26] >> 5) | (encode[27] << 27)) & 0x7FFFFFFF;
dest[28] = ((encode[27] >> 4) | (encode[28] << 28)) & 0x7FFFFFFF;
dest[29] = ((encode[28] >> 3) | (encode[29] << 29)) & 0x7FFFFFFF;
dest[30] = ((encode[29] >> 2) | (encode[30] << 30)) & 0x7FFFFFFF;
dest[31] = (encode[30] >> 1) & 0x7FFFFFFF;
}
if (n & 0x1F) {
unaligned_unpack_31(dest, encode, (n & 0x1F));
}
}
// c32
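// 32-bit "packed" data is already in its final form, so the loop below is a
// plain copy (the & 0xFFFFFFFF mask is a no-op on uint32_t).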
void decompress_sse4_c32(uint32_t* dest, const uint32_t* encode, uint32_t n)
{
for (uint32_t i = 32; i <= n; i += 32, dest += 32, encode += 32) {
dest[0] = encode[0] & 0xFFFFFFFF;
dest[1] = encode[1] & 0xFFFFFFFF;
dest[2] = encode[2] & 0xFFFFFFFF;
dest[3] = encode[3] & 0xFFFFFFFF;
dest[4] = encode[4] & 0xFFFFFFFF;
dest[5] = encode[5] & 0xFFFFFFFF;
dest[6] = encode[6] & 0xFFFFFFFF;
dest[7] = encode[7] & 0xFFFFFFFF;
dest[8] = encode[8] & 0xFFFFFFFF;
dest[9] = encode[9] & 0xFFFFFFFF;
dest[10] = encode[10] & 0xFFFFFFFF;
dest[11] = encode[11] & 0xFFFFFFFF;
dest[12] = encode[12] & 0xFFFFFFFF;
dest[13] = encode[13] & 0xFFFFFFFF;
dest[14] = encode[14] & 0xFFFFFFFF;
dest[15] = encode[15] & 0xFFFFFFFF;
dest[16] = encode[16] & 0xFFFFFFFF;
dest[17] = encode[17] & 0xFFFFFFFF;
dest[18] = encode[18] & 0xFFFFFFFF;
dest[19] = encode[19] & 0xFFFFFFFF;
dest[20] = encode[20] & 0xFFFFFFFF;
dest[21] = encode[21] & 0xFFFFFFFF;
dest[22] = encode[22] & 0xFFFFFFFF;
dest[23] = encode[23] & 0xFFFFFFFF;
dest[24] = encode[24] & 0xFFFFFFFF;
dest[25] = encode[25] & 0xFFFFFFFF;
dest[26] = encode[26] & 0xFFFFFFFF;
dest[27] = encode[27] & 0xFFFFFFFF;
dest[28] = encode[28] & 0xFFFFFFFF;
dest[29] = encode[29] & 0xFFFFFFFF;
dest[30] = encode[30] & 0xFFFFFFFF;
dest[31] = encode[31] & 0xFFFFFFFF;
}
if (n & 0x1F) {
for (uint32_t i = 0; i < (n & 0x1F); ++i) {
dest[i] = encode[i] & 0xFFFFFFFF;
}
}
}
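// Illustrative usage (a minimal sketch, not part of this translation unit;
// kCount, packed and values below are hypothetical caller-side names):
//
//   constexpr uint32_t kCount = 128;                 // number of encoded values
//   std::vector<uint32_t> packed(kCount * 26 / 32);  // 104 words of 26-bit packed input
//   std::vector<uint32_t> values(kCount);            // decoded output
//   decompress_sse4_c26(values.data(), packed.data(), kCount);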
} // namespace indexlib::index