// parquet/internal/utils/_lib/bit_packing_avx2.c
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <immintrin.h>
#include <string.h>
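// Vectorized bit-unpacking for Parquet's bit-packed encodings. Each
// unpack<N>_32_avx2 kernel consumes N consecutive 32-bit input words holding
// 32 little-endian bit-packed values of width N and writes them to `out` as
// 32 plain uint32_t values. The kernels broadcast the relevant input word(s)
// into a 256-bit register, apply per-lane variable right shifts
// (_mm256_srlv_epi32), and mask off the high bits; values that straddle a
// 32-bit word boundary are pre-combined with a scalar shift/or before being
// placed in their lane.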
inline static const uint32_t* unpack0_32_avx2(const uint32_t* in, uint32_t* out) {
memset(out, 0x0, 32 * sizeof(*out));
out += 32;
return in;
}
inline static const uint32_t* unpack1_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x1;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(7, 6, 5, 4,
3, 2, 1, 0);
reg_inls = _mm256_set_epi32(in[0], in[0],
in[0], in[0],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(15, 14, 13, 12,
11, 10, 9, 8);
reg_inls = _mm256_set_epi32(in[0], in[0],
in[0], in[0],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(23, 22, 21, 20,
19, 18, 17, 16);
reg_inls = _mm256_set_epi32(in[0], in[0],
in[0], in[0],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(31, 30, 29, 28,
27, 26, 25, 24);
reg_inls = _mm256_set_epi32(in[0], in[0],
in[0], in[0],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 1;
return in;
}
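// For reference, each unpack<N>_32 kernel in this file computes the same 32
// outputs as the scalar sketch below (illustrative only, for widths 1..32;
// width 0 just zero-fills and reads no input):
//
//   static void unpack_scalar(const uint32_t* in, uint32_t* out, int num_bits) {
//     uint32_t mask = num_bits == 32 ? 0xffffffffu : (1u << num_bits) - 1;
//     for (int i = 0; i < 32; ++i) {
//       int bit = i * num_bits;           // absolute bit offset of value i
//       uint32_t v = in[bit / 32] >> (bit % 32);
//       if (bit % 32 + num_bits > 32)     // value straddles a word boundary
//         v |= in[bit / 32 + 1] << (32 - bit % 32);
//       out[i] = v & mask;
//     }
//   }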
inline static const uint32_t* unpack2_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x3;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(14, 12, 10, 8,
6, 4, 2, 0);
reg_inls = _mm256_set_epi32(in[0], in[0],
in[0], in[0],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(30, 28, 26, 24,
22, 20, 18, 16);
reg_inls = _mm256_set_epi32(in[0], in[0],
in[0], in[0],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(14, 12, 10, 8,
6, 4, 2, 0);
reg_inls = _mm256_set_epi32(in[1], in[1],
in[1], in[1],
in[1], in[1],
in[1], in[1]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(30, 28, 26, 24,
22, 20, 18, 16);
reg_inls = _mm256_set_epi32(in[1], in[1],
in[1], in[1],
in[1], in[1],
in[1], in[1]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 2;
return in;
}
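// Starting with bit width 3, some of the 32 values cross a 32-bit word
// boundary. Those lanes are fed a pre-combined word, e.g. in[0] >> 30 |
// in[1] << 2 splices the top 2 bits of in[0] with the low bits of in[1],
// and the per-lane shift for that position is set to 0.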
inline static const uint32_t* unpack3_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x7;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(21, 18, 15, 12,
9, 6, 3, 0);
reg_inls = _mm256_set_epi32(in[0], in[0],
in[0], in[0],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(13, 10, 7, 4,
1, 0, 27, 24);
reg_inls = _mm256_set_epi32(in[1], in[1],
in[1], in[1],
in[1], in[0] >> 30 | in[1] << 2,
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(5, 2, 0, 28,
25, 22, 19, 16);
reg_inls = _mm256_set_epi32(in[2], in[2],
in[1] >> 31 | in[2] << 1, in[1],
in[1], in[1],
in[1], in[1]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(29, 26, 23, 20,
17, 14, 11, 8);
reg_inls = _mm256_set_epi32(in[2], in[2],
in[2], in[2],
in[2], in[2],
in[2], in[2]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 3;
return in;
}
inline static const uint32_t* unpack4_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0xf;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
12, 8, 4, 0);
reg_inls = _mm256_set_epi32(in[0], in[0],
in[0], in[0],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
12, 8, 4, 0);
reg_inls = _mm256_set_epi32(in[1], in[1],
in[1], in[1],
in[1], in[1],
in[1], in[1]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
12, 8, 4, 0);
reg_inls = _mm256_set_epi32(in[2], in[2],
in[2], in[2],
in[2], in[2],
in[2], in[2]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(28, 24, 20, 16,
12, 8, 4, 0);
reg_inls = _mm256_set_epi32(in[3], in[3],
in[3], in[3],
in[3], in[3],
in[3], in[3]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 4;
return in;
}
inline static const uint32_t* unpack5_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x1f;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(3, 0, 25, 20,
15, 10, 5, 0);
reg_inls = _mm256_set_epi32(in[1], in[0] >> 30 | in[1] << 2,
in[0], in[0],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(11, 6, 1, 0,
23, 18, 13, 8);
reg_inls = _mm256_set_epi32(in[2], in[2],
in[2], in[1] >> 28 | in[2] << 4,
in[1], in[1],
in[1], in[1]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(19, 14, 9, 4,
0, 26, 21, 16);
reg_inls = _mm256_set_epi32(in[3], in[3],
in[3], in[3],
in[2] >> 31 | in[3] << 1, in[2],
in[2], in[2]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(27, 22, 17, 12,
7, 2, 0, 24);
reg_inls = _mm256_set_epi32(in[4], in[4],
in[4], in[4],
in[4], in[4],
in[3] >> 29 | in[4] << 3, in[3]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 5;
return in;
}
inline static const uint32_t* unpack6_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x3f;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(10, 4, 0, 24,
18, 12, 6, 0);
reg_inls = _mm256_set_epi32(in[1], in[1],
in[0] >> 30 | in[1] << 2, in[0],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(26, 20, 14, 8,
2, 0, 22, 16);
reg_inls = _mm256_set_epi32(in[2], in[2],
in[2], in[2],
in[2], in[1] >> 28 | in[2] << 4,
in[1], in[1]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(10, 4, 0, 24,
18, 12, 6, 0);
reg_inls = _mm256_set_epi32(in[4], in[4],
in[3] >> 30 | in[4] << 2, in[3],
in[3], in[3],
in[3], in[3]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(26, 20, 14, 8,
2, 0, 22, 16);
reg_inls = _mm256_set_epi32(in[5], in[5],
in[5], in[5],
in[5], in[4] >> 28 | in[5] << 4,
in[4], in[4]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 6;
return in;
}
inline static const uint32_t* unpack7_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x7f;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(17, 10, 3, 0,
21, 14, 7, 0);
reg_inls = _mm256_set_epi32(in[1], in[1],
in[1], in[0] >> 28 | in[1] << 4,
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(9, 2, 0, 20,
13, 6, 0, 24);
reg_inls = _mm256_set_epi32(in[3], in[3],
in[2] >> 27 | in[3] << 5, in[2],
in[2], in[2],
in[1] >> 31 | in[2] << 1, in[1]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(1, 0, 19, 12,
5, 0, 23, 16);
reg_inls = _mm256_set_epi32(in[5], in[4] >> 26 | in[5] << 6,
in[4], in[4],
in[4], in[3] >> 30 | in[4] << 2,
in[3], in[3]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(25, 18, 11, 4,
0, 22, 15, 8);
reg_inls = _mm256_set_epi32(in[6], in[6],
in[6], in[6],
in[5] >> 29 | in[6] << 3, in[5],
in[5], in[5]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 7;
return in;
}
inline static const uint32_t* unpack8_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0xff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
24, 16, 8, 0);
reg_inls = _mm256_set_epi32(in[1], in[1],
in[1], in[1],
in[0], in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
24, 16, 8, 0);
reg_inls = _mm256_set_epi32(in[3], in[3],
in[3], in[3],
in[2], in[2],
in[2], in[2]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
24, 16, 8, 0);
reg_inls = _mm256_set_epi32(in[5], in[5],
in[5], in[5],
in[4], in[4],
in[4], in[4]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(24, 16, 8, 0,
24, 16, 8, 0);
reg_inls = _mm256_set_epi32(in[7], in[7],
in[7], in[7],
in[6], in[6],
in[6], in[6]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 8;
return in;
}
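// Widths that divide 32 evenly (1, 2, 4, 8, 16, 32) never straddle a word
// boundary, so their kernels use only whole input words and a repeating
// shift pattern.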
inline static const uint32_t* unpack9_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x1ff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 22, 13, 4,
0, 18, 9, 0);
reg_inls = _mm256_set_epi32(in[1] >> 31 | in[2] << 1, in[1],
in[1], in[1],
in[0] >> 27 | in[1] << 5, in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(7, 0, 21, 12,
3, 0, 17, 8);
reg_inls = _mm256_set_epi32(in[4], in[3] >> 30 | in[4] << 2,
in[3], in[3],
in[3], in[2] >> 26 | in[3] << 6,
in[2], in[2]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(15, 6, 0, 20,
11, 2, 0, 16);
reg_inls = _mm256_set_epi32(in[6], in[6],
in[5] >> 29 | in[6] << 3, in[5],
in[5], in[5],
in[4] >> 25 | in[5] << 7, in[4]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(23, 14, 5, 0,
19, 10, 1, 0);
reg_inls = _mm256_set_epi32(in[8], in[8],
in[8], in[7] >> 28 | in[8] << 4,
in[7], in[7],
in[7], in[6] >> 24 | in[7] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 9;
return in;
}
inline static const uint32_t* unpack10_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x3ff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(6, 0, 18, 8,
0, 20, 10, 0);
reg_inls = _mm256_set_epi32(in[2], in[1] >> 28 | in[2] << 4,
in[1], in[1],
in[0] >> 30 | in[1] << 2, in[0],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(22, 12, 2, 0,
14, 4, 0, 16);
reg_inls = _mm256_set_epi32(in[4], in[4],
in[4], in[3] >> 24 | in[4] << 8,
in[3], in[3],
in[2] >> 26 | in[3] << 6, in[2]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(6, 0, 18, 8,
0, 20, 10, 0);
reg_inls = _mm256_set_epi32(in[7], in[6] >> 28 | in[7] << 4,
in[6], in[6],
in[5] >> 30 | in[6] << 2, in[5],
in[5], in[5]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(22, 12, 2, 0,
14, 4, 0, 16);
reg_inls = _mm256_set_epi32(in[9], in[9],
in[9], in[8] >> 24 | in[9] << 8,
in[8], in[8],
in[7] >> 26 | in[8] << 6, in[7]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 10;
return in;
}
inline static const uint32_t* unpack11_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x7ff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(13, 2, 0, 12,
1, 0, 11, 0);
reg_inls = _mm256_set_epi32(in[2], in[2],
in[1] >> 23 | in[2] << 9, in[1],
in[1], in[0] >> 22 | in[1] << 10,
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(5, 0, 15, 4,
0, 14, 3, 0);
reg_inls = _mm256_set_epi32(in[5], in[4] >> 26 | in[5] << 6,
in[4], in[4],
in[3] >> 25 | in[4] << 7, in[3],
in[3], in[2] >> 24 | in[3] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 18, 7, 0,
17, 6, 0, 16);
reg_inls = _mm256_set_epi32(in[7] >> 29 | in[8] << 3, in[7],
in[7], in[6] >> 28 | in[7] << 4,
in[6], in[6],
in[5] >> 27 | in[6] << 5, in[5]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(21, 10, 0, 20,
9, 0, 19, 8);
reg_inls = _mm256_set_epi32(in[10], in[10],
in[9] >> 31 | in[10] << 1, in[9],
in[9], in[8] >> 30 | in[9] << 2,
in[8], in[8]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 11;
return in;
}
inline static const uint32_t* unpack12_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0xfff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
4, 0, 12, 0);
reg_inls = _mm256_set_epi32(in[2], in[2],
in[1] >> 28 | in[2] << 4, in[1],
in[1], in[0] >> 24 | in[1] << 8,
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
4, 0, 12, 0);
reg_inls = _mm256_set_epi32(in[5], in[5],
in[4] >> 28 | in[5] << 4, in[4],
in[4], in[3] >> 24 | in[4] << 8,
in[3], in[3]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
4, 0, 12, 0);
reg_inls = _mm256_set_epi32(in[8], in[8],
in[7] >> 28 | in[8] << 4, in[7],
in[7], in[6] >> 24 | in[7] << 8,
in[6], in[6]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(20, 8, 0, 16,
4, 0, 12, 0);
reg_inls = _mm256_set_epi32(in[11], in[11],
in[10] >> 28 | in[11] << 4, in[10],
in[10], in[9] >> 24 | in[10] << 8,
in[9], in[9]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 12;
return in;
}
inline static const uint32_t* unpack13_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x1fff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 14, 1, 0,
7, 0, 13, 0);
reg_inls = _mm256_set_epi32(in[2] >> 27 | in[3] << 5, in[2],
in[2], in[1] >> 20 | in[2] << 12,
in[1], in[0] >> 26 | in[1] << 6,
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(3, 0, 9, 0,
15, 2, 0, 8);
reg_inls = _mm256_set_epi32(in[6], in[5] >> 22 | in[6] << 10,
in[5], in[4] >> 28 | in[5] << 4,
in[4], in[4],
in[3] >> 21 | in[4] << 11, in[3]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(11, 0, 17, 4,
0, 10, 0, 16);
reg_inls = _mm256_set_epi32(in[9], in[8] >> 30 | in[9] << 2,
in[8], in[8],
in[7] >> 23 | in[8] << 9, in[7],
in[6] >> 29 | in[7] << 3, in[6]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(19, 6, 0, 12,
0, 18, 5, 0);
reg_inls = _mm256_set_epi32(in[12], in[12],
in[11] >> 25 | in[12] << 7, in[11],
in[10] >> 31 | in[11] << 1, in[10],
in[10], in[9] >> 24 | in[10] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 13;
return in;
}
inline static const uint32_t* unpack14_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x3fff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(2, 0, 6, 0,
10, 0, 14, 0);
reg_inls = _mm256_set_epi32(in[3], in[2] >> 20 | in[3] << 12,
in[2], in[1] >> 24 | in[2] << 8,
in[1], in[0] >> 28 | in[1] << 4,
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(18, 4, 0, 8,
0, 12, 0, 16);
reg_inls = _mm256_set_epi32(in[6], in[6],
in[5] >> 22 | in[6] << 10, in[5],
in[4] >> 26 | in[5] << 6, in[4],
in[3] >> 30 | in[4] << 2, in[3]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(2, 0, 6, 0,
10, 0, 14, 0);
reg_inls = _mm256_set_epi32(in[10], in[9] >> 20 | in[10] << 12,
in[9], in[8] >> 24 | in[9] << 8,
in[8], in[7] >> 28 | in[8] << 4,
in[7], in[7]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(18, 4, 0, 8,
0, 12, 0, 16);
reg_inls = _mm256_set_epi32(in[13], in[13],
in[12] >> 22 | in[13] << 10, in[12],
in[11] >> 26 | in[12] << 6, in[11],
in[10] >> 30 | in[11] << 2, in[10]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 14;
return in;
}
inline static const uint32_t* unpack15_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x7fff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(9, 0, 11, 0,
13, 0, 15, 0);
reg_inls = _mm256_set_epi32(in[3], in[2] >> 26 | in[3] << 6,
in[2], in[1] >> 28 | in[2] << 4,
in[1], in[0] >> 30 | in[1] << 2,
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(1, 0, 3, 0,
5, 0, 7, 0);
reg_inls = _mm256_set_epi32(in[7], in[6] >> 18 | in[7] << 14,
in[6], in[5] >> 20 | in[6] << 12,
in[5], in[4] >> 22 | in[5] << 10,
in[4], in[3] >> 24 | in[4] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 10, 0, 12,
0, 14, 0, 16);
reg_inls = _mm256_set_epi32(in[10] >> 25 | in[11] << 7, in[10],
in[9] >> 27 | in[10] << 5, in[9],
in[8] >> 29 | in[9] << 3, in[8],
in[7] >> 31 | in[8] << 1, in[7]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(17, 2, 0, 4,
0, 6, 0, 8);
reg_inls = _mm256_set_epi32(in[14], in[14],
in[13] >> 19 | in[14] << 13, in[13],
in[12] >> 21 | in[13] << 11, in[12],
in[11] >> 23 | in[12] << 9, in[11]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 15;
return in;
}
inline static const uint32_t* unpack16_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0xffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
16, 0, 16, 0);
reg_inls = _mm256_set_epi32(in[3], in[3],
in[2], in[2],
in[1], in[1],
in[0], in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
16, 0, 16, 0);
reg_inls = _mm256_set_epi32(in[7], in[7],
in[6], in[6],
in[5], in[5],
in[4], in[4]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
16, 0, 16, 0);
reg_inls = _mm256_set_epi32(in[11], in[11],
in[10], in[10],
in[9], in[9],
in[8], in[8]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(16, 0, 16, 0,
16, 0, 16, 0);
reg_inls = _mm256_set_epi32(in[15], in[15],
in[14], in[14],
in[13], in[13],
in[12], in[12]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 16;
return in;
}
inline static const uint32_t* unpack17_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x1ffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 6, 0, 4,
0, 2, 0, 0);
reg_inls = _mm256_set_epi32(in[3] >> 23 | in[4] << 9, in[3],
in[2] >> 21 | in[3] << 11, in[2],
in[1] >> 19 | in[2] << 13, in[1],
in[0] >> 17 | in[1] << 15, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(0, 14, 0, 12,
0, 10, 0, 8);
reg_inls = _mm256_set_epi32(in[7] >> 31 | in[8] << 1, in[7],
in[6] >> 29 | in[7] << 3, in[6],
in[5] >> 27 | in[6] << 5, in[5],
in[4] >> 25 | in[5] << 7, in[4]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(7, 0, 5, 0,
3, 0, 1, 0);
reg_inls = _mm256_set_epi32(in[12], in[11] >> 22 | in[12] << 10,
in[11], in[10] >> 20 | in[11] << 12,
in[10], in[9] >> 18 | in[10] << 14,
in[9], in[8] >> 16 | in[9] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(15, 0, 13, 0,
11, 0, 9, 0);
reg_inls = _mm256_set_epi32(in[16], in[15] >> 30 | in[16] << 2,
in[15], in[14] >> 28 | in[15] << 4,
in[14], in[13] >> 26 | in[14] << 6,
in[13], in[12] >> 24 | in[13] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 17;
return in;
}
inline static const uint32_t* unpack18_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x3ffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 12, 0, 8,
0, 4, 0, 0);
reg_inls = _mm256_set_epi32(in[3] >> 30 | in[4] << 2, in[3],
in[2] >> 26 | in[3] << 6, in[2],
in[1] >> 22 | in[2] << 10, in[1],
in[0] >> 18 | in[1] << 14, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(14, 0, 10, 0,
6, 0, 2, 0);
reg_inls = _mm256_set_epi32(in[8], in[7] >> 28 | in[8] << 4,
in[7], in[6] >> 24 | in[7] << 8,
in[6], in[5] >> 20 | in[6] << 12,
in[5], in[4] >> 16 | in[5] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 12, 0, 8,
0, 4, 0, 0);
reg_inls = _mm256_set_epi32(in[12] >> 30 | in[13] << 2, in[12],
in[11] >> 26 | in[12] << 6, in[11],
in[10] >> 22 | in[11] << 10, in[10],
in[9] >> 18 | in[10] << 14, in[9]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(14, 0, 10, 0,
6, 0, 2, 0);
reg_inls = _mm256_set_epi32(in[17], in[16] >> 28 | in[17] << 4,
in[16], in[15] >> 24 | in[16] << 8,
in[15], in[14] >> 20 | in[15] << 12,
in[14], in[13] >> 16 | in[14] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 18;
return in;
}
inline static const uint32_t* unpack19_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x7ffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(5, 0, 0, 12,
0, 6, 0, 0);
reg_inls = _mm256_set_epi32(in[4], in[3] >> 18 | in[4] << 14,
in[2] >> 31 | in[3] << 1, in[2],
in[1] >> 25 | in[2] << 7, in[1],
in[0] >> 19 | in[1] << 13, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(0, 10, 0, 4,
0, 0, 11, 0);
reg_inls = _mm256_set_epi32(in[8] >> 29 | in[9] << 3, in[8],
in[7] >> 23 | in[8] << 9, in[7],
in[6] >> 17 | in[7] << 15, in[5] >> 30 | in[6] << 2,
in[5], in[4] >> 24 | in[5] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
9, 0, 3, 0);
reg_inls = _mm256_set_epi32(in[13] >> 21 | in[14] << 11, in[13],
in[12] >> 15 | in[13] << 17, in[11] >> 28 | in[12] << 4,
in[11], in[10] >> 22 | in[11] << 10,
in[10], in[9] >> 16 | in[10] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(13, 0, 7, 0,
1, 0, 0, 8);
reg_inls = _mm256_set_epi32(in[18], in[17] >> 26 | in[18] << 6,
in[17], in[16] >> 20 | in[17] << 12,
in[16], in[15] >> 14 | in[16] << 18,
in[14] >> 27 | in[15] << 5, in[14]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 19;
return in;
}
inline static const uint32_t* unpack20_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0xfffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
0, 8, 0, 0);
reg_inls = _mm256_set_epi32(in[4], in[3] >> 24 | in[4] << 8,
in[3], in[2] >> 16 | in[3] << 16,
in[1] >> 28 | in[2] << 4, in[1],
in[0] >> 20 | in[1] << 12, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
0, 8, 0, 0);
reg_inls = _mm256_set_epi32(in[9], in[8] >> 24 | in[9] << 8,
in[8], in[7] >> 16 | in[8] << 16,
in[6] >> 28 | in[7] << 4, in[6],
in[5] >> 20 | in[6] << 12, in[5]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
0, 8, 0, 0);
reg_inls = _mm256_set_epi32(in[14], in[13] >> 24 | in[14] << 8,
in[13], in[12] >> 16 | in[13] << 16,
in[11] >> 28 | in[12] << 4, in[11],
in[10] >> 20 | in[11] << 12, in[10]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(12, 0, 4, 0,
0, 8, 0, 0);
reg_inls = _mm256_set_epi32(in[19], in[18] >> 24 | in[19] << 8,
in[18], in[17] >> 16 | in[18] << 16,
in[16] >> 28 | in[17] << 4, in[16],
in[15] >> 20 | in[16] << 12, in[15]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 20;
return in;
}
inline static const uint32_t* unpack21_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x1fffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 9, 0,
0, 10, 0, 0);
reg_inls = _mm256_set_epi32(in[4] >> 19 | in[5] << 13, in[3] >> 30 | in[4] << 2,
in[3], in[2] >> 20 | in[3] << 12,
in[1] >> 31 | in[2] << 1, in[1],
in[0] >> 21 | in[1] << 11, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(0, 6, 0, 0,
7, 0, 0, 8);
reg_inls = _mm256_set_epi32(in[9] >> 27 | in[10] << 5, in[9],
in[8] >> 17 | in[9] << 15, in[7] >> 28 | in[8] << 4,
in[7], in[6] >> 18 | in[7] << 14,
in[5] >> 29 | in[6] << 3, in[5]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(3, 0, 0, 4,
0, 0, 5, 0);
reg_inls = _mm256_set_epi32(in[15], in[14] >> 14 | in[15] << 18,
in[13] >> 25 | in[14] << 7, in[13],
in[12] >> 15 | in[13] << 17, in[11] >> 26 | in[12] << 6,
in[11], in[10] >> 16 | in[11] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(11, 0, 1, 0,
0, 2, 0, 0);
reg_inls = _mm256_set_epi32(in[20], in[19] >> 22 | in[20] << 10,
in[19], in[18] >> 12 | in[19] << 20,
in[17] >> 23 | in[18] << 9, in[17],
in[16] >> 13 | in[17] << 19, in[15] >> 24 | in[16] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 21;
return in;
}
inline static const uint32_t* unpack22_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x3fffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 4, 0, 0,
2, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[4] >> 26 | in[5] << 6, in[4],
in[3] >> 14 | in[4] << 18, in[2] >> 24 | in[3] << 8,
in[2], in[1] >> 12 | in[2] << 20,
in[0] >> 22 | in[1] << 10, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(10, 0, 0, 8,
0, 0, 6, 0);
reg_inls = _mm256_set_epi32(in[10], in[9] >> 20 | in[10] << 12,
in[8] >> 30 | in[9] << 2, in[8],
in[7] >> 18 | in[8] << 14, in[6] >> 28 | in[7] << 4,
in[6], in[5] >> 16 | in[6] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 4, 0, 0,
2, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[15] >> 26 | in[16] << 6, in[15],
in[14] >> 14 | in[15] << 18, in[13] >> 24 | in[14] << 8,
in[13], in[12] >> 12 | in[13] << 20,
in[11] >> 22 | in[12] << 10, in[11]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(10, 0, 0, 8,
0, 0, 6, 0);
reg_inls = _mm256_set_epi32(in[21], in[20] >> 20 | in[21] << 12,
in[19] >> 30 | in[20] << 2, in[19],
in[18] >> 18 | in[19] << 14, in[17] >> 28 | in[18] << 4,
in[17], in[16] >> 16 | in[17] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 22;
return in;
}
inline static const uint32_t* unpack23_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x7fffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(1, 0, 0, 0,
5, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[5], in[4] >> 10 | in[5] << 22,
in[3] >> 19 | in[4] << 13, in[2] >> 28 | in[3] << 4,
in[2], in[1] >> 14 | in[2] << 18,
in[0] >> 23 | in[1] << 9, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
0, 6, 0, 0);
reg_inls = _mm256_set_epi32(in[10] >> 25 | in[11] << 7, in[10],
in[9] >> 11 | in[10] << 21, in[8] >> 20 | in[9] << 12,
in[7] >> 29 | in[8] << 3, in[7],
in[6] >> 15 | in[7] << 17, in[5] >> 24 | in[6] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 3, 0,
0, 0, 7, 0);
reg_inls = _mm256_set_epi32(in[16] >> 17 | in[17] << 15, in[15] >> 26 | in[16] << 6,
in[15], in[14] >> 12 | in[15] << 20,
in[13] >> 21 | in[14] << 11, in[12] >> 30 | in[13] << 2,
in[12], in[11] >> 16 | in[12] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(9, 0, 0, 4,
0, 0, 0, 8);
reg_inls = _mm256_set_epi32(in[22], in[21] >> 18 | in[22] << 14,
in[20] >> 27 | in[21] << 5, in[20],
in[19] >> 13 | in[20] << 19, in[18] >> 22 | in[19] << 10,
in[17] >> 31 | in[18] << 1, in[17]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 23;
return in;
}
inline static const uint32_t* unpack24_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0xffffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
8, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[5], in[4] >> 16 | in[5] << 16,
in[3] >> 24 | in[4] << 8, in[3],
in[2], in[1] >> 16 | in[2] << 16,
in[0] >> 24 | in[1] << 8, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
8, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[11], in[10] >> 16 | in[11] << 16,
in[9] >> 24 | in[10] << 8, in[9],
in[8], in[7] >> 16 | in[8] << 16,
in[6] >> 24 | in[7] << 8, in[6]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
8, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[17], in[16] >> 16 | in[17] << 16,
in[15] >> 24 | in[16] << 8, in[15],
in[14], in[13] >> 16 | in[14] << 16,
in[12] >> 24 | in[13] << 8, in[12]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(8, 0, 0, 0,
8, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[23], in[22] >> 16 | in[23] << 16,
in[21] >> 24 | in[22] << 8, in[21],
in[20], in[19] >> 16 | in[20] << 16,
in[18] >> 24 | in[19] << 8, in[18]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 24;
return in;
}
inline static const uint32_t* unpack25_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x1ffffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 0, 4,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[5] >> 15 | in[6] << 17, in[4] >> 22 | in[5] << 10,
in[3] >> 29 | in[4] << 3, in[3],
in[2] >> 11 | in[3] << 21, in[1] >> 18 | in[2] << 14,
in[0] >> 25 | in[1] << 7, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 5, 0,
0, 0, 1, 0);
reg_inls = _mm256_set_epi32(in[11] >> 23 | in[12] << 9, in[10] >> 30 | in[11] << 2,
in[10], in[9] >> 12 | in[10] << 20,
in[8] >> 19 | in[9] << 13, in[7] >> 26 | in[8] << 6,
in[7], in[6] >> 8 | in[7] << 24);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 6, 0, 0,
0, 2, 0, 0);
reg_inls = _mm256_set_epi32(in[17] >> 31 | in[18] << 1, in[17],
in[16] >> 13 | in[17] << 19, in[15] >> 20 | in[16] << 12,
in[14] >> 27 | in[15] << 5, in[14],
in[13] >> 9 | in[14] << 23, in[12] >> 16 | in[13] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(7, 0, 0, 0,
3, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[24], in[23] >> 14 | in[24] << 18,
in[22] >> 21 | in[23] << 11, in[21] >> 28 | in[22] << 4,
in[21], in[20] >> 10 | in[21] << 22,
in[19] >> 17 | in[20] << 15, in[18] >> 24 | in[19] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 25;
return in;
}
inline static const uint32_t* unpack26_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x3ffffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 2, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[5] >> 22 | in[6] << 10, in[4] >> 28 | in[5] << 4,
in[4], in[3] >> 8 | in[4] << 24,
in[2] >> 14 | in[3] << 18, in[1] >> 20 | in[2] << 12,
in[0] >> 26 | in[1] << 6, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(6, 0, 0, 0,
0, 4, 0, 0);
reg_inls = _mm256_set_epi32(in[12], in[11] >> 12 | in[12] << 20,
in[10] >> 18 | in[11] << 14, in[9] >> 24 | in[10] << 8,
in[8] >> 30 | in[9] << 2, in[8],
in[7] >> 10 | in[8] << 22, in[6] >> 16 | in[7] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 2, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[18] >> 22 | in[19] << 10, in[17] >> 28 | in[18] << 4,
in[17], in[16] >> 8 | in[17] << 24,
in[15] >> 14 | in[16] << 18, in[14] >> 20 | in[15] << 12,
in[13] >> 26 | in[14] << 6, in[13]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(6, 0, 0, 0,
0, 4, 0, 0);
reg_inls = _mm256_set_epi32(in[25], in[24] >> 12 | in[25] << 20,
in[23] >> 18 | in[24] << 14, in[22] >> 24 | in[23] << 8,
in[21] >> 30 | in[22] << 2, in[21],
in[20] >> 10 | in[21] << 22, in[19] >> 16 | in[20] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 26;
return in;
}
inline static const uint32_t* unpack27_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x7ffffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 2, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[5] >> 29 | in[6] << 3, in[5],
in[4] >> 7 | in[5] << 25, in[3] >> 12 | in[4] << 20,
in[2] >> 17 | in[3] << 15, in[1] >> 22 | in[2] << 10,
in[0] >> 27 | in[1] << 5, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 0, 4,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[12] >> 21 | in[13] << 11, in[11] >> 26 | in[12] << 6,
in[10] >> 31 | in[11] << 1, in[10],
in[9] >> 9 | in[10] << 23, in[8] >> 14 | in[9] << 18,
in[7] >> 19 | in[8] << 13, in[6] >> 24 | in[7] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
1, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[19] >> 13 | in[20] << 19, in[18] >> 18 | in[19] << 14,
in[17] >> 23 | in[18] << 9, in[16] >> 28 | in[17] << 4,
in[16], in[15] >> 6 | in[16] << 26,
in[14] >> 11 | in[15] << 21, in[13] >> 16 | in[14] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(5, 0, 0, 0,
0, 0, 3, 0);
reg_inls = _mm256_set_epi32(in[26], in[25] >> 10 | in[26] << 22,
in[24] >> 15 | in[25] << 17, in[23] >> 20 | in[24] << 12,
in[22] >> 25 | in[23] << 7, in[21] >> 30 | in[22] << 2,
in[21], in[20] >> 8 | in[21] << 24);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 27;
return in;
}
inline static const uint32_t* unpack28_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0xfffffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[6], in[5] >> 8 | in[6] << 24,
in[4] >> 12 | in[5] << 20, in[3] >> 16 | in[4] << 16,
in[2] >> 20 | in[3] << 12, in[1] >> 24 | in[2] << 8,
in[0] >> 28 | in[1] << 4, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[13], in[12] >> 8 | in[13] << 24,
in[11] >> 12 | in[12] << 20, in[10] >> 16 | in[11] << 16,
in[9] >> 20 | in[10] << 12, in[8] >> 24 | in[9] << 8,
in[7] >> 28 | in[8] << 4, in[7]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[20], in[19] >> 8 | in[20] << 24,
in[18] >> 12 | in[19] << 20, in[17] >> 16 | in[18] << 16,
in[16] >> 20 | in[17] << 12, in[15] >> 24 | in[16] << 8,
in[14] >> 28 | in[15] << 4, in[14]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(4, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[27], in[26] >> 8 | in[27] << 24,
in[25] >> 12 | in[26] << 20, in[24] >> 16 | in[25] << 16,
in[23] >> 20 | in[24] << 12, in[22] >> 24 | in[23] << 8,
in[21] >> 28 | in[22] << 4, in[21]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 28;
return in;
}
inline static const uint32_t* unpack29_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x1fffffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[6] >> 11 | in[7] << 21, in[5] >> 14 | in[6] << 18,
in[4] >> 17 | in[5] << 15, in[3] >> 20 | in[4] << 12,
in[2] >> 23 | in[3] << 9, in[1] >> 26 | in[2] << 6,
in[0] >> 29 | in[1] << 3, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
0, 2, 0, 0);
reg_inls = _mm256_set_epi32(in[13] >> 19 | in[14] << 13, in[12] >> 22 | in[13] << 10,
in[11] >> 25 | in[12] << 7, in[10] >> 28 | in[11] << 4,
in[9] >> 31 | in[10] << 1, in[9],
in[8] >> 5 | in[9] << 27, in[7] >> 8 | in[8] << 24);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 1, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[20] >> 27 | in[21] << 5, in[19] >> 30 | in[20] << 2,
in[19], in[18] >> 4 | in[19] << 28,
in[17] >> 7 | in[18] << 25, in[16] >> 10 | in[17] << 22,
in[15] >> 13 | in[16] << 19, in[14] >> 16 | in[15] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(3, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[28], in[27] >> 6 | in[28] << 26,
in[26] >> 9 | in[27] << 23, in[25] >> 12 | in[26] << 20,
in[24] >> 15 | in[25] << 17, in[23] >> 18 | in[24] << 14,
in[22] >> 21 | in[23] << 11, in[21] >> 24 | in[22] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 29;
return in;
}
inline static const uint32_t* unpack30_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x3fffffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[6] >> 18 | in[7] << 14, in[5] >> 20 | in[6] << 12,
in[4] >> 22 | in[5] << 10, in[3] >> 24 | in[4] << 8,
in[2] >> 26 | in[3] << 6, in[1] >> 28 | in[2] << 4,
in[0] >> 30 | in[1] << 2, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(2, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[14], in[13] >> 4 | in[14] << 28,
in[12] >> 6 | in[13] << 26, in[11] >> 8 | in[12] << 24,
in[10] >> 10 | in[11] << 22, in[9] >> 12 | in[10] << 20,
in[8] >> 14 | in[9] << 18, in[7] >> 16 | in[8] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[21] >> 18 | in[22] << 14, in[20] >> 20 | in[21] << 12,
in[19] >> 22 | in[20] << 10, in[18] >> 24 | in[19] << 8,
in[17] >> 26 | in[18] << 6, in[16] >> 28 | in[17] << 4,
in[15] >> 30 | in[16] << 2, in[15]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(2, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[29], in[28] >> 4 | in[29] << 28,
in[27] >> 6 | in[28] << 26, in[26] >> 8 | in[27] << 24,
in[25] >> 10 | in[26] << 22, in[24] >> 12 | in[25] << 20,
in[23] >> 14 | in[24] << 18, in[22] >> 16 | in[23] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 30;
return in;
}
inline static const uint32_t* unpack31_32_avx2(const uint32_t* in, uint32_t* out) {
uint32_t mask = 0x7fffffff;
__m256i reg_shifts, reg_inls, reg_masks;
__m256i results;
reg_masks = _mm256_set1_epi32(mask);
// shift the first 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[6] >> 25 | in[7] << 7, in[5] >> 26 | in[6] << 6,
in[4] >> 27 | in[5] << 5, in[3] >> 28 | in[4] << 4,
in[2] >> 29 | in[3] << 3, in[1] >> 30 | in[2] << 2,
in[0] >> 31 | in[1] << 1, in[0]);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the second 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[14] >> 17 | in[15] << 15, in[13] >> 18 | in[14] << 14,
in[12] >> 19 | in[13] << 13, in[11] >> 20 | in[12] << 12,
in[10] >> 21 | in[11] << 11, in[9] >> 22 | in[10] << 10,
in[8] >> 23 | in[9] << 9, in[7] >> 24 | in[8] << 8);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the third 8 outs
reg_shifts = _mm256_set_epi32(0, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[22] >> 9 | in[23] << 23, in[21] >> 10 | in[22] << 22,
in[20] >> 11 | in[21] << 21, in[19] >> 12 | in[20] << 20,
in[18] >> 13 | in[19] << 19, in[17] >> 14 | in[18] << 18,
in[16] >> 15 | in[17] << 17, in[15] >> 16 | in[16] << 16);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
// shift the last 8 outs
reg_shifts = _mm256_set_epi32(1, 0, 0, 0,
0, 0, 0, 0);
reg_inls = _mm256_set_epi32(in[30], in[29] >> 2 | in[30] << 30,
in[28] >> 3 | in[29] << 29, in[27] >> 4 | in[28] << 28,
in[26] >> 5 | in[27] << 27, in[25] >> 6 | in[26] << 26,
in[24] >> 7 | in[25] << 25, in[23] >> 8 | in[24] << 24);
results = _mm256_and_si256(_mm256_srlv_epi32(reg_inls, reg_shifts), reg_masks);
_mm256_storeu_si256((__m256i*)(out), results);
out += 8;
in += 31;
return in;
}
inline static const uint32_t* unpack32_32_avx2(const uint32_t* in, uint32_t* out) {
memcpy(out, in, 32 * sizeof(*out));
in += 32;
out += 32;
return in;
}
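// Unpacks `batch_size` values of `num_bits` bits each from `in` into `out`.
// batch_size is rounded down to a multiple of 32 (one unrolled kernel call
// per 32 values); the return value is the number of values actually
// unpacked, so callers can decode any remainder with a scalar fallback.
// num_bits must be in [0, 32].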
int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) {
batch_size = batch_size / 32 * 32;
int num_loops = batch_size / 32;
switch (num_bits) {
case 0:
for (int i = 0; i < num_loops; ++i) in = unpack0_32_avx2(in, out + i * 32);
break;
case 1:
for (int i = 0; i < num_loops; ++i) in = unpack1_32_avx2(in, out + i * 32);
break;
case 2:
for (int i = 0; i < num_loops; ++i) in = unpack2_32_avx2(in, out + i * 32);
break;
case 3:
for (int i = 0; i < num_loops; ++i) in = unpack3_32_avx2(in, out + i * 32);
break;
case 4:
for (int i = 0; i < num_loops; ++i) in = unpack4_32_avx2(in, out + i * 32);
break;
case 5:
for (int i = 0; i < num_loops; ++i) in = unpack5_32_avx2(in, out + i * 32);
break;
case 6:
for (int i = 0; i < num_loops; ++i) in = unpack6_32_avx2(in, out + i * 32);
break;
case 7:
for (int i = 0; i < num_loops; ++i) in = unpack7_32_avx2(in, out + i * 32);
break;
case 8:
for (int i = 0; i < num_loops; ++i) in = unpack8_32_avx2(in, out + i * 32);
break;
case 9:
for (int i = 0; i < num_loops; ++i) in = unpack9_32_avx2(in, out + i * 32);
break;
case 10:
for (int i = 0; i < num_loops; ++i) in = unpack10_32_avx2(in, out + i * 32);
break;
case 11:
for (int i = 0; i < num_loops; ++i) in = unpack11_32_avx2(in, out + i * 32);
break;
case 12:
for (int i = 0; i < num_loops; ++i) in = unpack12_32_avx2(in, out + i * 32);
break;
case 13:
for (int i = 0; i < num_loops; ++i) in = unpack13_32_avx2(in, out + i * 32);
break;
case 14:
for (int i = 0; i < num_loops; ++i) in = unpack14_32_avx2(in, out + i * 32);
break;
case 15:
for (int i = 0; i < num_loops; ++i) in = unpack15_32_avx2(in, out + i * 32);
break;
case 16:
for (int i = 0; i < num_loops; ++i) in = unpack16_32_avx2(in, out + i * 32);
break;
case 17:
for (int i = 0; i < num_loops; ++i) in = unpack17_32_avx2(in, out + i * 32);
break;
case 18:
for (int i = 0; i < num_loops; ++i) in = unpack18_32_avx2(in, out + i * 32);
break;
case 19:
for (int i = 0; i < num_loops; ++i) in = unpack19_32_avx2(in, out + i * 32);
break;
case 20:
for (int i = 0; i < num_loops; ++i) in = unpack20_32_avx2(in, out + i * 32);
break;
case 21:
for (int i = 0; i < num_loops; ++i) in = unpack21_32_avx2(in, out + i * 32);
break;
case 22:
for (int i = 0; i < num_loops; ++i) in = unpack22_32_avx2(in, out + i * 32);
break;
case 23:
for (int i = 0; i < num_loops; ++i) in = unpack23_32_avx2(in, out + i * 32);
break;
case 24:
for (int i = 0; i < num_loops; ++i) in = unpack24_32_avx2(in, out + i * 32);
break;
case 25:
for (int i = 0; i < num_loops; ++i) in = unpack25_32_avx2(in, out + i * 32);
break;
case 26:
for (int i = 0; i < num_loops; ++i) in = unpack26_32_avx2(in, out + i * 32);
break;
case 27:
for (int i = 0; i < num_loops; ++i) in = unpack27_32_avx2(in, out + i * 32);
break;
case 28:
for (int i = 0; i < num_loops; ++i) in = unpack28_32_avx2(in, out + i * 32);
break;
case 29:
for (int i = 0; i < num_loops; ++i) in = unpack29_32_avx2(in, out + i * 32);
break;
case 30:
for (int i = 0; i < num_loops; ++i) in = unpack30_32_avx2(in, out + i * 32);
break;
case 31:
for (int i = 0; i < num_loops; ++i) in = unpack31_32_avx2(in, out + i * 32);
break;
case 32:
for (int i = 0; i < num_loops; ++i) in = unpack32_32_avx2(in, out + i * 32);
break;
default:
// num_bits outside [0, 32] is unsupported; nothing is unpacked
break;
}
return batch_size;
}
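// A minimal usage sketch (illustrative; buffer names and sizes are assumed):
//
//   uint32_t packed[20];   // 64 values at 10 bits each = 640 bits = 20 words
//   uint32_t values[64];
//   int n = unpack32_avx2(packed, values, 64, 10);  // n == 64
//
// The input must hold at least batch_size * num_bits / 32 words, and the
// output at least batch_size (rounded down to a multiple of 32) entries.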