parquet/internal/utils/_lib/bit_packing_neon.c (2,550 lines of code) (raw):

// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include <stdint.h> #include <string.h> #include "arm_neon.h" inline const uint32_t* unpack0_32_neon(const uint32_t* in, uint32_t* out) { for (const uint32_t* end = out + 32; out != end; out++) { *out = 0; } return in; } inline static const uint32_t* unpack1_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 1, 2, 3}; uint32_t shifts_2nd[4] = {4, 5, 6, 7}; uint32_t shifts_3rd[4] = {8, 9, 10, 11}; uint32_t shifts_4th[4] = {12, 13, 14, 15}; uint32_t shifts_5th[4] = {16, 17, 18, 19}; uint32_t shifts_6th[4] = {20, 21, 22, 23}; uint32_t shifts_7th[4] = {24, 25, 26, 27}; uint32_t shifts_8th[4] = {28, 29, 30, 31}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[0] >> shifts_2nd[0]; ind[1] = in[0] >> shifts_2nd[1]; ind[2] = in[0] >> shifts_2nd[2]; ind[3] = in[0] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[0] >> shifts_3rd[0]; ind[1] = in[0] >> shifts_3rd[1]; ind[2] = in[0] >> shifts_3rd[2]; ind[3] = in[0] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[0] >> shifts_4th[0]; ind[1] = in[0] >> shifts_4th[1]; ind[2] = in[0] >> shifts_4th[2]; ind[3] = in[0] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[0] >> shifts_5th[0]; ind[1] = in[0] >> shifts_5th[1]; ind[2] = in[0] >> shifts_5th[2]; ind[3] = in[0] >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[0] >> shifts_6th[0]; ind[1] = in[0] >> shifts_6th[1]; ind[2] = in[0] >> shifts_6th[2]; ind[3] = in[0] >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[0] >> shifts_7th[0]; ind[1] = in[0] >> shifts_7th[1]; ind[2] = in[0] >> shifts_7th[2]; ind[3] = in[0] >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[0] >> shifts_8th[0]; ind[1] = in[0] >> shifts_8th[1]; ind[2] = in[0] >> shifts_8th[2]; ind[3] = in[0] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 1; return in; } inline static const uint32_t* unpack2_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 2, 4, 6}; uint32_t shifts_2nd[4] = {8, 10, 12, 14}; uint32_t shifts_3rd[4] = {16, 18, 20, 22}; uint32_t shifts_4th[4] = {24, 26, 28, 30}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[0] >> shifts_2nd[0]; ind[1] = in[0] >> shifts_2nd[1]; ind[2] = in[0] >> shifts_2nd[2]; ind[3] = in[0] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[0] >> shifts_3rd[0]; ind[1] = in[0] >> shifts_3rd[1]; ind[2] = in[0] >> shifts_3rd[2]; ind[3] = in[0] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[0] >> shifts_4th[0]; ind[1] = in[0] >> shifts_4th[1]; ind[2] = in[0] >> shifts_4th[2]; ind[3] = in[0] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[1] >> shifts_1st[0]; ind[1] = in[1] >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[1] >> shifts_2nd[0]; ind[1] = in[1] >> shifts_2nd[1]; ind[2] = in[1] >> shifts_2nd[2]; ind[3] = in[1] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[1] >> shifts_3rd[0]; ind[1] = in[1] >> shifts_3rd[1]; ind[2] = in[1] >> shifts_3rd[2]; ind[3] = in[1] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[1] >> shifts_4th[0]; ind[1] = in[1] >> shifts_4th[1]; ind[2] = in[1] >> shifts_4th[2]; ind[3] = in[1] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 2; return in; } inline static const uint32_t* unpack3_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 3, 6, 9}; uint32_t shifts_2nd[4] = {12, 15, 18, 21}; uint32_t shifts_3rd[4] = {24, 27, 0, 1}; uint32_t shifts_4th[4] = {4, 7, 10, 13}; uint32_t shifts_5th[4] = {16, 19, 22, 25}; uint32_t shifts_6th[4] = {28, 0, 2, 5}; uint32_t shifts_7th[4] = {8, 11, 14, 17}; uint32_t shifts_8th[4] = {20, 23, 26, 29}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[0] >> shifts_2nd[0]; ind[1] = in[0] >> shifts_2nd[1]; ind[2] = in[0] >> shifts_2nd[2]; ind[3] = in[0] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[0] >> shifts_3rd[0]; ind[1] = in[0] >> shifts_3rd[1]; ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_3rd[2]; ind[3] = in[1] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[1] >> shifts_4th[0]; ind[1] = in[1] >> shifts_4th[1]; ind[2] = in[1] >> shifts_4th[2]; ind[3] = in[1] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[1] >> shifts_5th[0]; ind[1] = in[1] >> shifts_5th[1]; ind[2] = in[1] >> shifts_5th[2]; ind[3] = in[1] >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[1] >> shifts_6th[0]; ind[1] = (in[1] >> 31 | in[2] << 1) >> shifts_6th[1]; ind[2] = in[2] >> shifts_6th[2]; ind[3] = in[2] >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[2] >> shifts_7th[0]; ind[1] = in[2] >> shifts_7th[1]; ind[2] = in[2] >> shifts_7th[2]; ind[3] = in[2] >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[2] >> shifts_8th[0]; ind[1] = in[2] >> shifts_8th[1]; ind[2] = in[2] >> shifts_8th[2]; ind[3] = in[2] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 3; return in; } inline static const uint32_t* unpack4_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0xf; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 4, 8, 12}; uint32_t shifts_2nd[4] = {16, 20, 24, 28}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[0] >> shifts_2nd[0]; ind[1] = in[0] >> shifts_2nd[1]; ind[2] = in[0] >> shifts_2nd[2]; ind[3] = in[0] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[1] >> shifts_1st[0]; ind[1] = in[1] >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[1] >> shifts_2nd[0]; ind[1] = in[1] >> shifts_2nd[1]; ind[2] = in[1] >> shifts_2nd[2]; ind[3] = in[1] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[2] >> shifts_1st[0]; ind[1] = in[2] >> shifts_1st[1]; ind[2] = in[2] >> shifts_1st[2]; ind[3] = in[2] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[2] >> shifts_2nd[0]; ind[1] = in[2] >> shifts_2nd[1]; ind[2] = in[2] >> shifts_2nd[2]; ind[3] = in[2] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[3] >> shifts_1st[0]; ind[1] = in[3] >> shifts_1st[1]; ind[2] = in[3] >> shifts_1st[2]; ind[3] = in[3] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[3] >> shifts_2nd[0]; ind[1] = in[3] >> shifts_2nd[1]; ind[2] = in[3] >> shifts_2nd[2]; ind[3] = in[3] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 4; return in; } inline static const uint32_t* unpack5_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1f; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 5, 10, 15}; uint32_t shifts_2nd[4] = {20, 25, 0, 3}; uint32_t shifts_3rd[4] = {8, 13, 18, 23}; uint32_t shifts_4th[4] = {0, 1, 6, 11}; uint32_t shifts_5th[4] = {16, 21, 26, 0}; uint32_t shifts_6th[4] = {4, 9, 14, 19}; uint32_t shifts_7th[4] = {24, 0, 2, 7}; uint32_t shifts_8th[4] = {12, 17, 22, 27}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[0] >> shifts_2nd[0]; ind[1] = in[0] >> shifts_2nd[1]; ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_2nd[2]; ind[3] = in[1] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[1] >> shifts_3rd[0]; ind[1] = in[1] >> shifts_3rd[1]; ind[2] = in[1] >> shifts_3rd[2]; ind[3] = in[1] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[1] >> 28 | in[2] << 4) >> shifts_4th[0]; ind[1] = in[2] >> shifts_4th[1]; ind[2] = in[2] >> shifts_4th[2]; ind[3] = in[2] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[2] >> shifts_5th[0]; ind[1] = in[2] >> shifts_5th[1]; ind[2] = in[2] >> shifts_5th[2]; ind[3] = (in[2] >> 31 | in[3] << 1) >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[3] >> shifts_6th[0]; ind[1] = in[3] >> shifts_6th[1]; ind[2] = in[3] >> shifts_6th[2]; ind[3] = in[3] >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[3] >> shifts_7th[0]; ind[1] = (in[3] >> 29 | in[4] << 3) >> shifts_7th[1]; ind[2] = in[4] >> shifts_7th[2]; ind[3] = in[4] >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[4] >> shifts_8th[0]; ind[1] = in[4] >> shifts_8th[1]; ind[2] = in[4] >> shifts_8th[2]; ind[3] = in[4] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 5; return in; } inline static const uint32_t* unpack6_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3f; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 6, 12, 18}; uint32_t shifts_2nd[4] = {24, 0, 4, 10}; uint32_t shifts_3rd[4] = {16, 22, 0, 2}; uint32_t shifts_4th[4] = {8, 14, 20, 26}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[0] >> shifts_2nd[0]; ind[1] = (in[0] >> 30 | in[1] << 2) >> shifts_2nd[1]; ind[2] = in[1] >> shifts_2nd[2]; ind[3] = in[1] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[1] >> shifts_3rd[0]; ind[1] = in[1] >> shifts_3rd[1]; ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_3rd[2]; ind[3] = in[2] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[2] >> shifts_4th[0]; ind[1] = in[2] >> shifts_4th[1]; ind[2] = in[2] >> shifts_4th[2]; ind[3] = in[2] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[3] >> shifts_1st[0]; ind[1] = in[3] >> shifts_1st[1]; ind[2] = in[3] >> shifts_1st[2]; ind[3] = in[3] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[3] >> shifts_2nd[0]; ind[1] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[1]; ind[2] = in[4] >> shifts_2nd[2]; ind[3] = in[4] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[4] >> shifts_3rd[0]; ind[1] = in[4] >> shifts_3rd[1]; ind[2] = (in[4] >> 28 | in[5] << 4) >> shifts_3rd[2]; ind[3] = in[5] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[5] >> shifts_4th[0]; ind[1] = in[5] >> shifts_4th[1]; ind[2] = in[5] >> shifts_4th[2]; ind[3] = in[5] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 6; return in; } inline static const uint32_t* unpack7_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7f; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 7, 14, 21}; uint32_t shifts_2nd[4] = {0, 3, 10, 17}; uint32_t shifts_3rd[4] = {24, 0, 6, 13}; uint32_t shifts_4th[4] = {20, 0, 2, 9}; uint32_t shifts_5th[4] = {16, 23, 0, 5}; uint32_t shifts_6th[4] = {12, 19, 0, 1}; uint32_t shifts_7th[4] = {8, 15, 22, 0}; uint32_t shifts_8th[4] = {4, 11, 18, 25}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[0] >> 28 | in[1] << 4) >> shifts_2nd[0]; ind[1] = in[1] >> shifts_2nd[1]; ind[2] = in[1] >> shifts_2nd[2]; ind[3] = in[1] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[1] >> shifts_3rd[0]; ind[1] = (in[1] >> 31 | in[2] << 1) >> shifts_3rd[1]; ind[2] = in[2] >> shifts_3rd[2]; ind[3] = in[2] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[2] >> shifts_4th[0]; ind[1] = (in[2] >> 27 | in[3] << 5) >> shifts_4th[1]; ind[2] = in[3] >> shifts_4th[2]; ind[3] = in[3] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[3] >> shifts_5th[0]; ind[1] = in[3] >> shifts_5th[1]; ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_5th[2]; ind[3] = in[4] >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[4] >> shifts_6th[0]; ind[1] = in[4] >> shifts_6th[1]; ind[2] = (in[4] >> 26 | in[5] << 6) >> shifts_6th[2]; ind[3] = in[5] >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[5] >> shifts_7th[0]; ind[1] = in[5] >> shifts_7th[1]; ind[2] = in[5] >> shifts_7th[2]; ind[3] = (in[5] >> 29 | in[6] << 3) >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[6] >> shifts_8th[0]; ind[1] = in[6] >> shifts_8th[1]; ind[2] = in[6] >> shifts_8th[2]; ind[3] = in[6] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 7; return in; } inline static const uint32_t* unpack8_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0xff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 8, 16, 24}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = in[0] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[1] >> shifts_1st[0]; ind[1] = in[1] >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[2] >> shifts_1st[0]; ind[1] = in[2] >> shifts_1st[1]; ind[2] = in[2] >> shifts_1st[2]; ind[3] = in[2] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[3] >> shifts_1st[0]; ind[1] = in[3] >> shifts_1st[1]; ind[2] = in[3] >> shifts_1st[2]; ind[3] = in[3] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[4] >> shifts_1st[0]; ind[1] = in[4] >> shifts_1st[1]; ind[2] = in[4] >> shifts_1st[2]; ind[3] = in[4] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[5] >> shifts_1st[0]; ind[1] = in[5] >> shifts_1st[1]; ind[2] = in[5] >> shifts_1st[2]; ind[3] = in[5] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[6] >> shifts_1st[0]; ind[1] = in[6] >> shifts_1st[1]; ind[2] = in[6] >> shifts_1st[2]; ind[3] = in[6] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[7] >> shifts_1st[0]; ind[1] = in[7] >> shifts_1st[1]; ind[2] = in[7] >> shifts_1st[2]; ind[3] = in[7] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 8; return in; } inline static const uint32_t* unpack9_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1ff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 9, 18, 0}; uint32_t shifts_2nd[4] = {4, 13, 22, 0}; uint32_t shifts_3rd[4] = {8, 17, 0, 3}; uint32_t shifts_4th[4] = {12, 21, 0, 7}; uint32_t shifts_5th[4] = {16, 0, 2, 11}; uint32_t shifts_6th[4] = {20, 0, 6, 15}; uint32_t shifts_7th[4] = {0, 1, 10, 19}; uint32_t shifts_8th[4] = {0, 5, 14, 23}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = (in[0] >> 27 | in[1] << 5) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[1] >> shifts_2nd[0]; ind[1] = in[1] >> shifts_2nd[1]; ind[2] = in[1] >> shifts_2nd[2]; ind[3] = (in[1] >> 31 | in[2] << 1) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[2] >> shifts_3rd[0]; ind[1] = in[2] >> shifts_3rd[1]; ind[2] = (in[2] >> 26 | in[3] << 6) >> shifts_3rd[2]; ind[3] = in[3] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[3] >> shifts_4th[0]; ind[1] = in[3] >> shifts_4th[1]; ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_4th[2]; ind[3] = in[4] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[4] >> shifts_5th[0]; ind[1] = (in[4] >> 25 | in[5] << 7) >> shifts_5th[1]; ind[2] = in[5] >> shifts_5th[2]; ind[3] = in[5] >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[5] >> shifts_6th[0]; ind[1] = (in[5] >> 29 | in[6] << 3) >> shifts_6th[1]; ind[2] = in[6] >> shifts_6th[2]; ind[3] = in[6] >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_7th[0]; ind[1] = in[7] >> shifts_7th[1]; ind[2] = in[7] >> shifts_7th[2]; ind[3] = in[7] >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[7] >> 28 | in[8] << 4) >> shifts_8th[0]; ind[1] = in[8] >> shifts_8th[1]; ind[2] = in[8] >> shifts_8th[2]; ind[3] = in[8] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 9; return in; } inline static const uint32_t* unpack10_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3ff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 10, 20, 0}; uint32_t shifts_2nd[4] = {8, 18, 0, 6}; uint32_t shifts_3rd[4] = {16, 0, 4, 14}; uint32_t shifts_4th[4] = {0, 2, 12, 22}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[0] >> shifts_1st[2]; ind[3] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[1] >> shifts_2nd[0]; ind[1] = in[1] >> shifts_2nd[1]; ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[2]; ind[3] = in[2] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[2] >> shifts_3rd[0]; ind[1] = (in[2] >> 26 | in[3] << 6) >> shifts_3rd[1]; ind[2] = in[3] >> shifts_3rd[2]; ind[3] = in[3] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_4th[0]; ind[1] = in[4] >> shifts_4th[1]; ind[2] = in[4] >> shifts_4th[2]; ind[3] = in[4] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[5] >> shifts_1st[0]; ind[1] = in[5] >> shifts_1st[1]; ind[2] = in[5] >> shifts_1st[2]; ind[3] = (in[5] >> 30 | in[6] << 2) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[6] >> shifts_2nd[0]; ind[1] = in[6] >> shifts_2nd[1]; ind[2] = (in[6] >> 28 | in[7] << 4) >> shifts_2nd[2]; ind[3] = in[7] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[7] >> shifts_3rd[0]; ind[1] = (in[7] >> 26 | in[8] << 6) >> shifts_3rd[1]; ind[2] = in[8] >> shifts_3rd[2]; ind[3] = in[8] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[8] >> 24 | in[9] << 8) >> shifts_4th[0]; ind[1] = in[9] >> shifts_4th[1]; ind[2] = in[9] >> shifts_4th[2]; ind[3] = in[9] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 10; return in; } inline static const uint32_t* unpack11_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7ff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 11, 0, 1}; uint32_t shifts_2nd[4] = {12, 0, 2, 13}; uint32_t shifts_3rd[4] = {0, 3, 14, 0}; uint32_t shifts_4th[4] = {4, 15, 0, 5}; uint32_t shifts_5th[4] = {16, 0, 6, 17}; uint32_t shifts_6th[4] = {0, 7, 18, 0}; uint32_t shifts_7th[4] = {8, 19, 0, 9}; uint32_t shifts_8th[4] = {20, 0, 10, 21}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = (in[0] >> 22 | in[1] << 10) >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[1] >> shifts_2nd[0]; ind[1] = (in[1] >> 23 | in[2] << 9) >> shifts_2nd[1]; ind[2] = in[2] >> shifts_2nd[2]; ind[3] = in[2] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[2] >> 24 | in[3] << 8) >> shifts_3rd[0]; ind[1] = in[3] >> shifts_3rd[1]; ind[2] = in[3] >> shifts_3rd[2]; ind[3] = (in[3] >> 25 | in[4] << 7) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[4] >> shifts_4th[0]; ind[1] = in[4] >> shifts_4th[1]; ind[2] = (in[4] >> 26 | in[5] << 6) >> shifts_4th[2]; ind[3] = in[5] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[5] >> shifts_5th[0]; ind[1] = (in[5] >> 27 | in[6] << 5) >> shifts_5th[1]; ind[2] = in[6] >> shifts_5th[2]; ind[3] = in[6] >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[6] >> 28 | in[7] << 4) >> shifts_6th[0]; ind[1] = in[7] >> shifts_6th[1]; ind[2] = in[7] >> shifts_6th[2]; ind[3] = (in[7] >> 29 | in[8] << 3) >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[8] >> shifts_7th[0]; ind[1] = in[8] >> shifts_7th[1]; ind[2] = (in[8] >> 30 | in[9] << 2) >> shifts_7th[2]; ind[3] = in[9] >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[9] >> shifts_8th[0]; ind[1] = (in[9] >> 31 | in[10] << 1) >> shifts_8th[1]; ind[2] = in[10] >> shifts_8th[2]; ind[3] = in[10] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 11; return in; } inline static const uint32_t* unpack12_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0xfff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 12, 0, 4}; uint32_t shifts_2nd[4] = {16, 0, 8, 20}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = (in[0] >> 24 | in[1] << 8) >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[1] >> shifts_2nd[0]; ind[1] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[1]; ind[2] = in[2] >> shifts_2nd[2]; ind[3] = in[2] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[3] >> shifts_1st[0]; ind[1] = in[3] >> shifts_1st[1]; ind[2] = (in[3] >> 24 | in[4] << 8) >> shifts_1st[2]; ind[3] = in[4] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[4] >> shifts_2nd[0]; ind[1] = (in[4] >> 28 | in[5] << 4) >> shifts_2nd[1]; ind[2] = in[5] >> shifts_2nd[2]; ind[3] = in[5] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[6] >> shifts_1st[0]; ind[1] = in[6] >> shifts_1st[1]; ind[2] = (in[6] >> 24 | in[7] << 8) >> shifts_1st[2]; ind[3] = in[7] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[7] >> shifts_2nd[0]; ind[1] = (in[7] >> 28 | in[8] << 4) >> shifts_2nd[1]; ind[2] = in[8] >> shifts_2nd[2]; ind[3] = in[8] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[9] >> shifts_1st[0]; ind[1] = in[9] >> shifts_1st[1]; ind[2] = (in[9] >> 24 | in[10] << 8) >> shifts_1st[2]; ind[3] = in[10] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[10] >> shifts_2nd[0]; ind[1] = (in[10] >> 28 | in[11] << 4) >> shifts_2nd[1]; ind[2] = in[11] >> shifts_2nd[2]; ind[3] = in[11] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 12; return in; } inline static const uint32_t* unpack13_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1fff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 13, 0, 7}; uint32_t shifts_2nd[4] = {0, 1, 14, 0}; uint32_t shifts_3rd[4] = {8, 0, 2, 15}; uint32_t shifts_4th[4] = {0, 9, 0, 3}; uint32_t shifts_5th[4] = {16, 0, 10, 0}; uint32_t shifts_6th[4] = {4, 17, 0, 11}; uint32_t shifts_7th[4] = {0, 5, 18, 0}; uint32_t shifts_8th[4] = {12, 0, 6, 19}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = (in[0] >> 26 | in[1] << 6) >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[1] >> 20 | in[2] << 12) >> shifts_2nd[0]; ind[1] = in[2] >> shifts_2nd[1]; ind[2] = in[2] >> shifts_2nd[2]; ind[3] = (in[2] >> 27 | in[3] << 5) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[3] >> shifts_3rd[0]; ind[1] = (in[3] >> 21 | in[4] << 11) >> shifts_3rd[1]; ind[2] = in[4] >> shifts_3rd[2]; ind[3] = in[4] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[4] >> 28 | in[5] << 4) >> shifts_4th[0]; ind[1] = in[5] >> shifts_4th[1]; ind[2] = (in[5] >> 22 | in[6] << 10) >> shifts_4th[2]; ind[3] = in[6] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[6] >> shifts_5th[0]; ind[1] = (in[6] >> 29 | in[7] << 3) >> shifts_5th[1]; ind[2] = in[7] >> shifts_5th[2]; ind[3] = (in[7] >> 23 | in[8] << 9) >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[8] >> shifts_6th[0]; ind[1] = in[8] >> shifts_6th[1]; ind[2] = (in[8] >> 30 | in[9] << 2) >> shifts_6th[2]; ind[3] = in[9] >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[9] >> 24 | in[10] << 8) >> shifts_7th[0]; ind[1] = in[10] >> shifts_7th[1]; ind[2] = in[10] >> shifts_7th[2]; ind[3] = (in[10] >> 31 | in[11] << 1) >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[11] >> shifts_8th[0]; ind[1] = (in[11] >> 25 | in[12] << 7) >> shifts_8th[1]; ind[2] = in[12] >> shifts_8th[2]; ind[3] = in[12] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 13; return in; } inline static const uint32_t* unpack14_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3fff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 14, 0, 10}; uint32_t shifts_2nd[4] = {0, 6, 0, 2}; uint32_t shifts_3rd[4] = {16, 0, 12, 0}; uint32_t shifts_4th[4] = {8, 0, 4, 18}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = (in[0] >> 28 | in[1] << 4) >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[1] >> 24 | in[2] << 8) >> shifts_2nd[0]; ind[1] = in[2] >> shifts_2nd[1]; ind[2] = (in[2] >> 20 | in[3] << 12) >> shifts_2nd[2]; ind[3] = in[3] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[3] >> shifts_3rd[0]; ind[1] = (in[3] >> 30 | in[4] << 2) >> shifts_3rd[1]; ind[2] = in[4] >> shifts_3rd[2]; ind[3] = (in[4] >> 26 | in[5] << 6) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[5] >> shifts_4th[0]; ind[1] = (in[5] >> 22 | in[6] << 10) >> shifts_4th[1]; ind[2] = in[6] >> shifts_4th[2]; ind[3] = in[6] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[7] >> shifts_1st[0]; ind[1] = in[7] >> shifts_1st[1]; ind[2] = (in[7] >> 28 | in[8] << 4) >> shifts_1st[2]; ind[3] = in[8] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[8] >> 24 | in[9] << 8) >> shifts_2nd[0]; ind[1] = in[9] >> shifts_2nd[1]; ind[2] = (in[9] >> 20 | in[10] << 12) >> shifts_2nd[2]; ind[3] = in[10] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[10] >> shifts_3rd[0]; ind[1] = (in[10] >> 30 | in[11] << 2) >> shifts_3rd[1]; ind[2] = in[11] >> shifts_3rd[2]; ind[3] = (in[11] >> 26 | in[12] << 6) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[12] >> shifts_4th[0]; ind[1] = (in[12] >> 22 | in[13] << 10) >> shifts_4th[1]; ind[2] = in[13] >> shifts_4th[2]; ind[3] = in[13] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 14; return in; } inline static const uint32_t* unpack15_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7fff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 15, 0, 13}; uint32_t shifts_2nd[4] = {0, 11, 0, 9}; uint32_t shifts_3rd[4] = {0, 7, 0, 5}; uint32_t shifts_4th[4] = {0, 3, 0, 1}; uint32_t shifts_5th[4] = {16, 0, 14, 0}; uint32_t shifts_6th[4] = {12, 0, 10, 0}; uint32_t shifts_7th[4] = {8, 0, 6, 0}; uint32_t shifts_8th[4] = {4, 0, 2, 17}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[1] >> 28 | in[2] << 4) >> shifts_2nd[0]; ind[1] = in[2] >> shifts_2nd[1]; ind[2] = (in[2] >> 26 | in[3] << 6) >> shifts_2nd[2]; ind[3] = in[3] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_3rd[0]; ind[1] = in[4] >> shifts_3rd[1]; ind[2] = (in[4] >> 22 | in[5] << 10) >> shifts_3rd[2]; ind[3] = in[5] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[5] >> 20 | in[6] << 12) >> shifts_4th[0]; ind[1] = in[6] >> shifts_4th[1]; ind[2] = (in[6] >> 18 | in[7] << 14) >> shifts_4th[2]; ind[3] = in[7] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[7] >> shifts_5th[0]; ind[1] = (in[7] >> 31 | in[8] << 1) >> shifts_5th[1]; ind[2] = in[8] >> shifts_5th[2]; ind[3] = (in[8] >> 29 | in[9] << 3) >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[9] >> shifts_6th[0]; ind[1] = (in[9] >> 27 | in[10] << 5) >> shifts_6th[1]; ind[2] = in[10] >> shifts_6th[2]; ind[3] = (in[10] >> 25 | in[11] << 7) >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[11] >> shifts_7th[0]; ind[1] = (in[11] >> 23 | in[12] << 9) >> shifts_7th[1]; ind[2] = in[12] >> shifts_7th[2]; ind[3] = (in[12] >> 21 | in[13] << 11) >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[13] >> shifts_8th[0]; ind[1] = (in[13] >> 19 | in[14] << 13) >> shifts_8th[1]; ind[2] = in[14] >> shifts_8th[2]; ind[3] = in[14] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 15; return in; } inline static const uint32_t* unpack16_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0xffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 16, 0, 16}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = in[0] >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = in[1] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[2] >> shifts_1st[0]; ind[1] = in[2] >> shifts_1st[1]; ind[2] = in[3] >> shifts_1st[2]; ind[3] = in[3] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[4] >> shifts_1st[0]; ind[1] = in[4] >> shifts_1st[1]; ind[2] = in[5] >> shifts_1st[2]; ind[3] = in[5] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[6] >> shifts_1st[0]; ind[1] = in[6] >> shifts_1st[1]; ind[2] = in[7] >> shifts_1st[2]; ind[3] = in[7] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[8] >> shifts_1st[0]; ind[1] = in[8] >> shifts_1st[1]; ind[2] = in[9] >> shifts_1st[2]; ind[3] = in[9] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[10] >> shifts_1st[0]; ind[1] = in[10] >> shifts_1st[1]; ind[2] = in[11] >> shifts_1st[2]; ind[3] = in[11] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[12] >> shifts_1st[0]; ind[1] = in[12] >> shifts_1st[1]; ind[2] = in[13] >> shifts_1st[2]; ind[3] = in[13] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[14] >> shifts_1st[0]; ind[1] = in[14] >> shifts_1st[1]; ind[2] = in[15] >> shifts_1st[2]; ind[3] = in[15] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 16; return in; } inline static const uint32_t* unpack17_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1ffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 2, 0}; uint32_t shifts_2nd[4] = {4, 0, 6, 0}; uint32_t shifts_3rd[4] = {8, 0, 10, 0}; uint32_t shifts_4th[4] = {12, 0, 14, 0}; uint32_t shifts_5th[4] = {0, 1, 0, 3}; uint32_t shifts_6th[4] = {0, 5, 0, 7}; uint32_t shifts_7th[4] = {0, 9, 0, 11}; uint32_t shifts_8th[4] = {0, 13, 0, 15}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 17 | in[1] << 15) >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = (in[1] >> 19 | in[2] << 13) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[2] >> shifts_2nd[0]; ind[1] = (in[2] >> 21 | in[3] << 11) >> shifts_2nd[1]; ind[2] = in[3] >> shifts_2nd[2]; ind[3] = (in[3] >> 23 | in[4] << 9) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[4] >> shifts_3rd[0]; ind[1] = (in[4] >> 25 | in[5] << 7) >> shifts_3rd[1]; ind[2] = in[5] >> shifts_3rd[2]; ind[3] = (in[5] >> 27 | in[6] << 5) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[6] >> shifts_4th[0]; ind[1] = (in[6] >> 29 | in[7] << 3) >> shifts_4th[1]; ind[2] = in[7] >> shifts_4th[2]; ind[3] = (in[7] >> 31 | in[8] << 1) >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = (in[8] >> 16 | in[9] << 16) >> shifts_5th[0]; ind[1] = in[9] >> shifts_5th[1]; ind[2] = (in[9] >> 18 | in[10] << 14) >> shifts_5th[2]; ind[3] = in[10] >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[10] >> 20 | in[11] << 12) >> shifts_6th[0]; ind[1] = in[11] >> shifts_6th[1]; ind[2] = (in[11] >> 22 | in[12] << 10) >> shifts_6th[2]; ind[3] = in[12] >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[12] >> 24 | in[13] << 8) >> shifts_7th[0]; ind[1] = in[13] >> shifts_7th[1]; ind[2] = (in[13] >> 26 | in[14] << 6) >> shifts_7th[2]; ind[3] = in[14] >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[14] >> 28 | in[15] << 4) >> shifts_8th[0]; ind[1] = in[15] >> shifts_8th[1]; ind[2] = (in[15] >> 30 | in[16] << 2) >> shifts_8th[2]; ind[3] = in[16] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 17; return in; } inline static const uint32_t* unpack18_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3ffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 4, 0}; uint32_t shifts_2nd[4] = {8, 0, 12, 0}; uint32_t shifts_3rd[4] = {0, 2, 0, 6}; uint32_t shifts_4th[4] = {0, 10, 0, 14}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 18 | in[1] << 14) >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = (in[1] >> 22 | in[2] << 10) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[2] >> shifts_2nd[0]; ind[1] = (in[2] >> 26 | in[3] << 6) >> shifts_2nd[1]; ind[2] = in[3] >> shifts_2nd[2]; ind[3] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[4] >> 16 | in[5] << 16) >> shifts_3rd[0]; ind[1] = in[5] >> shifts_3rd[1]; ind[2] = (in[5] >> 20 | in[6] << 12) >> shifts_3rd[2]; ind[3] = in[6] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_4th[0]; ind[1] = in[7] >> shifts_4th[1]; ind[2] = (in[7] >> 28 | in[8] << 4) >> shifts_4th[2]; ind[3] = in[8] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[9] >> shifts_1st[0]; ind[1] = (in[9] >> 18 | in[10] << 14) >> shifts_1st[1]; ind[2] = in[10] >> shifts_1st[2]; ind[3] = (in[10] >> 22 | in[11] << 10) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[11] >> shifts_2nd[0]; ind[1] = (in[11] >> 26 | in[12] << 6) >> shifts_2nd[1]; ind[2] = in[12] >> shifts_2nd[2]; ind[3] = (in[12] >> 30 | in[13] << 2) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[13] >> 16 | in[14] << 16) >> shifts_3rd[0]; ind[1] = in[14] >> shifts_3rd[1]; ind[2] = (in[14] >> 20 | in[15] << 12) >> shifts_3rd[2]; ind[3] = in[15] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[15] >> 24 | in[16] << 8) >> shifts_4th[0]; ind[1] = in[16] >> shifts_4th[1]; ind[2] = (in[16] >> 28 | in[17] << 4) >> shifts_4th[2]; ind[3] = in[17] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 18; return in; } inline static const uint32_t* unpack19_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7ffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 6, 0}; uint32_t shifts_2nd[4] = {12, 0, 0, 5}; uint32_t shifts_3rd[4] = {0, 11, 0, 0}; uint32_t shifts_4th[4] = {4, 0, 10, 0}; uint32_t shifts_5th[4] = {0, 3, 0, 9}; uint32_t shifts_6th[4] = {0, 0, 2, 0}; uint32_t shifts_7th[4] = {8, 0, 0, 1}; uint32_t shifts_8th[4] = {0, 7, 0, 13}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 19 | in[1] << 13) >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = (in[1] >> 25 | in[2] << 7) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[2] >> shifts_2nd[0]; ind[1] = (in[2] >> 31 | in[3] << 1) >> shifts_2nd[1]; ind[2] = (in[3] >> 18 | in[4] << 14) >> shifts_2nd[2]; ind[3] = in[4] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[4] >> 24 | in[5] << 8) >> shifts_3rd[0]; ind[1] = in[5] >> shifts_3rd[1]; ind[2] = (in[5] >> 30 | in[6] << 2) >> shifts_3rd[2]; ind[3] = (in[6] >> 17 | in[7] << 15) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[7] >> shifts_4th[0]; ind[1] = (in[7] >> 23 | in[8] << 9) >> shifts_4th[1]; ind[2] = in[8] >> shifts_4th[2]; ind[3] = (in[8] >> 29 | in[9] << 3) >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = (in[9] >> 16 | in[10] << 16) >> shifts_5th[0]; ind[1] = in[10] >> shifts_5th[1]; ind[2] = (in[10] >> 22 | in[11] << 10) >> shifts_5th[2]; ind[3] = in[11] >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[11] >> 28 | in[12] << 4) >> shifts_6th[0]; ind[1] = (in[12] >> 15 | in[13] << 17) >> shifts_6th[1]; ind[2] = in[13] >> shifts_6th[2]; ind[3] = (in[13] >> 21 | in[14] << 11) >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[14] >> shifts_7th[0]; ind[1] = (in[14] >> 27 | in[15] << 5) >> shifts_7th[1]; ind[2] = (in[15] >> 14 | in[16] << 18) >> shifts_7th[2]; ind[3] = in[16] >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[16] >> 20 | in[17] << 12) >> shifts_8th[0]; ind[1] = in[17] >> shifts_8th[1]; ind[2] = (in[17] >> 26 | in[18] << 6) >> shifts_8th[2]; ind[3] = in[18] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 19; return in; } inline static const uint32_t* unpack20_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0xfffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 8, 0}; uint32_t shifts_2nd[4] = {0, 4, 0, 12}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 20 | in[1] << 12) >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = (in[1] >> 28 | in[2] << 4) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[2] >> 16 | in[3] << 16) >> shifts_2nd[0]; ind[1] = in[3] >> shifts_2nd[1]; ind[2] = (in[3] >> 24 | in[4] << 8) >> shifts_2nd[2]; ind[3] = in[4] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[5] >> shifts_1st[0]; ind[1] = (in[5] >> 20 | in[6] << 12) >> shifts_1st[1]; ind[2] = in[6] >> shifts_1st[2]; ind[3] = (in[6] >> 28 | in[7] << 4) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[7] >> 16 | in[8] << 16) >> shifts_2nd[0]; ind[1] = in[8] >> shifts_2nd[1]; ind[2] = (in[8] >> 24 | in[9] << 8) >> shifts_2nd[2]; ind[3] = in[9] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[10] >> shifts_1st[0]; ind[1] = (in[10] >> 20 | in[11] << 12) >> shifts_1st[1]; ind[2] = in[11] >> shifts_1st[2]; ind[3] = (in[11] >> 28 | in[12] << 4) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[12] >> 16 | in[13] << 16) >> shifts_2nd[0]; ind[1] = in[13] >> shifts_2nd[1]; ind[2] = (in[13] >> 24 | in[14] << 8) >> shifts_2nd[2]; ind[3] = in[14] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[15] >> shifts_1st[0]; ind[1] = (in[15] >> 20 | in[16] << 12) >> shifts_1st[1]; ind[2] = in[16] >> shifts_1st[2]; ind[3] = (in[16] >> 28 | in[17] << 4) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[17] >> 16 | in[18] << 16) >> shifts_2nd[0]; ind[1] = in[18] >> shifts_2nd[1]; ind[2] = (in[18] >> 24 | in[19] << 8) >> shifts_2nd[2]; ind[3] = in[19] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 20; return in; } inline static const uint32_t* unpack21_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1fffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 10, 0}; uint32_t shifts_2nd[4] = {0, 9, 0, 0}; uint32_t shifts_3rd[4] = {8, 0, 0, 7}; uint32_t shifts_4th[4] = {0, 0, 6, 0}; uint32_t shifts_5th[4] = {0, 5, 0, 0}; uint32_t shifts_6th[4] = {4, 0, 0, 3}; uint32_t shifts_7th[4] = {0, 0, 2, 0}; uint32_t shifts_8th[4] = {0, 1, 0, 11}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 21 | in[1] << 11) >> shifts_1st[1]; ind[2] = in[1] >> shifts_1st[2]; ind[3] = (in[1] >> 31 | in[2] << 1) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[2] >> 20 | in[3] << 12) >> shifts_2nd[0]; ind[1] = in[3] >> shifts_2nd[1]; ind[2] = (in[3] >> 30 | in[4] << 2) >> shifts_2nd[2]; ind[3] = (in[4] >> 19 | in[5] << 13) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[5] >> shifts_3rd[0]; ind[1] = (in[5] >> 29 | in[6] << 3) >> shifts_3rd[1]; ind[2] = (in[6] >> 18 | in[7] << 14) >> shifts_3rd[2]; ind[3] = in[7] >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[7] >> 28 | in[8] << 4) >> shifts_4th[0]; ind[1] = (in[8] >> 17 | in[9] << 15) >> shifts_4th[1]; ind[2] = in[9] >> shifts_4th[2]; ind[3] = (in[9] >> 27 | in[10] << 5) >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = (in[10] >> 16 | in[11] << 16) >> shifts_5th[0]; ind[1] = in[11] >> shifts_5th[1]; ind[2] = (in[11] >> 26 | in[12] << 6) >> shifts_5th[2]; ind[3] = (in[12] >> 15 | in[13] << 17) >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[13] >> shifts_6th[0]; ind[1] = (in[13] >> 25 | in[14] << 7) >> shifts_6th[1]; ind[2] = (in[14] >> 14 | in[15] << 18) >> shifts_6th[2]; ind[3] = in[15] >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[15] >> 24 | in[16] << 8) >> shifts_7th[0]; ind[1] = (in[16] >> 13 | in[17] << 19) >> shifts_7th[1]; ind[2] = in[17] >> shifts_7th[2]; ind[3] = (in[17] >> 23 | in[18] << 9) >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[18] >> 12 | in[19] << 20) >> shifts_8th[0]; ind[1] = in[19] >> shifts_8th[1]; ind[2] = (in[19] >> 22 | in[20] << 10) >> shifts_8th[2]; ind[3] = in[20] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 21; return in; } inline static const uint32_t* unpack22_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3fffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 2}; uint32_t shifts_2nd[4] = {0, 0, 4, 0}; uint32_t shifts_3rd[4] = {0, 6, 0, 0}; uint32_t shifts_4th[4] = {8, 0, 0, 10}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 22 | in[1] << 10) >> shifts_1st[1]; ind[2] = (in[1] >> 12 | in[2] << 20) >> shifts_1st[2]; ind[3] = in[2] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[2] >> 24 | in[3] << 8) >> shifts_2nd[0]; ind[1] = (in[3] >> 14 | in[4] << 18) >> shifts_2nd[1]; ind[2] = in[4] >> shifts_2nd[2]; ind[3] = (in[4] >> 26 | in[5] << 6) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[5] >> 16 | in[6] << 16) >> shifts_3rd[0]; ind[1] = in[6] >> shifts_3rd[1]; ind[2] = (in[6] >> 28 | in[7] << 4) >> shifts_3rd[2]; ind[3] = (in[7] >> 18 | in[8] << 14) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[8] >> shifts_4th[0]; ind[1] = (in[8] >> 30 | in[9] << 2) >> shifts_4th[1]; ind[2] = (in[9] >> 20 | in[10] << 12) >> shifts_4th[2]; ind[3] = in[10] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[11] >> shifts_1st[0]; ind[1] = (in[11] >> 22 | in[12] << 10) >> shifts_1st[1]; ind[2] = (in[12] >> 12 | in[13] << 20) >> shifts_1st[2]; ind[3] = in[13] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[13] >> 24 | in[14] << 8) >> shifts_2nd[0]; ind[1] = (in[14] >> 14 | in[15] << 18) >> shifts_2nd[1]; ind[2] = in[15] >> shifts_2nd[2]; ind[3] = (in[15] >> 26 | in[16] << 6) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[16] >> 16 | in[17] << 16) >> shifts_3rd[0]; ind[1] = in[17] >> shifts_3rd[1]; ind[2] = (in[17] >> 28 | in[18] << 4) >> shifts_3rd[2]; ind[3] = (in[18] >> 18 | in[19] << 14) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[19] >> shifts_4th[0]; ind[1] = (in[19] >> 30 | in[20] << 2) >> shifts_4th[1]; ind[2] = (in[20] >> 20 | in[21] << 12) >> shifts_4th[2]; ind[3] = in[21] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 22; return in; } inline static const uint32_t* unpack23_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7fffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 5}; uint32_t shifts_2nd[4] = {0, 0, 0, 1}; uint32_t shifts_3rd[4] = {0, 0, 6, 0}; uint32_t shifts_4th[4] = {0, 0, 2, 0}; uint32_t shifts_5th[4] = {0, 7, 0, 0}; uint32_t shifts_6th[4] = {0, 3, 0, 0}; uint32_t shifts_7th[4] = {8, 0, 0, 0}; uint32_t shifts_8th[4] = {4, 0, 0, 9}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 23 | in[1] << 9) >> shifts_1st[1]; ind[2] = (in[1] >> 14 | in[2] << 18) >> shifts_1st[2]; ind[3] = in[2] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[2] >> 28 | in[3] << 4) >> shifts_2nd[0]; ind[1] = (in[3] >> 19 | in[4] << 13) >> shifts_2nd[1]; ind[2] = (in[4] >> 10 | in[5] << 22) >> shifts_2nd[2]; ind[3] = in[5] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[5] >> 24 | in[6] << 8) >> shifts_3rd[0]; ind[1] = (in[6] >> 15 | in[7] << 17) >> shifts_3rd[1]; ind[2] = in[7] >> shifts_3rd[2]; ind[3] = (in[7] >> 29 | in[8] << 3) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[8] >> 20 | in[9] << 12) >> shifts_4th[0]; ind[1] = (in[9] >> 11 | in[10] << 21) >> shifts_4th[1]; ind[2] = in[10] >> shifts_4th[2]; ind[3] = (in[10] >> 25 | in[11] << 7) >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = (in[11] >> 16 | in[12] << 16) >> shifts_5th[0]; ind[1] = in[12] >> shifts_5th[1]; ind[2] = (in[12] >> 30 | in[13] << 2) >> shifts_5th[2]; ind[3] = (in[13] >> 21 | in[14] << 11) >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[14] >> 12 | in[15] << 20) >> shifts_6th[0]; ind[1] = in[15] >> shifts_6th[1]; ind[2] = (in[15] >> 26 | in[16] << 6) >> shifts_6th[2]; ind[3] = (in[16] >> 17 | in[17] << 15) >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[17] >> shifts_7th[0]; ind[1] = (in[17] >> 31 | in[18] << 1) >> shifts_7th[1]; ind[2] = (in[18] >> 22 | in[19] << 10) >> shifts_7th[2]; ind[3] = (in[19] >> 13 | in[20] << 19) >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[20] >> shifts_8th[0]; ind[1] = (in[20] >> 27 | in[21] << 5) >> shifts_8th[1]; ind[2] = (in[21] >> 18 | in[22] << 14) >> shifts_8th[2]; ind[3] = in[22] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 23; return in; } inline static const uint32_t* unpack24_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0xffffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 8}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 24 | in[1] << 8) >> shifts_1st[1]; ind[2] = (in[1] >> 16 | in[2] << 16) >> shifts_1st[2]; ind[3] = in[2] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[3] >> shifts_1st[0]; ind[1] = (in[3] >> 24 | in[4] << 8) >> shifts_1st[1]; ind[2] = (in[4] >> 16 | in[5] << 16) >> shifts_1st[2]; ind[3] = in[5] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[6] >> shifts_1st[0]; ind[1] = (in[6] >> 24 | in[7] << 8) >> shifts_1st[1]; ind[2] = (in[7] >> 16 | in[8] << 16) >> shifts_1st[2]; ind[3] = in[8] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[9] >> shifts_1st[0]; ind[1] = (in[9] >> 24 | in[10] << 8) >> shifts_1st[1]; ind[2] = (in[10] >> 16 | in[11] << 16) >> shifts_1st[2]; ind[3] = in[11] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[12] >> shifts_1st[0]; ind[1] = (in[12] >> 24 | in[13] << 8) >> shifts_1st[1]; ind[2] = (in[13] >> 16 | in[14] << 16) >> shifts_1st[2]; ind[3] = in[14] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = in[15] >> shifts_1st[0]; ind[1] = (in[15] >> 24 | in[16] << 8) >> shifts_1st[1]; ind[2] = (in[16] >> 16 | in[17] << 16) >> shifts_1st[2]; ind[3] = in[17] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[18] >> shifts_1st[0]; ind[1] = (in[18] >> 24 | in[19] << 8) >> shifts_1st[1]; ind[2] = (in[19] >> 16 | in[20] << 16) >> shifts_1st[2]; ind[3] = in[20] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = in[21] >> shifts_1st[0]; ind[1] = (in[21] >> 24 | in[22] << 8) >> shifts_1st[1]; ind[2] = (in[22] >> 16 | in[23] << 16) >> shifts_1st[2]; ind[3] = in[23] >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 24; return in; } inline static const uint32_t* unpack25_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1ffffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 0}; uint32_t shifts_2nd[4] = {4, 0, 0, 0}; uint32_t shifts_3rd[4] = {0, 1, 0, 0}; uint32_t shifts_4th[4] = {0, 5, 0, 0}; uint32_t shifts_5th[4] = {0, 0, 2, 0}; uint32_t shifts_6th[4] = {0, 0, 6, 0}; uint32_t shifts_7th[4] = {0, 0, 0, 3}; uint32_t shifts_8th[4] = {0, 0, 0, 7}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 25 | in[1] << 7) >> shifts_1st[1]; ind[2] = (in[1] >> 18 | in[2] << 14) >> shifts_1st[2]; ind[3] = (in[2] >> 11 | in[3] << 21) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = in[3] >> shifts_2nd[0]; ind[1] = (in[3] >> 29 | in[4] << 3) >> shifts_2nd[1]; ind[2] = (in[4] >> 22 | in[5] << 10) >> shifts_2nd[2]; ind[3] = (in[5] >> 15 | in[6] << 17) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[6] >> 8 | in[7] << 24) >> shifts_3rd[0]; ind[1] = in[7] >> shifts_3rd[1]; ind[2] = (in[7] >> 26 | in[8] << 6) >> shifts_3rd[2]; ind[3] = (in[8] >> 19 | in[9] << 13) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[9] >> 12 | in[10] << 20) >> shifts_4th[0]; ind[1] = in[10] >> shifts_4th[1]; ind[2] = (in[10] >> 30 | in[11] << 2) >> shifts_4th[2]; ind[3] = (in[11] >> 23 | in[12] << 9) >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = (in[12] >> 16 | in[13] << 16) >> shifts_5th[0]; ind[1] = (in[13] >> 9 | in[14] << 23) >> shifts_5th[1]; ind[2] = in[14] >> shifts_5th[2]; ind[3] = (in[14] >> 27 | in[15] << 5) >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[15] >> 20 | in[16] << 12) >> shifts_6th[0]; ind[1] = (in[16] >> 13 | in[17] << 19) >> shifts_6th[1]; ind[2] = in[17] >> shifts_6th[2]; ind[3] = (in[17] >> 31 | in[18] << 1) >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[18] >> 24 | in[19] << 8) >> shifts_7th[0]; ind[1] = (in[19] >> 17 | in[20] << 15) >> shifts_7th[1]; ind[2] = (in[20] >> 10 | in[21] << 22) >> shifts_7th[2]; ind[3] = in[21] >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[21] >> 28 | in[22] << 4) >> shifts_8th[0]; ind[1] = (in[22] >> 21 | in[23] << 11) >> shifts_8th[1]; ind[2] = (in[23] >> 14 | in[24] << 18) >> shifts_8th[2]; ind[3] = in[24] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 25; return in; } inline static const uint32_t* unpack26_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3ffffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 0}; uint32_t shifts_2nd[4] = {0, 2, 0, 0}; uint32_t shifts_3rd[4] = {0, 0, 4, 0}; uint32_t shifts_4th[4] = {0, 0, 0, 6}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 26 | in[1] << 6) >> shifts_1st[1]; ind[2] = (in[1] >> 20 | in[2] << 12) >> shifts_1st[2]; ind[3] = (in[2] >> 14 | in[3] << 18) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[3] >> 8 | in[4] << 24) >> shifts_2nd[0]; ind[1] = in[4] >> shifts_2nd[1]; ind[2] = (in[4] >> 28 | in[5] << 4) >> shifts_2nd[2]; ind[3] = (in[5] >> 22 | in[6] << 10) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[6] >> 16 | in[7] << 16) >> shifts_3rd[0]; ind[1] = (in[7] >> 10 | in[8] << 22) >> shifts_3rd[1]; ind[2] = in[8] >> shifts_3rd[2]; ind[3] = (in[8] >> 30 | in[9] << 2) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[9] >> 24 | in[10] << 8) >> shifts_4th[0]; ind[1] = (in[10] >> 18 | in[11] << 14) >> shifts_4th[1]; ind[2] = (in[11] >> 12 | in[12] << 20) >> shifts_4th[2]; ind[3] = in[12] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[13] >> shifts_1st[0]; ind[1] = (in[13] >> 26 | in[14] << 6) >> shifts_1st[1]; ind[2] = (in[14] >> 20 | in[15] << 12) >> shifts_1st[2]; ind[3] = (in[15] >> 14 | in[16] << 18) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[16] >> 8 | in[17] << 24) >> shifts_2nd[0]; ind[1] = in[17] >> shifts_2nd[1]; ind[2] = (in[17] >> 28 | in[18] << 4) >> shifts_2nd[2]; ind[3] = (in[18] >> 22 | in[19] << 10) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[19] >> 16 | in[20] << 16) >> shifts_3rd[0]; ind[1] = (in[20] >> 10 | in[21] << 22) >> shifts_3rd[1]; ind[2] = in[21] >> shifts_3rd[2]; ind[3] = (in[21] >> 30 | in[22] << 2) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[22] >> 24 | in[23] << 8) >> shifts_4th[0]; ind[1] = (in[23] >> 18 | in[24] << 14) >> shifts_4th[1]; ind[2] = (in[24] >> 12 | in[25] << 20) >> shifts_4th[2]; ind[3] = in[25] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 26; return in; } inline static const uint32_t* unpack27_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7ffffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 0}; uint32_t shifts_2nd[4] = {0, 0, 2, 0}; uint32_t shifts_3rd[4] = {0, 0, 0, 0}; uint32_t shifts_4th[4] = {4, 0, 0, 0}; uint32_t shifts_5th[4] = {0, 0, 0, 1}; uint32_t shifts_6th[4] = {0, 0, 0, 0}; uint32_t shifts_7th[4] = {0, 3, 0, 0}; uint32_t shifts_8th[4] = {0, 0, 0, 5}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 27 | in[1] << 5) >> shifts_1st[1]; ind[2] = (in[1] >> 22 | in[2] << 10) >> shifts_1st[2]; ind[3] = (in[2] >> 17 | in[3] << 15) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[3] >> 12 | in[4] << 20) >> shifts_2nd[0]; ind[1] = (in[4] >> 7 | in[5] << 25) >> shifts_2nd[1]; ind[2] = in[5] >> shifts_2nd[2]; ind[3] = (in[5] >> 29 | in[6] << 3) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[6] >> 24 | in[7] << 8) >> shifts_3rd[0]; ind[1] = (in[7] >> 19 | in[8] << 13) >> shifts_3rd[1]; ind[2] = (in[8] >> 14 | in[9] << 18) >> shifts_3rd[2]; ind[3] = (in[9] >> 9 | in[10] << 23) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = in[10] >> shifts_4th[0]; ind[1] = (in[10] >> 31 | in[11] << 1) >> shifts_4th[1]; ind[2] = (in[11] >> 26 | in[12] << 6) >> shifts_4th[2]; ind[3] = (in[12] >> 21 | in[13] << 11) >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = (in[13] >> 16 | in[14] << 16) >> shifts_5th[0]; ind[1] = (in[14] >> 11 | in[15] << 21) >> shifts_5th[1]; ind[2] = (in[15] >> 6 | in[16] << 26) >> shifts_5th[2]; ind[3] = in[16] >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[16] >> 28 | in[17] << 4) >> shifts_6th[0]; ind[1] = (in[17] >> 23 | in[18] << 9) >> shifts_6th[1]; ind[2] = (in[18] >> 18 | in[19] << 14) >> shifts_6th[2]; ind[3] = (in[19] >> 13 | in[20] << 19) >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[20] >> 8 | in[21] << 24) >> shifts_7th[0]; ind[1] = in[21] >> shifts_7th[1]; ind[2] = (in[21] >> 30 | in[22] << 2) >> shifts_7th[2]; ind[3] = (in[22] >> 25 | in[23] << 7) >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[23] >> 20 | in[24] << 12) >> shifts_8th[0]; ind[1] = (in[24] >> 15 | in[25] << 17) >> shifts_8th[1]; ind[2] = (in[25] >> 10 | in[26] << 22) >> shifts_8th[2]; ind[3] = in[26] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 27; return in; } inline static const uint32_t* unpack28_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0xfffffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 0}; uint32_t shifts_2nd[4] = {0, 0, 0, 4}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 28 | in[1] << 4) >> shifts_1st[1]; ind[2] = (in[1] >> 24 | in[2] << 8) >> shifts_1st[2]; ind[3] = (in[2] >> 20 | in[3] << 12) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[3] >> 16 | in[4] << 16) >> shifts_2nd[0]; ind[1] = (in[4] >> 12 | in[5] << 20) >> shifts_2nd[1]; ind[2] = (in[5] >> 8 | in[6] << 24) >> shifts_2nd[2]; ind[3] = in[6] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = in[7] >> shifts_1st[0]; ind[1] = (in[7] >> 28 | in[8] << 4) >> shifts_1st[1]; ind[2] = (in[8] >> 24 | in[9] << 8) >> shifts_1st[2]; ind[3] = (in[9] >> 20 | in[10] << 12) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[10] >> 16 | in[11] << 16) >> shifts_2nd[0]; ind[1] = (in[11] >> 12 | in[12] << 20) >> shifts_2nd[1]; ind[2] = (in[12] >> 8 | in[13] << 24) >> shifts_2nd[2]; ind[3] = in[13] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[14] >> shifts_1st[0]; ind[1] = (in[14] >> 28 | in[15] << 4) >> shifts_1st[1]; ind[2] = (in[15] >> 24 | in[16] << 8) >> shifts_1st[2]; ind[3] = (in[16] >> 20 | in[17] << 12) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[17] >> 16 | in[18] << 16) >> shifts_2nd[0]; ind[1] = (in[18] >> 12 | in[19] << 20) >> shifts_2nd[1]; ind[2] = (in[19] >> 8 | in[20] << 24) >> shifts_2nd[2]; ind[3] = in[20] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = in[21] >> shifts_1st[0]; ind[1] = (in[21] >> 28 | in[22] << 4) >> shifts_1st[1]; ind[2] = (in[22] >> 24 | in[23] << 8) >> shifts_1st[2]; ind[3] = (in[23] >> 20 | in[24] << 12) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[24] >> 16 | in[25] << 16) >> shifts_2nd[0]; ind[1] = (in[25] >> 12 | in[26] << 20) >> shifts_2nd[1]; ind[2] = (in[26] >> 8 | in[27] << 24) >> shifts_2nd[2]; ind[3] = in[27] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 28; return in; } inline static const uint32_t* unpack29_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x1fffffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 0}; uint32_t shifts_2nd[4] = {0, 0, 0, 0}; uint32_t shifts_3rd[4] = {0, 0, 2, 0}; uint32_t shifts_4th[4] = {0, 0, 0, 0}; uint32_t shifts_5th[4] = {0, 0, 0, 0}; uint32_t shifts_6th[4] = {0, 1, 0, 0}; uint32_t shifts_7th[4] = {0, 0, 0, 0}; uint32_t shifts_8th[4] = {0, 0, 0, 3}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 29 | in[1] << 3) >> shifts_1st[1]; ind[2] = (in[1] >> 26 | in[2] << 6) >> shifts_1st[2]; ind[3] = (in[2] >> 23 | in[3] << 9) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[3] >> 20 | in[4] << 12) >> shifts_2nd[0]; ind[1] = (in[4] >> 17 | in[5] << 15) >> shifts_2nd[1]; ind[2] = (in[5] >> 14 | in[6] << 18) >> shifts_2nd[2]; ind[3] = (in[6] >> 11 | in[7] << 21) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[7] >> 8 | in[8] << 24) >> shifts_3rd[0]; ind[1] = (in[8] >> 5 | in[9] << 27) >> shifts_3rd[1]; ind[2] = in[9] >> shifts_3rd[2]; ind[3] = (in[9] >> 31 | in[10] << 1) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[10] >> 28 | in[11] << 4) >> shifts_4th[0]; ind[1] = (in[11] >> 25 | in[12] << 7) >> shifts_4th[1]; ind[2] = (in[12] >> 22 | in[13] << 10) >> shifts_4th[2]; ind[3] = (in[13] >> 19 | in[14] << 13) >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = (in[14] >> 16 | in[15] << 16) >> shifts_5th[0]; ind[1] = (in[15] >> 13 | in[16] << 19) >> shifts_5th[1]; ind[2] = (in[16] >> 10 | in[17] << 22) >> shifts_5th[2]; ind[3] = (in[17] >> 7 | in[18] << 25) >> shifts_5th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[18] >> 4 | in[19] << 28) >> shifts_6th[0]; ind[1] = in[19] >> shifts_6th[1]; ind[2] = (in[19] >> 30 | in[20] << 2) >> shifts_6th[2]; ind[3] = (in[20] >> 27 | in[21] << 5) >> shifts_6th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[21] >> 24 | in[22] << 8) >> shifts_7th[0]; ind[1] = (in[22] >> 21 | in[23] << 11) >> shifts_7th[1]; ind[2] = (in[23] >> 18 | in[24] << 14) >> shifts_7th[2]; ind[3] = (in[24] >> 15 | in[25] << 17) >> shifts_7th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[25] >> 12 | in[26] << 20) >> shifts_8th[0]; ind[1] = (in[26] >> 9 | in[27] << 23) >> shifts_8th[1]; ind[2] = (in[27] >> 6 | in[28] << 26) >> shifts_8th[2]; ind[3] = in[28] >> shifts_8th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 29; return in; } inline static const uint32_t* unpack30_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x3fffffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 0}; uint32_t shifts_2nd[4] = {0, 0, 0, 0}; uint32_t shifts_3rd[4] = {0, 0, 0, 0}; uint32_t shifts_4th[4] = {0, 0, 0, 2}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 30 | in[1] << 2) >> shifts_1st[1]; ind[2] = (in[1] >> 28 | in[2] << 4) >> shifts_1st[2]; ind[3] = (in[2] >> 26 | in[3] << 6) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[3] >> 24 | in[4] << 8) >> shifts_2nd[0]; ind[1] = (in[4] >> 22 | in[5] << 10) >> shifts_2nd[1]; ind[2] = (in[5] >> 20 | in[6] << 12) >> shifts_2nd[2]; ind[3] = (in[6] >> 18 | in[7] << 14) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[7] >> 16 | in[8] << 16) >> shifts_3rd[0]; ind[1] = (in[8] >> 14 | in[9] << 18) >> shifts_3rd[1]; ind[2] = (in[9] >> 12 | in[10] << 20) >> shifts_3rd[2]; ind[3] = (in[10] >> 10 | in[11] << 22) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[11] >> 8 | in[12] << 24) >> shifts_4th[0]; ind[1] = (in[12] >> 6 | in[13] << 26) >> shifts_4th[1]; ind[2] = (in[13] >> 4 | in[14] << 28) >> shifts_4th[2]; ind[3] = in[14] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = in[15] >> shifts_1st[0]; ind[1] = (in[15] >> 30 | in[16] << 2) >> shifts_1st[1]; ind[2] = (in[16] >> 28 | in[17] << 4) >> shifts_1st[2]; ind[3] = (in[17] >> 26 | in[18] << 6) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[18] >> 24 | in[19] << 8) >> shifts_2nd[0]; ind[1] = (in[19] >> 22 | in[20] << 10) >> shifts_2nd[1]; ind[2] = (in[20] >> 20 | in[21] << 12) >> shifts_2nd[2]; ind[3] = (in[21] >> 18 | in[22] << 14) >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[22] >> 16 | in[23] << 16) >> shifts_3rd[0]; ind[1] = (in[23] >> 14 | in[24] << 18) >> shifts_3rd[1]; ind[2] = (in[24] >> 12 | in[25] << 20) >> shifts_3rd[2]; ind[3] = (in[25] >> 10 | in[26] << 22) >> shifts_3rd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[26] >> 8 | in[27] << 24) >> shifts_4th[0]; ind[1] = (in[27] >> 6 | in[28] << 26) >> shifts_4th[1]; ind[2] = (in[28] >> 4 | in[29] << 28) >> shifts_4th[2]; ind[3] = in[29] >> shifts_4th[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 30; return in; } inline static const uint32_t* unpack31_32_neon(const uint32_t* in, uint32_t* out) { uint32_t mask = 0x7fffffff; uint32_t ind[4]; uint32_t shifts_1st[4] = {0, 0, 0, 0}; uint32_t shifts_2nd[4] = {0, 0, 0, 1}; uint32x4_t reg_shift, reg_masks; uint32x4_t results; reg_masks = vdupq_n_u32(mask); // shift the first 4 outs ind[0] = in[0] >> shifts_1st[0]; ind[1] = (in[0] >> 31 | in[1] << 1) >> shifts_1st[1]; ind[2] = (in[1] >> 30 | in[2] << 2) >> shifts_1st[2]; ind[3] = (in[2] >> 29 | in[3] << 3) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 2nd 4 outs ind[0] = (in[3] >> 28 | in[4] << 4) >> shifts_1st[0]; ind[1] = (in[4] >> 27 | in[5] << 5) >> shifts_1st[1]; ind[2] = (in[5] >> 26 | in[6] << 6) >> shifts_1st[2]; ind[3] = (in[6] >> 25 | in[7] << 7) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 3rd 4 outs ind[0] = (in[7] >> 24 | in[8] << 8) >> shifts_1st[0]; ind[1] = (in[8] >> 23 | in[9] << 9) >> shifts_1st[1]; ind[2] = (in[9] >> 22 | in[10] << 10) >> shifts_1st[2]; ind[3] = (in[10] >> 21 | in[11] << 11) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 4th 4 outs ind[0] = (in[11] >> 20 | in[12] << 12) >> shifts_1st[0]; ind[1] = (in[12] >> 19 | in[13] << 13) >> shifts_1st[1]; ind[2] = (in[13] >> 18 | in[14] << 14) >> shifts_1st[2]; ind[3] = (in[14] >> 17 | in[15] << 15) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 5th 4 outs ind[0] = (in[15] >> 16 | in[16] << 16) >> shifts_1st[0]; ind[1] = (in[16] >> 15 | in[17] << 17) >> shifts_1st[1]; ind[2] = (in[17] >> 14 | in[18] << 18) >> shifts_1st[2]; ind[3] = (in[18] >> 13 | in[19] << 19) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 6th 4 outs ind[0] = (in[19] >> 12 | in[20] << 20) >> shifts_1st[0]; ind[1] = (in[20] >> 11 | in[21] << 21) >> shifts_1st[1]; ind[2] = (in[21] >> 10 | in[22] << 22) >> shifts_1st[2]; ind[3] = (in[22] >> 9 | in[23] << 23) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 7th 4 outs ind[0] = (in[23] >> 8 | in[24] << 24) >> shifts_1st[0]; ind[1] = (in[24] >> 7 | in[25] << 25) >> shifts_1st[1]; ind[2] = (in[25] >> 6 | in[26] << 26) >> shifts_1st[2]; ind[3] = (in[26] >> 5 | in[27] << 27) >> shifts_1st[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; // shift the 8th 4 outs ind[0] = (in[27] >> 4 | in[28] << 28) >> shifts_2nd[0]; ind[1] = (in[28] >> 3 | in[29] << 29) >> shifts_2nd[1]; ind[2] = (in[29] >> 2 | in[30] << 30) >> shifts_2nd[2]; ind[3] = in[30] >> shifts_2nd[3]; reg_shift = vld1q_u32(ind); results = vandq_u32(reg_shift, reg_masks); vst1q_u32(out, results); out += 4; in += 31; return in; } inline const uint32_t* unpack32_32_neon(const uint32_t* in, uint32_t* out) { for (const uint32_t* end = out + 32; out != end; out++) { *out = *in; in++; } return in; } int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; switch (num_bits) { case 0: for (int i = 0; i < num_loops; ++i) in = unpack0_32_neon(in, out + i * 32); break; case 1: for (int i = 0; i < num_loops; ++i) in = unpack1_32_neon(in, out + i * 32); break; case 2: for (int i = 0; i < num_loops; ++i) in = unpack2_32_neon(in, out + i * 32); break; case 3: for (int i = 0; i < num_loops; ++i) in = unpack3_32_neon(in, out + i * 32); break; case 4: for (int i = 0; i < num_loops; ++i) in = unpack4_32_neon(in, out + i * 32); break; case 5: for (int i = 0; i < num_loops; ++i) in = unpack5_32_neon(in, out + i * 32); break; case 6: for (int i = 0; i < num_loops; ++i) in = unpack6_32_neon(in, out + i * 32); break; case 7: for (int i = 0; i < num_loops; ++i) in = unpack7_32_neon(in, out + i * 32); break; case 8: for (int i = 0; i < num_loops; ++i) in = unpack8_32_neon(in, out + i * 32); break; case 9: for (int i = 0; i < num_loops; ++i) in = unpack9_32_neon(in, out + i * 32); break; case 10: for (int i = 0; i < num_loops; ++i) in = unpack10_32_neon(in, out + i * 32); break; case 11: for (int i = 0; i < num_loops; ++i) in = unpack11_32_neon(in, out + i * 32); break; case 12: for (int i = 0; i < num_loops; ++i) in = unpack12_32_neon(in, out + i * 32); break; case 13: for (int i = 0; i < num_loops; ++i) in = unpack13_32_neon(in, out + i * 32); break; case 14: for (int i = 0; i < num_loops; ++i) in = unpack14_32_neon(in, out + i * 32); break; case 15: for (int i = 0; i < num_loops; ++i) in = unpack15_32_neon(in, out + i * 32); break; case 16: for (int i = 0; i < num_loops; ++i) in = unpack16_32_neon(in, out + i * 32); break; case 17: for (int i = 0; i < num_loops; ++i) in = unpack17_32_neon(in, out + i * 32); break; case 18: for (int i = 0; i < num_loops; ++i) in = unpack18_32_neon(in, out + i * 32); break; case 19: for (int i = 0; i < num_loops; ++i) in = unpack19_32_neon(in, out + i * 32); break; case 20: for (int i = 0; i < num_loops; ++i) in = unpack20_32_neon(in, out + i * 32); break; case 21: for (int i = 0; i < num_loops; ++i) in = unpack21_32_neon(in, out + i * 32); break; case 22: for (int i = 0; i < num_loops; ++i) in = unpack22_32_neon(in, out + i * 32); break; case 23: for (int i = 0; i < num_loops; ++i) in = unpack23_32_neon(in, out + i * 32); break; case 24: for (int i = 0; i < num_loops; ++i) in = unpack24_32_neon(in, out + i * 32); break; case 25: for (int i = 0; i < num_loops; ++i) in = unpack25_32_neon(in, out + i * 32); break; case 26: for (int i = 0; i < num_loops; ++i) in = unpack26_32_neon(in, out + i * 32); break; case 27: for (int i = 0; i < num_loops; ++i) in = unpack27_32_neon(in, out + i * 32); break; case 28: for (int i = 0; i < num_loops; ++i) in = unpack28_32_neon(in, out + i * 32); break; case 29: for (int i = 0; i < num_loops; ++i) in = unpack29_32_neon(in, out + i * 32); break; case 30: for (int i = 0; i < num_loops; ++i) in = unpack30_32_neon(in, out + i * 32); break; case 31: for (int i = 0; i < num_loops; ++i) in = unpack31_32_neon(in, out + i * 32); break; case 32: for (int i = 0; i < num_loops; ++i) in = unpack32_32_neon(in, out + i * 32); break; } return batch_size; }