in aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm/8x8-neon.c [14:1231]
void pytorch_q8gemm_ukernel_8x8__neon(
size_t mr,
size_t nr,
size_t k,
const uint8_t* restrict a,
size_t a_stride,
const void* restrict w,
uint8_t* restrict c,
size_t c_stride,
size_t output_channel_index,
const union pytorch_qnnp_conv_quantization_params
quantization_params[restrict static 1]) {
int32x4_t vacc0x0123 = vld1q_s32(w);
w = (const void*)((uintptr_t)w + 16);
int32x4_t vacc0x4567 = vld1q_s32(w);
w = (const void*)((uintptr_t)w + 16);
int32x4_t vacc1x0123 = vacc0x0123;
int32x4_t vacc1x4567 = vacc0x4567;
int32x4_t vacc2x0123 = vacc0x0123;
int32x4_t vacc2x4567 = vacc0x4567;
int32x4_t vacc3x0123 = vacc0x0123;
int32x4_t vacc3x4567 = vacc0x4567;
int32x4_t vacc4x0123 = vacc0x0123;
int32x4_t vacc4x4567 = vacc0x4567;
int32x4_t vacc5x0123 = vacc0x0123;
int32x4_t vacc5x4567 = vacc0x4567;
int32x4_t vacc6x0123 = vacc0x0123;
int32x4_t vacc6x4567 = vacc0x4567;
int32x4_t vacc7x0123 = vacc0x0123;
int32x4_t vacc7x4567 = vacc0x4567;
const uint8_t* a0 = a;
const uint8_t* a1 = (const uint8_t*)((uintptr_t)a0 + a_stride);
if (mr < 2) {
a1 = a0;
}
const uint8_t* a2 = (const uint8_t*)((uintptr_t)a1 + a_stride);
if (mr <= 2) {
a2 = a1;
}
const uint8_t* a3 = (const uint8_t*)((uintptr_t)a2 + a_stride);
if (mr < 4) {
a3 = a2;
}
const uint8_t* a4 = (const uint8_t*)((uintptr_t)a3 + a_stride);
if (mr <= 4) {
a4 = a3;
}
const uint8_t* a5 = (const uint8_t*)((uintptr_t)a4 + a_stride);
if (mr < 6) {
a5 = a4;
}
const uint8_t* a6 = (const uint8_t*)((uintptr_t)a5 + a_stride);
if (mr <= 6) {
a6 = a5;
}
const uint8_t* a7 = (const uint8_t*)((uintptr_t)a6 + a_stride);
if (mr != 8) {
a7 = a6;
}
const uint8x8_t va_zero_point =
vld1_dup_u8((const uint8_t*)&quantization_params->neon.input_zero_point);
const uint8x8_t vb_zero_point =
vld1_u8((const uint8_t*)&quantization_params->neon.kernel_zero_points
[output_channel_index]);
for (; k >= 8; k -= 8) {
const uint8x8_t va0 = vld1_u8(a0);
const int16x8_t vxa0 =
vreinterpretq_s16_u16(sub_zero_point(va0, va_zero_point));
a0 += 8;
const uint8x8_t va1 = vld1_u8(a1);
const int16x8_t vxa1 =
vreinterpretq_s16_u16(sub_zero_point(va1, va_zero_point));
a1 += 8;
const uint8x8_t va2 = vld1_u8(a2);
const int16x8_t vxa2 =
vreinterpretq_s16_u16(sub_zero_point(va2, va_zero_point));
a2 += 8;
const uint8x8_t va3 = vld1_u8(a3);
const int16x8_t vxa3 =
vreinterpretq_s16_u16(sub_zero_point(va3, va_zero_point));
a3 += 8;
const uint8x8_t va4 = vld1_u8(a4);
const int16x8_t vxa4 =
vreinterpretq_s16_u16(sub_zero_point(va4, va_zero_point));
a4 += 8;
const uint8x8_t va5 = vld1_u8(a5);
const int16x8_t vxa5 =
vreinterpretq_s16_u16(sub_zero_point(va5, va_zero_point));
a5 += 8;
const uint8x8_t va6 = vld1_u8(a6);
const int16x8_t vxa6 =
vreinterpretq_s16_u16(sub_zero_point(va6, va_zero_point));
a6 += 8;
const uint8x8_t va7 = vld1_u8(a7);
const int16x8_t vxa7 =
vreinterpretq_s16_u16(sub_zero_point(va7, va_zero_point));
a7 += 8;
const uint8x8_t vb01234567c0 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c0 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa5), 0);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa5), 0);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa6), 0);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa6), 0);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa7), 0);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa7), 0);
const uint8x8_t vb01234567c1 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c1 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa5), 1);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa5), 1);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa6), 1);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa6), 1);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa7), 1);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa7), 1);
const uint8x8_t vb01234567c2 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c2 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa5), 2);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa5), 2);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa6), 2);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa6), 2);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa7), 2);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa7), 2);
const uint8x8_t vb01234567c3 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c3 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa5), 3);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa5), 3);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa6), 3);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa6), 3);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa7), 3);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa7), 3);
const uint8x8_t vb01234567c4 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c4 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa4), 0);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa4), 0);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa5), 0);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa5), 0);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa6), 0);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa6), 0);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa7), 0);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa7), 0);
const uint8x8_t vb01234567c5 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c5 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa4), 1);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa4), 1);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa5), 1);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa5), 1);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa6), 1);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa6), 1);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa7), 1);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa7), 1);
const uint8x8_t vb01234567c6 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c6 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa4), 2);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa4), 2);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa5), 2);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa5), 2);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa6), 2);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa6), 2);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa7), 2);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa7), 2);
const uint8x8_t vb01234567c7 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c7 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa4), 3);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa4), 3);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa5), 3);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa5), 3);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa6), 3);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa6), 3);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa7), 3);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa7), 3);
}
if (k != 0) {
const size_t a_predecrement = 8 - k;
const int64x1_t va_shift = vmov_n_s64(-8 * a_predecrement);
const uint8x8_t va0 = vreinterpret_u8_u64(
vshl_u64(vreinterpret_u64_u8(vld1_u8(a0 - a_predecrement)), va_shift));
const int16x8_t vxa0 =
vreinterpretq_s16_u16(sub_zero_point(va0, va_zero_point));
const uint8x8_t va1 = vreinterpret_u8_u64(
vshl_u64(vreinterpret_u64_u8(vld1_u8(a1 - a_predecrement)), va_shift));
const int16x8_t vxa1 =
vreinterpretq_s16_u16(sub_zero_point(va1, va_zero_point));
const uint8x8_t va2 = vreinterpret_u8_u64(
vshl_u64(vreinterpret_u64_u8(vld1_u8(a2 - a_predecrement)), va_shift));
const int16x8_t vxa2 =
vreinterpretq_s16_u16(sub_zero_point(va2, va_zero_point));
const uint8x8_t va3 = vreinterpret_u8_u64(
vshl_u64(vreinterpret_u64_u8(vld1_u8(a3 - a_predecrement)), va_shift));
const int16x8_t vxa3 =
vreinterpretq_s16_u16(sub_zero_point(va3, va_zero_point));
const uint8x8_t va4 = vreinterpret_u8_u64(
vshl_u64(vreinterpret_u64_u8(vld1_u8(a4 - a_predecrement)), va_shift));
const int16x8_t vxa4 =
vreinterpretq_s16_u16(sub_zero_point(va4, va_zero_point));
const uint8x8_t va5 = vreinterpret_u8_u64(
vshl_u64(vreinterpret_u64_u8(vld1_u8(a5 - a_predecrement)), va_shift));
const int16x8_t vxa5 =
vreinterpretq_s16_u16(sub_zero_point(va5, va_zero_point));
const uint8x8_t va6 = vreinterpret_u8_u64(
vshl_u64(vreinterpret_u64_u8(vld1_u8(a6 - a_predecrement)), va_shift));
const int16x8_t vxa6 =
vreinterpretq_s16_u16(sub_zero_point(va6, va_zero_point));
const uint8x8_t va7 = vreinterpret_u8_u64(
vshl_u64(vreinterpret_u64_u8(vld1_u8(a7 - a_predecrement)), va_shift));
const int16x8_t vxa7 =
vreinterpretq_s16_u16(sub_zero_point(va7, va_zero_point));
const uint8x8_t vb01234567c0 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c0 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa4), 0);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa4), 0);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa5), 0);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa5), 0);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa6), 0);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa6), 0);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa7), 0);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa7), 0);
if (k >= 2) {
const uint8x8_t vb01234567c1 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c1 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa4), 1);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa4), 1);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa5), 1);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa5), 1);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa6), 1);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa6), 1);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa7), 1);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa7), 1);
if (k >= 3) {
const uint8x8_t vb01234567c2 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c2 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa4), 2);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa4), 2);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa5), 2);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa5), 2);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa6), 2);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa6), 2);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa7), 2);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa7), 2);
if (k >= 4) {
const uint8x8_t vb01234567c3 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c3 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa4), 3);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa4), 3);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa5), 3);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa5), 3);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa6), 3);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa6), 3);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa7), 3);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa7), 3);
if (k >= 5) {
const uint8x8_t vb01234567c4 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c4 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123,
vget_low_s16(vxb01234567c4),
vget_high_s16(vxa0),
0);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567,
vget_high_s16(vxb01234567c4),
vget_high_s16(vxa0),
0);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123,
vget_low_s16(vxb01234567c4),
vget_high_s16(vxa1),
0);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567,
vget_high_s16(vxb01234567c4),
vget_high_s16(vxa1),
0);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123,
vget_low_s16(vxb01234567c4),
vget_high_s16(vxa2),
0);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567,
vget_high_s16(vxb01234567c4),
vget_high_s16(vxa2),
0);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123,
vget_low_s16(vxb01234567c4),
vget_high_s16(vxa3),
0);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567,
vget_high_s16(vxb01234567c4),
vget_high_s16(vxa3),
0);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123,
vget_low_s16(vxb01234567c4),
vget_high_s16(vxa4),
0);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567,
vget_high_s16(vxb01234567c4),
vget_high_s16(vxa4),
0);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123,
vget_low_s16(vxb01234567c4),
vget_high_s16(vxa5),
0);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567,
vget_high_s16(vxb01234567c4),
vget_high_s16(vxa5),
0);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123,
vget_low_s16(vxb01234567c4),
vget_high_s16(vxa6),
0);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567,
vget_high_s16(vxb01234567c4),
vget_high_s16(vxa6),
0);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123,
vget_low_s16(vxb01234567c4),
vget_high_s16(vxa7),
0);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567,
vget_high_s16(vxb01234567c4),
vget_high_s16(vxa7),
0);
if (k >= 6) {
const uint8x8_t vb01234567c5 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c5 =
vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123,
vget_low_s16(vxb01234567c5),
vget_high_s16(vxa0),
1);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567,
vget_high_s16(vxb01234567c5),
vget_high_s16(vxa0),
1);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123,
vget_low_s16(vxb01234567c5),
vget_high_s16(vxa1),
1);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567,
vget_high_s16(vxb01234567c5),
vget_high_s16(vxa1),
1);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123,
vget_low_s16(vxb01234567c5),
vget_high_s16(vxa2),
1);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567,
vget_high_s16(vxb01234567c5),
vget_high_s16(vxa2),
1);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123,
vget_low_s16(vxb01234567c5),
vget_high_s16(vxa3),
1);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567,
vget_high_s16(vxb01234567c5),
vget_high_s16(vxa3),
1);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123,
vget_low_s16(vxb01234567c5),
vget_high_s16(vxa4),
1);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567,
vget_high_s16(vxb01234567c5),
vget_high_s16(vxa4),
1);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123,
vget_low_s16(vxb01234567c5),
vget_high_s16(vxa5),
1);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567,
vget_high_s16(vxb01234567c5),
vget_high_s16(vxa5),
1);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123,
vget_low_s16(vxb01234567c5),
vget_high_s16(vxa6),
1);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567,
vget_high_s16(vxb01234567c5),
vget_high_s16(vxa6),
1);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123,
vget_low_s16(vxb01234567c5),
vget_high_s16(vxa7),
1);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567,
vget_high_s16(vxb01234567c5),
vget_high_s16(vxa7),
1);
if (k >= 7) {
const uint8x8_t vb01234567c6 = vld1_u8(w);
w = (const void*)((uintptr_t)w + 8);
const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(
vsubl_u8(vb01234567c6, vb_zero_point));
vacc0x0123 = vmlal_lane_s16(
vacc0x0123,
vget_low_s16(vxb01234567c6),
vget_high_s16(vxa0),
2);
vacc0x4567 = vmlal_lane_s16(
vacc0x4567,
vget_high_s16(vxb01234567c6),
vget_high_s16(vxa0),
2);
vacc1x0123 = vmlal_lane_s16(
vacc1x0123,
vget_low_s16(vxb01234567c6),
vget_high_s16(vxa1),
2);
vacc1x4567 = vmlal_lane_s16(
vacc1x4567,
vget_high_s16(vxb01234567c6),
vget_high_s16(vxa1),
2);
vacc2x0123 = vmlal_lane_s16(
vacc2x0123,
vget_low_s16(vxb01234567c6),
vget_high_s16(vxa2),
2);
vacc2x4567 = vmlal_lane_s16(
vacc2x4567,
vget_high_s16(vxb01234567c6),
vget_high_s16(vxa2),
2);
vacc3x0123 = vmlal_lane_s16(
vacc3x0123,
vget_low_s16(vxb01234567c6),
vget_high_s16(vxa3),
2);
vacc3x4567 = vmlal_lane_s16(
vacc3x4567,
vget_high_s16(vxb01234567c6),
vget_high_s16(vxa3),
2);
vacc4x0123 = vmlal_lane_s16(
vacc4x0123,
vget_low_s16(vxb01234567c6),
vget_high_s16(vxa4),
2);
vacc4x4567 = vmlal_lane_s16(
vacc4x4567,
vget_high_s16(vxb01234567c6),
vget_high_s16(vxa4),
2);
vacc5x0123 = vmlal_lane_s16(
vacc5x0123,
vget_low_s16(vxb01234567c6),
vget_high_s16(vxa5),
2);
vacc5x4567 = vmlal_lane_s16(
vacc5x4567,
vget_high_s16(vxb01234567c6),
vget_high_s16(vxa5),
2);
vacc6x0123 = vmlal_lane_s16(
vacc6x0123,
vget_low_s16(vxb01234567c6),
vget_high_s16(vxa6),
2);
vacc6x4567 = vmlal_lane_s16(
vacc6x4567,
vget_high_s16(vxb01234567c6),
vget_high_s16(vxa6),
2);
vacc7x0123 = vmlal_lane_s16(
vacc7x0123,
vget_low_s16(vxb01234567c6),
vget_high_s16(vxa7),
2);
vacc7x4567 = vmlal_lane_s16(
vacc7x4567,
vget_high_s16(vxb01234567c6),
vget_high_s16(vxa7),
2);
}
}
}
}
}
}
}
const float32x4_t requantization_scale_c0123 =
vld1q_f32(
&quantization_params->neon.requantization_scales[output_channel_index]
);
const float32x4_t requantization_scale_c4567 =
vld1q_f32(
&quantization_params->neon.requantization_scales[
output_channel_index + 4]);
const float32x4_t vacc0x0123_f =
vmulq_f32(vcvtq_f32_s32(vacc0x0123), requantization_scale_c0123);
const float32x4_t vacc1x0123_f =
vmulq_f32(vcvtq_f32_s32(vacc1x0123), requantization_scale_c0123);
const float32x4_t vacc2x0123_f =
vmulq_f32(vcvtq_f32_s32(vacc2x0123), requantization_scale_c0123);
const float32x4_t vacc3x0123_f =
vmulq_f32(vcvtq_f32_s32(vacc3x0123), requantization_scale_c0123);
const float32x4_t vacc0x4567_f =
vmulq_f32(vcvtq_f32_s32(vacc0x4567), requantization_scale_c4567);
const float32x4_t vacc1x4567_f =
vmulq_f32(vcvtq_f32_s32(vacc1x4567), requantization_scale_c4567);
const float32x4_t vacc2x4567_f =
vmulq_f32(vcvtq_f32_s32(vacc2x4567), requantization_scale_c4567);
const float32x4_t vacc3x4567_f =
vmulq_f32(vcvtq_f32_s32(vacc3x4567), requantization_scale_c4567);
const float32x4_t vacc4x0123_f =
vmulq_f32(vcvtq_f32_s32(vacc4x0123), requantization_scale_c0123);
const float32x4_t vacc5x0123_f =
vmulq_f32(vcvtq_f32_s32(vacc5x0123), requantization_scale_c0123);
const float32x4_t vacc6x0123_f =
vmulq_f32(vcvtq_f32_s32(vacc6x0123), requantization_scale_c0123);
const float32x4_t vacc7x0123_f =
vmulq_f32(vcvtq_f32_s32(vacc7x0123), requantization_scale_c0123);
const float32x4_t vacc4x4567_f =
vmulq_f32(vcvtq_f32_s32(vacc4x4567), requantization_scale_c4567);
const float32x4_t vacc5x4567_f =
vmulq_f32(vcvtq_f32_s32(vacc5x4567), requantization_scale_c4567);
const float32x4_t vacc6x4567_f =
vmulq_f32(vcvtq_f32_s32(vacc6x4567), requantization_scale_c4567);
const float32x4_t vacc7x4567_f =
vmulq_f32(vcvtq_f32_s32(vacc7x4567), requantization_scale_c4567);
#ifdef __aarch64__
const int16x8_t voutput_zero_point =
vld1q_dup_s16(&quantization_params->neon.output_zero_point);
vacc0x0123 = vcvtnq_s32_f32(vacc0x0123_f);
vacc1x0123 = vcvtnq_s32_f32(vacc1x0123_f);
vacc2x0123 = vcvtnq_s32_f32(vacc2x0123_f);
vacc3x0123 = vcvtnq_s32_f32(vacc3x0123_f);
vacc0x4567 = vcvtnq_s32_f32(vacc0x4567_f);
vacc1x4567 = vcvtnq_s32_f32(vacc1x4567_f);
vacc2x4567 = vcvtnq_s32_f32(vacc2x4567_f);
vacc3x4567 = vcvtnq_s32_f32(vacc3x4567_f);
vacc4x0123 = vcvtnq_s32_f32(vacc4x0123_f);
vacc5x0123 = vcvtnq_s32_f32(vacc5x0123_f);
vacc6x0123 = vcvtnq_s32_f32(vacc6x0123_f);
vacc7x0123 = vcvtnq_s32_f32(vacc7x0123_f);
vacc4x4567 = vcvtnq_s32_f32(vacc4x4567_f);
vacc5x4567 = vcvtnq_s32_f32(vacc5x4567_f);
vacc6x4567 = vcvtnq_s32_f32(vacc6x4567_f);
vacc7x4567 = vcvtnq_s32_f32(vacc7x4567_f);
const int16x8_t vacc0x01234567 = vqaddq_s16(
vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
const int16x8_t vacc1x01234567 = vqaddq_s16(
vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
const int16x8_t vacc2x01234567 = vqaddq_s16(
vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
const int16x8_t vacc3x01234567 = vqaddq_s16(
vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
const int16x8_t vacc4x01234567 = vqaddq_s16(
vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
const int16x8_t vacc5x01234567 = vqaddq_s16(
vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);
const int16x8_t vacc6x01234567 = vqaddq_s16(
vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), voutput_zero_point);
const int16x8_t vacc7x01234567 = vqaddq_s16(
vqmovn_high_s32(vqmovn_s32(vacc7x0123), vacc7x4567), voutput_zero_point);
uint8x16_t vout0x01234567_1x01234567 =
vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567);
uint8x16_t vout2x01234567_3x01234567 =
vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc3x01234567);
uint8x16_t vout4x01234567_5x01234567 =
vqmovun_high_s16(vqmovun_s16(vacc4x01234567), vacc5x01234567);
uint8x16_t vout6x01234567_7x01234567 =
vqmovun_high_s16(vqmovun_s16(vacc6x01234567), vacc7x01234567);
const uint8x16_t voutput_min =
vld1q_dup_u8(&quantization_params->neon.output_min);
const uint8x16_t voutput_max =
vld1q_dup_u8(&quantization_params->neon.output_max);
vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min);
vout2x01234567_3x01234567 = vmaxq_u8(vout2x01234567_3x01234567, voutput_min);
vout4x01234567_5x01234567 = vmaxq_u8(vout4x01234567_5x01234567, voutput_min);
vout6x01234567_7x01234567 = vmaxq_u8(vout6x01234567_7x01234567, voutput_min);
vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max);
vout2x01234567_3x01234567 = vminq_u8(vout2x01234567_3x01234567, voutput_max);
vout4x01234567_5x01234567 = vminq_u8(vout4x01234567_5x01234567, voutput_max);
vout6x01234567_7x01234567 = vminq_u8(vout6x01234567_7x01234567, voutput_max);
#else
const float32x4_t vfmin = vdupq_n_f32(quantization_params->neon.vfmin);
const float32x4_t vfmax = vdupq_n_f32(quantization_params->neon.vfmax);
const float32x4_t vfmagic = vdupq_n_f32(quantization_params->neon.vfmagic);
const int32x4_t vimagic = vdupq_n_s32(quantization_params->neon.vimagic);
const float32x4_t vacc0x0123_f_clamped =
vminq_f32(vmaxq_f32(vacc0x0123_f, vfmin), vfmax);
const float32x4_t vacc1x0123_f_clamped =
vminq_f32(vmaxq_f32(vacc1x0123_f, vfmin), vfmax);
const float32x4_t vacc2x0123_f_clamped =
vminq_f32(vmaxq_f32(vacc2x0123_f, vfmin), vfmax);
const float32x4_t vacc3x0123_f_clamped =
vminq_f32(vmaxq_f32(vacc3x0123_f, vfmin), vfmax);
const float32x4_t vacc0x4567_f_clamped =
vminq_f32(vmaxq_f32(vacc0x4567_f, vfmin), vfmax);
const float32x4_t vacc1x4567_f_clamped =
vminq_f32(vmaxq_f32(vacc1x4567_f, vfmin), vfmax);
const float32x4_t vacc2x4567_f_clamped =
vminq_f32(vmaxq_f32(vacc2x4567_f, vfmin), vfmax);
const float32x4_t vacc3x4567_f_clamped =
vminq_f32(vmaxq_f32(vacc3x4567_f, vfmin), vfmax);
const float32x4_t vacc4x0123_f_clamped =
vminq_f32(vmaxq_f32(vacc4x0123_f, vfmin), vfmax);
const float32x4_t vacc5x0123_f_clamped =
vminq_f32(vmaxq_f32(vacc5x0123_f, vfmin), vfmax);
const float32x4_t vacc6x0123_f_clamped =
vminq_f32(vmaxq_f32(vacc6x0123_f, vfmin), vfmax);
const float32x4_t vacc7x0123_f_clamped =
vminq_f32(vmaxq_f32(vacc7x0123_f, vfmin), vfmax);
const float32x4_t vacc4x4567_f_clamped =
vminq_f32(vmaxq_f32(vacc4x4567_f, vfmin), vfmax);
const float32x4_t vacc5x4567_f_clamped =
vminq_f32(vmaxq_f32(vacc5x4567_f, vfmin), vfmax);
const float32x4_t vacc6x4567_f_clamped =
vminq_f32(vmaxq_f32(vacc6x4567_f, vfmin), vfmax);
const float32x4_t vacc7x4567_f_clamped =
vminq_f32(vmaxq_f32(vacc7x4567_f, vfmin), vfmax);
vacc0x0123 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc0x0123_f_clamped, vfmagic)), vimagic);
vacc1x0123 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc1x0123_f_clamped, vfmagic)), vimagic);
vacc2x0123 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc2x0123_f_clamped, vfmagic)), vimagic);
vacc3x0123 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc3x0123_f_clamped, vfmagic)), vimagic);
vacc0x4567 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc0x4567_f_clamped, vfmagic)), vimagic);
vacc1x4567 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc1x4567_f_clamped, vfmagic)), vimagic);
vacc2x4567 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc2x4567_f_clamped, vfmagic)), vimagic);
vacc3x4567 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc3x4567_f_clamped, vfmagic)), vimagic);
vacc4x0123 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc4x0123_f_clamped, vfmagic)), vimagic);
vacc5x0123 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc5x0123_f_clamped, vfmagic)), vimagic);
vacc6x0123 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc6x0123_f_clamped, vfmagic)), vimagic);
vacc7x0123 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc7x0123_f_clamped, vfmagic)), vimagic);
vacc4x4567 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc4x4567_f_clamped, vfmagic)), vimagic);
vacc5x4567 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc5x4567_f_clamped, vfmagic)), vimagic);
vacc6x4567 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc6x4567_f_clamped, vfmagic)), vimagic);
vacc7x4567 = vsubq_s32(
vreinterpretq_s32_f32(vaddq_f32(vacc7x4567_f_clamped, vfmagic)), vimagic);
const int16x8_t vacc0x01234567 =
vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
const int16x8_t vacc1x01234567 =
vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
const int16x8_t vacc2x01234567 =
vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567));
const int16x8_t vacc3x01234567 =
vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567));
const int16x8_t vacc4x01234567 =
vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567));
const int16x8_t vacc5x01234567 =
vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567));
const int16x8_t vacc6x01234567 =
vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567));
const int16x8_t vacc7x01234567 =
vcombine_s16(vqmovn_s32(vacc7x0123), vqmovn_s32(vacc7x4567));
uint8x16_t vout0x01234567_1x01234567 =
vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567));
uint8x16_t vout2x01234567_3x01234567 =
vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc3x01234567));
uint8x16_t vout4x01234567_5x01234567 =
vcombine_u8(vqmovun_s16(vacc4x01234567), vqmovun_s16(vacc5x01234567));
uint8x16_t vout6x01234567_7x01234567 =
vcombine_u8(vqmovun_s16(vacc6x01234567), vqmovun_s16(vacc7x01234567));
#endif
uint8_t* c0 = c;
uint8_t* c1 = (uint8_t*)((uintptr_t)c0 + c_stride);
if (mr < 2) {
c1 = c0;
}
uint8_t* c2 = (uint8_t*)((uintptr_t)c1 + c_stride);
if (mr <= 2) {
c2 = c1;
}
uint8_t* c3 = (uint8_t*)((uintptr_t)c2 + c_stride);
if (mr < 4) {
c3 = c2;
}
uint8_t* c4 = (uint8_t*)((uintptr_t)c3 + c_stride);
if (mr <= 4) {
c4 = c3;
}
uint8_t* c5 = (uint8_t*)((uintptr_t)c4 + c_stride);
if (mr < 6) {
c5 = c4;
}
uint8_t* c6 = (uint8_t*)((uintptr_t)c5 + c_stride);
if (mr <= 6) {
c6 = c5;
}
uint8_t* c7 = (uint8_t*)((uintptr_t)c6 + c_stride);
if (mr != 8) {
c7 = c6;
}
if (nr == 8) {
vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567));
vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567));
vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567));
vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567));
vst1_u8(c4, vget_low_u8(vout4x01234567_5x01234567));
vst1_u8(c5, vget_high_u8(vout4x01234567_5x01234567));
vst1_u8(c6, vget_low_u8(vout6x01234567_7x01234567));
vst1_u8(c7, vget_high_u8(vout6x01234567_7x01234567));
} else {
if (nr >= 4) {
vst1q_lane_u32(
__builtin_assume_aligned(c0, 1),
vreinterpretq_u32_u8(vout0x01234567_1x01234567),
0);
c0 += 4;
vst1q_lane_u32(
__builtin_assume_aligned(c1, 1),
vreinterpretq_u32_u8(vout0x01234567_1x01234567),
2);
c1 += 4;
vst1q_lane_u32(
__builtin_assume_aligned(c2, 1),
vreinterpretq_u32_u8(vout2x01234567_3x01234567),
0);
c2 += 4;
vst1q_lane_u32(
__builtin_assume_aligned(c3, 1),
vreinterpretq_u32_u8(vout2x01234567_3x01234567),
2);
c3 += 4;
vst1q_lane_u32(
__builtin_assume_aligned(c4, 1),
vreinterpretq_u32_u8(vout4x01234567_5x01234567),
0);
c4 += 4;
vst1q_lane_u32(
__builtin_assume_aligned(c5, 1),
vreinterpretq_u32_u8(vout4x01234567_5x01234567),
2);
c5 += 4;
vst1q_lane_u32(
__builtin_assume_aligned(c6, 1),
vreinterpretq_u32_u8(vout6x01234567_7x01234567),
0);
c6 += 4;
vst1q_lane_u32(
__builtin_assume_aligned(c7, 1),
vreinterpretq_u32_u8(vout6x01234567_7x01234567),
2);
c7 += 4;
vout0x01234567_1x01234567 =
vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
vout2x01234567_3x01234567 =
vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
vout4x01234567_5x01234567 =
vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
vout6x01234567_7x01234567 =
vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 4);
nr -= 4;
}
if (nr >= 2) {
vst1q_lane_u16(
__builtin_assume_aligned(c0, 1),
vreinterpretq_u16_u8(vout0x01234567_1x01234567),
0);
c0 += 2;
vst1q_lane_u16(
__builtin_assume_aligned(c1, 1),
vreinterpretq_u16_u8(vout0x01234567_1x01234567),
4);
c1 += 2;
vst1q_lane_u16(
__builtin_assume_aligned(c2, 1),
vreinterpretq_u16_u8(vout2x01234567_3x01234567),
0);
c2 += 2;
vst1q_lane_u16(
__builtin_assume_aligned(c3, 1),
vreinterpretq_u16_u8(vout2x01234567_3x01234567),
4);
c3 += 2;
vst1q_lane_u16(
__builtin_assume_aligned(c4, 1),
vreinterpretq_u16_u8(vout4x01234567_5x01234567),
0);
c4 += 2;
vst1q_lane_u16(
__builtin_assume_aligned(c5, 1),
vreinterpretq_u16_u8(vout4x01234567_5x01234567),
4);
c5 += 2;
vst1q_lane_u16(
__builtin_assume_aligned(c6, 1),
vreinterpretq_u16_u8(vout6x01234567_7x01234567),
0);
c6 += 2;
vst1q_lane_u16(
__builtin_assume_aligned(c7, 1),
vreinterpretq_u16_u8(vout6x01234567_7x01234567),
4);
c7 += 2;
vout0x01234567_1x01234567 =
vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
vout2x01234567_3x01234567 =
vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
vout4x01234567_5x01234567 =
vextq_u8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
vout6x01234567_7x01234567 =
vextq_u8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 2);
nr -= 2;
}
if (nr != 0) {
vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
vst1q_lane_u8(c4, vout4x01234567_5x01234567, 0);
vst1q_lane_u8(c5, vout4x01234567_5x01234567, 8);
vst1q_lane_u8(c6, vout6x01234567_7x01234567, 0);
vst1q_lane_u8(c7, vout6x01234567_7x01234567, 8);
}
}
}