void pytorch_q8dwconv_ukernel_mp8x25_per_channel__sse2()

in aten/src/ATen/native/quantized/cpu/qnnpack/src/q8dwconv/mp8x25-sse2-per-channel.c [13:1098]


void pytorch_q8dwconv_ukernel_mp8x25_per_channel__sse2(
    size_t channels,
    size_t output_width,
    const uint8_t** input,
    const void* weights,
    int32_t* outacc32,
    uint8_t* output,
    size_t input_stride,
    size_t output_increment,
    const union pytorch_qnnp_conv_quantization_params
        quantization_params[RESTRICT_STATIC 1]) {
  const __m128i vinput_zero_point = _mm_load_si128(
      (const __m128i*)quantization_params->sse2.input_zero_point);
  const __m128i vzero = _mm_setzero_si128();

  do {
    int32_t* outacc = outacc32;
    const void* w = weights;
    {
      const uint8_t* i00 = input[0];
      const uint8_t* i01 = input[1];
      const uint8_t* i02 = input[2];
      const uint8_t* i10 = input[3];
      const uint8_t* i11 = input[4];
      const uint8_t* i12 = input[5];
      const uint8_t* i20 = input[6];
      const uint8_t* i21 = input[7];
      const uint8_t* i22 = input[8];
      const uint8_t* i23 = input[9];

      size_t c = channels;
      for (; c >= 8; c -= 8) {
        __m128i vacc_lo = _mm_loadu_si128((const __m128i*)w);
        __m128i vacc_hi = _mm_loadu_si128((const __m128i*)((uintptr_t)w + 16));
        const __m128i vkernel_zero_point = _mm_loadl_epi64(
            (const __m128i*)
            &quantization_params->sse2.kernel_zero_points[channels - c]);

        const __m128i vi00 = _mm_loadl_epi64((const __m128i*)i00);
        i00 += 8;
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk00 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk00, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod00_odd, vprod00_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod00_odd, vprod00_even));

        const __m128i vi01 = _mm_loadl_epi64((const __m128i*)i01);
        i01 += 8;
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
        const __m128i vxk01 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk01, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 = _mm_loadl_epi64((const __m128i*)i02);
        i02 += 8;
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
        const __m128i vxk02 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk02, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 = _mm_loadl_epi64((const __m128i*)i10);
        i10 += 8;
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
        const __m128i vxk10 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk10, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 = _mm_loadl_epi64((const __m128i*)i11);
        i11 += 8;
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
        const __m128i vxk11 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk11, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        const __m128i vi12 = _mm_loadl_epi64((const __m128i*)i12);
        i12 += 8;
        const __m128i vxi12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
        const __m128i vk12 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
        const __m128i vxk12 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk12, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
        const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));

        const __m128i vi20 = _mm_loadl_epi64((const __m128i*)i20);
        i20 += 8;
        const __m128i vxi20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
        const __m128i vk20 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 80));
        const __m128i vxk20 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk20, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
        const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));

        const __m128i vi21 = _mm_loadl_epi64((const __m128i*)i21);
        i21 += 8;
        const __m128i vxi21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
        const __m128i vk21 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 88));
        const __m128i vxk21 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk21, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
        const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));

        const __m128i vi22 = _mm_loadl_epi64((const __m128i*)i22);
        i22 += 8;
        const __m128i vxi22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
        const __m128i vk22 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 96));
        const __m128i vxk22 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk22, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
        const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));

        const __m128i vi23 = _mm_loadl_epi64((const __m128i*)i23);
        i23 += 8;
        const __m128i vxi23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
        const __m128i vk23 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 104));
        const __m128i vxk23 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk23, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
        const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));

        w = (const void*)((uintptr_t)w + 112);
        _mm_storeu_si128((__m128i*)outacc, vacc_lo);
        outacc += 4;
        _mm_storeu_si128((__m128i*)outacc, vacc_hi);
        outacc += 4;
      }
      if (c != 0) {
        const size_t i_predecrement = 8 - c;
        const __m128i vi_shift = _mm_cvtsi32_si128(8 * i_predecrement);
        const __m128i vkernel_zero_point = _mm_loadl_epi64(
            (const __m128i*)
            &quantization_params->sse2.kernel_zero_points[channels - c]);
        i00 -= i_predecrement;
        i01 -= i_predecrement;
        i02 -= i_predecrement;
        i10 -= i_predecrement;
        i11 -= i_predecrement;
        i12 -= i_predecrement;
        i20 -= i_predecrement;
        i21 -= i_predecrement;
        i22 -= i_predecrement;
        i23 -= i_predecrement;

        __m128i vacc_lo = _mm_loadu_si128((const __m128i*)w);
        __m128i vacc_hi = _mm_loadu_si128((const __m128i*)((uintptr_t)w + 16));

        const __m128i vi00 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i00), vi_shift);
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk00 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk00, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod00_odd, vprod00_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod00_odd, vprod00_even));

        const __m128i vi01 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i01), vi_shift);
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
        const __m128i vxk01 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk01, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i02), vi_shift);
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
        const __m128i vxk02 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk02, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i10), vi_shift);
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
        const __m128i vxk10 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk10, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i11), vi_shift);
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
        const __m128i vxk11 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk11, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        const __m128i vi12 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i12), vi_shift);
        const __m128i vxi12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
        const __m128i vk12 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
        const __m128i vxk12 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk12, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
        const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));

        const __m128i vi20 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i20), vi_shift);
        const __m128i vxi20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
        const __m128i vk20 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 80));
        const __m128i vxk20 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk20, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
        const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));

        const __m128i vi21 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i21), vi_shift);
        const __m128i vxi21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
        const __m128i vk21 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 88));
        const __m128i vxk21 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk21, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
        const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));

        const __m128i vi22 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i22), vi_shift);
        const __m128i vxi22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
        const __m128i vk22 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 96));
        const __m128i vxk22 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk22, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
        const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));

        const __m128i vi23 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i23), vi_shift);
        const __m128i vxi23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
        const __m128i vk23 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 104));
        const __m128i vxk23 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk23, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
        const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));

        w = (const void*)((uintptr_t)w + 112);
        _mm_storeu_si128((__m128i*)outacc, vacc_lo);
        outacc += 4;
        _mm_storeu_si128((__m128i*)outacc, vacc_hi);
        outacc += 4;
      }
    }
    {
      const uint8_t* i00 = input[10];
      const uint8_t* i01 = input[11];
      const uint8_t* i02 = input[12];
      const uint8_t* i10 = input[13];
      const uint8_t* i11 = input[14];
      const uint8_t* i12 = input[15];
      const uint8_t* i20 = input[16];
      const uint8_t* i21 = input[17];
      const uint8_t* i22 = input[18];
      const uint8_t* i23 = input[19];
      outacc = outacc32;

      size_t c = channels;
      for (; c >= 8; c -= 8) {
        const __m128i vi00 = _mm_loadl_epi64((const __m128i*)i00);
        const __m128i vkernel_zero_point = _mm_loadl_epi64(
            (const __m128i*)
            &quantization_params->sse2.kernel_zero_points[channels - c]);
        i00 += 8;
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
        const __m128i vxk00 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk00, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
        __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);

        const __m128i vi01 = _mm_loadl_epi64((const __m128i*)i01);
        i01 += 8;
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
        const __m128i vxk01 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk01, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 = _mm_loadl_epi64((const __m128i*)i02);
        i02 += 8;
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
        const __m128i vxk02 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk02, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 = _mm_loadl_epi64((const __m128i*)i10);
        i10 += 8;
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
        const __m128i vxk10 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk10, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 = _mm_loadl_epi64((const __m128i*)i11);
        i11 += 8;
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk11 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk11, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        const __m128i vi12 = _mm_loadl_epi64((const __m128i*)i12);
        i12 += 8;
        const __m128i vxi12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
        const __m128i vk12 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
        const __m128i vxk12 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk12, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
        const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));

        const __m128i vi20 = _mm_loadl_epi64((const __m128i*)i20);
        i20 += 8;
        const __m128i vxi20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
        const __m128i vk20 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
        const __m128i vxk20 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk20, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
        const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));

        const __m128i vi21 = _mm_loadl_epi64((const __m128i*)i21);
        i21 += 8;
        const __m128i vxi21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
        const __m128i vk21 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
        const __m128i vxk21 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk21, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
        const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));

        const __m128i vi22 = _mm_loadl_epi64((const __m128i*)i22);
        i22 += 8;
        const __m128i vxi22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
        const __m128i vk22 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
        const __m128i vxk22 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk22, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
        const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));

        const __m128i vi23 = _mm_loadl_epi64((const __m128i*)i23);
        i23 += 8;
        const __m128i vxi23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
        const __m128i vk23 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
        const __m128i vxk23 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk23, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
        const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));

        w = (const void*)((uintptr_t)w + 80);
        vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
        vacc_hi =
            _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
        _mm_storeu_si128((__m128i*)outacc, vacc_lo);
        outacc += 4;
        _mm_storeu_si128((__m128i*)outacc, vacc_hi);
        outacc += 4;
      }
      if (c != 0) {
        const size_t i_predecrement = 8 - c;
        const __m128i vi_shift = _mm_cvtsi32_si128(8 * i_predecrement);
        const __m128i vkernel_zero_point = _mm_loadl_epi64(
            (const __m128i*)
            &quantization_params->sse2.kernel_zero_points[channels - c]);
        i00 -= i_predecrement;
        i01 -= i_predecrement;
        i02 -= i_predecrement;
        i10 -= i_predecrement;
        i11 -= i_predecrement;
        i12 -= i_predecrement;
        i20 -= i_predecrement;
        i21 -= i_predecrement;
        i22 -= i_predecrement;
        i23 -= i_predecrement;

        const __m128i vi00 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i00), vi_shift);
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
        const __m128i vxk00 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk00, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
        __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);

        const __m128i vi01 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i01), vi_shift);
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
        const __m128i vxk01 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk01, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i02), vi_shift);
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
        const __m128i vxk02 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk02, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i10), vi_shift);
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
        const __m128i vxk10 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk10, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i11), vi_shift);
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk11 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk11, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        const __m128i vi12 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i12), vi_shift);
        const __m128i vxi12 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi12, vzero), vinput_zero_point);
        const __m128i vk12 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 40));
        const __m128i vxk12 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk12, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod12_odd = _mm_mullo_epi16(vxi12, vxk12);
        const __m128i vprod12_even = _mm_mulhi_epi16(vxi12, vxk12);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod12_odd, vprod12_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod12_odd, vprod12_even));

        const __m128i vi20 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i20), vi_shift);
        const __m128i vxi20 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi20, vzero), vinput_zero_point);
        const __m128i vk20 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 48));
        const __m128i vxk20 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk20, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod20_odd = _mm_mullo_epi16(vxi20, vxk20);
        const __m128i vprod20_even = _mm_mulhi_epi16(vxi20, vxk20);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod20_odd, vprod20_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod20_odd, vprod20_even));

        const __m128i vi21 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i21), vi_shift);
        const __m128i vxi21 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi21, vzero), vinput_zero_point);
        const __m128i vk21 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 56));
        const __m128i vxk21 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk21, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod21_odd = _mm_mullo_epi16(vxi21, vxk21);
        const __m128i vprod21_even = _mm_mulhi_epi16(vxi21, vxk21);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod21_odd, vprod21_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod21_odd, vprod21_even));

        const __m128i vi22 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i22), vi_shift);
        const __m128i vxi22 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi22, vzero), vinput_zero_point);
        const __m128i vk22 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 64));
        const __m128i vxk22 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk22, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod22_odd = _mm_mullo_epi16(vxi22, vxk22);
        const __m128i vprod22_even = _mm_mulhi_epi16(vxi22, vxk22);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod22_odd, vprod22_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod22_odd, vprod22_even));

        const __m128i vi23 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i23), vi_shift);
        const __m128i vxi23 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi23, vzero), vinput_zero_point);
        const __m128i vk23 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 72));
        const __m128i vxk23 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk23, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod23_odd = _mm_mullo_epi16(vxi23, vxk23);
        const __m128i vprod23_even = _mm_mulhi_epi16(vxi23, vxk23);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod23_odd, vprod23_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod23_odd, vprod23_even));

        w = (const void*)((uintptr_t)w + 80);
        vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
        vacc_hi =
            _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
        _mm_storeu_si128((__m128i*)outacc, vacc_lo);
        outacc += 4;
        _mm_storeu_si128((__m128i*)outacc, vacc_hi);
        outacc += 4;
      }
    }
    {
      const uint8_t* i00 = input[20];
      const uint8_t* i01 = input[21];
      const uint8_t* i02 = input[22];
      const uint8_t* i10 = input[23];
      const uint8_t* i11 = input[24];
      input = (const uint8_t**)((uintptr_t)input + input_stride);
      outacc = outacc32;
      size_t c = channels;
      for (; c >= 8; c -= 8) {
        const __m128i vi00 = _mm_loadl_epi64((const __m128i*)i00);
        const __m128i vkernel_zero_point = _mm_loadl_epi64(
            (const __m128i*)
            &quantization_params->sse2.kernel_zero_points[channels - c]);
        i00 += 8;
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
        const __m128i vxk00 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk00, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
        __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);

        const __m128i vi01 = _mm_loadl_epi64((const __m128i*)i01);
        i01 += 8;
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
        const __m128i vxk01 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk01, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 = _mm_loadl_epi64((const __m128i*)i02);
        i02 += 8;
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
        const __m128i vxk02 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk02, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 = _mm_loadl_epi64((const __m128i*)i10);
        i10 += 8;
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
        const __m128i vxk10 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk10, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 = _mm_loadl_epi64((const __m128i*)i11);
        i11 += 8;
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk11 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk11, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        w = (const void*)((uintptr_t)w + 40);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
        vacc_hi =
            _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
        outacc += 8;

        const __m128 vmultiplier_lo =
            _mm_loadu_ps(&quantization_params->sse2.requantization_scales[channels - c]);
        const __m128 vmultiplier_hi =
            _mm_loadu_ps(&quantization_params->sse2.requantization_scales[channels - c + 4]);

        vacc_lo = _mm_cvtps_epi32(
                      _mm_mul_ps(
                        _mm_cvtepi32_ps(vacc_lo),
                        vmultiplier_lo
                        )
                      );
        vacc_hi = _mm_cvtps_epi32(
                      _mm_mul_ps(
                        _mm_cvtepi32_ps(vacc_hi),
                        vmultiplier_hi
                        )
                      );

        const __m128i voutput_zero_point = _mm_load_si128(
            (const __m128i*)quantization_params->sse2.output_zero_point);
        __m128i vout = _mm_adds_epi16(
            _mm_packs_epi32(vacc_lo, vacc_hi), voutput_zero_point);
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_max_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)quantization_params->sse2.output_min));
        vout = _mm_min_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)quantization_params->sse2.output_max));

        _mm_storel_epi64((__m128i*)output, vout);
        output += 8;
      }
      if (c != 0) {
        const size_t i_predecrement = 8 - c;
        const __m128i vi_shift = _mm_cvtsi32_si128(8 * i_predecrement);
        const __m128i vkernel_zero_point = _mm_loadl_epi64(
            (const __m128i*)
            &quantization_params->sse2.kernel_zero_points[channels - c]);
        i00 -= i_predecrement;
        i01 -= i_predecrement;
        i02 -= i_predecrement;
        i10 -= i_predecrement;
        i11 -= i_predecrement;

        const __m128i vi00 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i00), vi_shift);
        const __m128i vxi00 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi00, vzero), vinput_zero_point);
        const __m128i vk00 = _mm_loadl_epi64((const __m128i*)((uintptr_t)w));
        const __m128i vxk00 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk00, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod00_odd = _mm_mullo_epi16(vxi00, vxk00);
        const __m128i vprod00_even = _mm_mulhi_epi16(vxi00, vxk00);
        __m128i vacc_lo = _mm_unpacklo_epi16(vprod00_odd, vprod00_even);
        __m128i vacc_hi = _mm_unpackhi_epi16(vprod00_odd, vprod00_even);

        const __m128i vi01 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i01), vi_shift);
        const __m128i vxi01 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi01, vzero), vinput_zero_point);
        const __m128i vk01 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 8));
        const __m128i vxk01 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk01, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod01_odd = _mm_mullo_epi16(vxi01, vxk01);
        const __m128i vprod01_even = _mm_mulhi_epi16(vxi01, vxk01);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod01_odd, vprod01_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod01_odd, vprod01_even));

        const __m128i vi02 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i02), vi_shift);
        const __m128i vxi02 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi02, vzero), vinput_zero_point);
        const __m128i vk02 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 16));
        const __m128i vxk02 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk02, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod02_odd = _mm_mullo_epi16(vxi02, vxk02);
        const __m128i vprod02_even = _mm_mulhi_epi16(vxi02, vxk02);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod02_odd, vprod02_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod02_odd, vprod02_even));

        const __m128i vi10 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i10), vi_shift);
        const __m128i vxi10 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi10, vzero), vinput_zero_point);
        const __m128i vk10 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 24));
        const __m128i vxk10 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk10, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod10_odd = _mm_mullo_epi16(vxi10, vxk10);
        const __m128i vprod10_even = _mm_mulhi_epi16(vxi10, vxk10);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod10_odd, vprod10_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod10_odd, vprod10_even));

        const __m128i vi11 =
            _mm_srl_epi64(_mm_loadl_epi64((const __m128i*)i11), vi_shift);
        const __m128i vxi11 =
            _mm_sub_epi16(_mm_unpacklo_epi8(vi11, vzero), vinput_zero_point);
        const __m128i vk11 =
            _mm_loadl_epi64((const __m128i*)((uintptr_t)w + 32));
        const __m128i vxk11 =
            _mm_sub_epi16(
                _mm_unpacklo_epi8(vk11, vzero),
                _mm_unpacklo_epi8(vkernel_zero_point, vzero));
        const __m128i vprod11_odd = _mm_mullo_epi16(vxi11, vxk11);
        const __m128i vprod11_even = _mm_mulhi_epi16(vxi11, vxk11);
        vacc_lo = _mm_add_epi32(
            vacc_lo, _mm_unpacklo_epi16(vprod11_odd, vprod11_even));
        vacc_hi = _mm_add_epi32(
            vacc_hi, _mm_unpackhi_epi16(vprod11_odd, vprod11_even));

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_loadu_si128((__m128i*)outacc));
        vacc_hi =
            _mm_add_epi32(vacc_hi, _mm_loadu_si128((__m128i*)(outacc + 4)));
        outacc += 8;

        const __m128 vmultiplier_lo =
            _mm_loadu_ps(&quantization_params->sse2.requantization_scales[channels - c]);
        const __m128 vmultiplier_hi =
            _mm_loadu_ps(&quantization_params->sse2.requantization_scales[channels - c + 4]);

        vacc_lo = _mm_cvtps_epi32(
                      _mm_mul_ps(
                        _mm_cvtepi32_ps(vacc_lo),
                        vmultiplier_lo
                        )
                      );
        vacc_hi = _mm_cvtps_epi32(
                      _mm_mul_ps(
                        _mm_cvtepi32_ps(vacc_hi),
                        vmultiplier_hi
                        )
                      );

        const __m128i voutput_zero_point = _mm_load_si128(
            (const __m128i*)quantization_params->sse2.output_zero_point);
        __m128i vout = _mm_adds_epi16(
            _mm_packs_epi32(vacc_lo, vacc_hi), voutput_zero_point);
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_max_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)quantization_params->sse2.output_min));
        vout = _mm_min_epu8(
            vout,
            _mm_load_si128(
                (const __m128i*)quantization_params->sse2.output_max));

        if (c & 4) {
          *((uint32_t*)output) = (uint32_t)_mm_cvtsi128_si32(vout);
          output += 4;
          vout = _mm_srli_epi64(vout, 32);
        }
        if (c & 2) {
          *((uint16_t*)output) = (uint16_t)_mm_extract_epi16(vout, 0);
          output += 2;
          vout = _mm_srli_epi32(vout, 16);
        }
        if (c & 1) {
          *((uint8_t*)output) = (uint8_t)_mm_cvtsi128_si32(vout);
          output += 1;
        }
      }
    }
    output = (uint8_t*)((uintptr_t)output + output_increment);
  } while (--output_width != 0);
}