static inline void KernelMacroBlockIntrinsics()

in tensorflow/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h [4803:5314]


  static inline void KernelMacroBlockIntrinsics(
      const int8* scratch_block_data, const int8* filter_workspace,
      const int32* bias_data, uint8* output_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    const int workspace_height_stride =
        function_params->workspace_height_stride;
    const int output_width_micro_repeats =
        function_params->output_width_micro_repeats;
    const int depth_micro_repeats = function_params->depth_micro_repeats;
    const int output_depth = function_params->output_depth;
    constexpr int kStrideVal = 2;
    TFLITE_DCHECK_EQ(function_params->stride, kStrideVal);

    const int output_width_overall_micro_repeats =
        function_params->output_width_overall_micro_repeats;
    const int block_height = function_params->outbound_block_height;
    const int residual_width = function_params->output_residual_width;
    const int output_height_stride = function_params->output_height_stride;
    constexpr int kBiasIncrement = 4;

    const int32 output_activation_min =
        function_params->quantized_activation_min;
    const int32 output_activation_max =
        function_params->quantized_activation_max;
    const int32 output_multiplier = function_params->output_multiplier;
    const int32 output_shift = function_params->output_shift;
    const int32 output_offset = function_params->output_offset;
    TFLITE_DCHECK_GE(output_activation_min, 0);
    TFLITE_DCHECK_LT(output_activation_min, 256);
    TFLITE_DCHECK_GE(output_activation_max, 0);
    TFLITE_DCHECK_LT(output_activation_max, 256);
    TFLITE_DCHECK_GE(output_offset, -32878);
    TFLITE_DCHECK_LT(output_offset, 32768);

    TFLITE_DCHECK_GE(depth_micro_repeats, 1);

    const int16x8_t output_offset_vec =
        vdupq_n_s16(static_cast<int16>(output_offset));
    const uint8x16_t output_activation_min_vec =
        vdupq_n_u8(static_cast<uint8>(output_activation_min));
    const uint8x16_t output_activation_max_vec =
        vdupq_n_u8(static_cast<uint8>(output_activation_max));

    for (int j_depth = 0; j_depth < (depth_micro_repeats * 1 + 0); ++j_depth) {
      int8x16_t filter_reg_0_a;
      int8x16_t filter_reg_0_b;
      int8x16_t filter_reg_1_a;
      int8x16_t filter_reg_1_b;
      int8x16_t filter_reg_2_a;
      int8x16_t filter_reg_2_b;

      filter_reg_0_a = vld1q_s8(filter_workspace);
      filter_workspace += 16;
      filter_reg_0_b = vld1q_s8(filter_workspace);
      filter_workspace += 16;
      filter_reg_1_a = vld1q_s8(filter_workspace);
      filter_workspace += 16;
      filter_reg_1_b = vld1q_s8(filter_workspace);
      filter_workspace += 16;
      filter_reg_2_a = vld1q_s8(filter_workspace);
      filter_workspace += 16;
      filter_reg_2_b = vld1q_s8(filter_workspace);
      filter_workspace += 16;

      const int32x4_t adjusted_bias_data_s_0 = vld1q_s32(bias_data);
      bias_data += kBiasIncrement;
      const int32x4_t adjusted_bias_data_s_1 = vld1q_s32(bias_data);
      bias_data += kBiasIncrement;

      if (block_height == 2) {
        const int8* scratch_data = scratch_block_data;
        uint8* output_data = output_block_data + 8 * j_depth;

        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
        int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
        int8x16_t input_bank_c_reg;  //  left 4, right 4, xxx, xxx.

        // Load first sub-micro block of data into operational banks.
        input_bank_a_reg =
            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
                                           // uninitialized variable.
        input_bank_a_reg = vld1q_lane_8x4(
            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
        input_bank_b_reg = vld1q_dup_s8x4(
            scratch_data +
            2 * workspace_height_stride);  // Load lane 0, avoiding
                                           // uninitialized variable.
        input_bank_b_reg = vld1q_lane_8x4(
            scratch_data + 3 * workspace_height_stride, input_bank_b_reg, 2);
        input_bank_c_reg = vld1q_dup_s8x4(
            scratch_data +
            4 * workspace_height_stride);  // Load lane 0, avoiding
                                           // uninitialized variable.

        int32x4_t acc0;
        int32x4_t acc1;

        // When output_width_micro_repeats < output_width_overall_micro_repeats,
        // 0 < residual_width <= 2, and so residual_width == 1 is then true iff
        // residual_width < 2.
        const int adjusted_width_micro_repeats =
            (output_width_micro_repeats < output_width_overall_micro_repeats) &&
                    (residual_width < 2)
                ? output_width_micro_repeats
                : output_width_overall_micro_repeats;

        int i_width = 0;
        for (; i_width < adjusted_width_micro_repeats; ++i_width) {
          const int8* input_data = scratch_data + 4 + 4 * i_width;

          // Load next sub-micro block of data.
          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
          input_bank_a_reg = vld1q_lane_8x4(
              input_data + workspace_height_stride, input_bank_a_reg, 3);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
          input_bank_c_reg = vld1q_lane_8x4(
              input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);

          int16x8_t acc_s16_0_1;
          uint8x8_t acc_u8_0_1;
          // Iterate over input width shifts within 4x4 blocks.
          {
            acc0 = adjusted_bias_data_s_0;
            acc1 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);

            // Fixed-point multiplication.
            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc0, -output_shift);
            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc1, -output_shift);
            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
            acc_u8_0_1 =
                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
            acc_u8_0_1 =
                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));

            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);

            acc0 = adjusted_bias_data_s_1;
            acc1 = adjusted_bias_data_s_1;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);

            // Fixed-point multiplication.
            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc0, -output_shift);
            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc1, -output_shift);
            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
            acc_u8_0_1 =
                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
            acc_u8_0_1 =
                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));

            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
                          1);

            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);

            output_data += output_depth;
          }

          // output_width == four_over_stride.
          acc0 = adjusted_bias_data_s_0;
          acc1 = adjusted_bias_data_s_0;

          acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
          acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
          acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);

          // Fixed-point multiplication.
          acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
          acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
              acc0, -output_shift);
          acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
          acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
              acc1, -output_shift);
          // Add the output offset.
          acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
          acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
          // Apply the activation function.
          acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
          acc_u8_0_1 =
              vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
          acc_u8_0_1 =
              vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));

          vst1_lane_8x4(output_data, acc_u8_0_1, 0);
          vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);

          acc0 = adjusted_bias_data_s_1;
          acc1 = adjusted_bias_data_s_1;

          acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
          acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
          acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);

          // Fixed-point multiplication.
          acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
          acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
              acc0, -output_shift);
          acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
          acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
              acc1, -output_shift);
          // Add the output offset.
          acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
          acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
          // Apply the activation function.
          acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
          acc_u8_0_1 =
              vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
          acc_u8_0_1 =
              vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));

          vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
          vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1, 1);

          input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
          input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
          input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);

          output_data += output_depth;
        }
        for (; i_width < output_width_overall_micro_repeats; ++i_width) {
          // output_width == 1.
          const int8* input_data = scratch_data + 4 + 4 * i_width;

          // Load next sub-micro block of data.
          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
          input_bank_a_reg = vld1q_lane_8x4(
              input_data + workspace_height_stride, input_bank_a_reg, 3);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 3 * workspace_height_stride, input_bank_b_reg, 3);
          input_bank_c_reg = vld1q_lane_8x4(
              input_data + 4 * workspace_height_stride, input_bank_c_reg, 1);

          int16x8_t acc_s16_0_1;
          uint8x8_t acc_u8_0_1;
          // Iterate over input width shifts within 4x4 blocks.
          {
            acc0 = adjusted_bias_data_s_0;
            acc1 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 2);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_c_reg, 0);

            // Fixed-point multiplication.
            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc0, -output_shift);
            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc1, -output_shift);
            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
            acc_u8_0_1 =
                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
            acc_u8_0_1 =
                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));

            vst1_lane_8x4(output_data, acc_u8_0_1, 0);
            vst1_lane_8x4(output_data + output_height_stride, acc_u8_0_1, 1);

            acc0 = adjusted_bias_data_s_1;
            acc1 = adjusted_bias_data_s_1;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_b, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_b, input_bank_a_reg, 2);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_b_reg, 2);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_c_reg, 0);

            // Fixed-point multiplication.
            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc0, -output_shift);
            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc1, -output_shift);
            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
            acc_u8_0_1 =
                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
            acc_u8_0_1 =
                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));

            vst1_lane_8x4(output_data + 4, acc_u8_0_1, 0);
            vst1_lane_8x4(output_data + 4 + output_height_stride, acc_u8_0_1,
                          1);

            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
            input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);

            output_data += output_depth;
          }
        }
      } else {
        TFLITE_DCHECK_EQ(block_height, 1);
        // Work through one slice, by row, at a time.
        const int8* scratch_data = scratch_block_data;
        uint8* output_data = output_block_data + 8 * j_depth;

        //
        int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
        int8x16_t input_bank_b_reg;  //  left 2, right 2, xxx, xxx.

        // Load first sub-micro block of data into operational banks.
        input_bank_a_reg =
            vld1q_dup_s8x4(scratch_data);  // Load lane 0, avoiding
                                           // uninitialized variable.
        input_bank_a_reg = vld1q_lane_8x4(
            scratch_data + workspace_height_stride, input_bank_a_reg, 2);
        input_bank_b_reg = vld1q_dup_s8x4(
            scratch_data +
            2 * workspace_height_stride);  // Load lane 0, avoiding
                                           // uninitialized variable.

        int32x4_t acc0;
        int32x4_t acc1;

        for (int i_width = 0; i_width < output_width_overall_micro_repeats;
             ++i_width) {
          const int output_width =
              i_width == output_width_micro_repeats ? residual_width : 2;

          TFLITE_DCHECK_LE(output_width, 2);
          TFLITE_DCHECK_GE(output_width, 1);
          TFLITE_DCHECK_LE(output_width * kStrideVal, 4);
          const int8* input_data = scratch_data + 4 + 4 * i_width;

          // Load next sub-micro block of data.
          input_bank_a_reg = vld1q_lane_8x4(input_data, input_bank_a_reg, 1);
          input_bank_a_reg = vld1q_lane_8x4(
              input_data + workspace_height_stride, input_bank_a_reg, 3);
          input_bank_b_reg = vld1q_lane_8x4(
              input_data + 2 * workspace_height_stride, input_bank_b_reg, 1);

          int16x8_t acc_s16_0_1;
          uint8x8_t acc_u8_0_1;

          // Iterate over input width shifts within 4x4 blocks.
          {
            acc0 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);

            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc0, -output_shift);

            // Second sub-block accumulation.
            acc1 = adjusted_bias_data_s_1;

            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);

            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc1, -output_shift);

            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
            acc_u8_0_1 =
                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
            acc_u8_0_1 =
                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));

            // This stores the results for both sub-blocks together.
            vst1_u8(output_data, acc_u8_0_1);

            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);

            output_data += output_depth;
          }
          if (output_width == 2) {
            acc0 = adjusted_bias_data_s_0;

            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg, 0);
            acc0 =
                vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg, 2);

            acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
            acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc0, -output_shift);

            // Second sub-block accumulation.
            acc1 = adjusted_bias_data_s_1;

            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_2_b, input_bank_b_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_0_b, input_bank_a_reg, 0);
            acc1 =
                vdotq_four_lane_s32(acc1, filter_reg_1_b, input_bank_a_reg, 2);

            acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
            acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                acc1, -output_shift);

            // Add the output offset.
            acc_s16_0_1 = vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
            acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
            // Apply the activation function.
            acc_u8_0_1 = vqmovun_s16(acc_s16_0_1);
            acc_u8_0_1 =
                vmax_u8(acc_u8_0_1, vget_low_u8(output_activation_min_vec));
            acc_u8_0_1 =
                vmin_u8(acc_u8_0_1, vget_low_u8(output_activation_max_vec));

            // This stores the results for both sub-blocks together.
            vst1_u8(output_data, acc_u8_0_1);

            input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
            input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);

            output_data += output_depth;
          }
        }
      }
    }
  }