static inline void KernelMacroBlockIntrinsics()

in tensorflow/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h [4159:4787]


  static inline void KernelMacroBlockIntrinsics(
      const int8* scratch_block_data, const int8* filter_workspace,
      const int32* bias_data, uint8* output_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    TFLITE_DCHECK_EQ(function_params->stride, 1);
    const int workspace_height_stride =
        function_params->workspace_height_stride;
    const int output_width_micro_repeats =
        function_params->output_width_micro_repeats;
    const int depth_micro_repeats = function_params->depth_micro_repeats;
    const int output_depth = function_params->output_depth;

    const int output_width_overall_micro_repeats =
        function_params->output_width_overall_micro_repeats;
    const int block_height = function_params->outbound_block_height;
    const int residual_width = function_params->output_residual_width;
    const int output_height_stride = function_params->output_height_stride;
    constexpr int kBiasIncrement = 4;

    TFLITE_DCHECK(depth_micro_repeats > 0);

    const int32 output_activation_min =
        function_params->quantized_activation_min;
    const int32 output_activation_max =
        function_params->quantized_activation_max;
    const int32 output_multiplier = function_params->output_multiplier;
    const int32 output_shift = function_params->output_shift;
    const int32 output_offset = function_params->output_offset;
    TFLITE_DCHECK_GE(output_activation_min, 0);
    TFLITE_DCHECK_LT(output_activation_min, 256);
    TFLITE_DCHECK_GE(output_activation_max, 0);
    TFLITE_DCHECK_LT(output_activation_max, 256);
    TFLITE_DCHECK_GE(output_offset, -32878);
    TFLITE_DCHECK_LT(output_offset, 32768);

    const int16x8_t output_offset_vec =
        vdupq_n_s16(static_cast<int16>(output_offset));
    const uint8x16_t output_activation_min_vec =
        vdupq_n_u8(static_cast<uint8>(output_activation_min));
    const uint8x16_t output_activation_max_vec =
        vdupq_n_u8(static_cast<uint8>(output_activation_max));

    uint8* output_data_depthwise = output_block_data;
    for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
      // Simulate NEON-register transposition of subset of filter.
      int8x16_t filter_reg_0_a;
      int8x16_t filter_reg_0_b;
      int8x16_t filter_reg_1_a;
      int8x16_t filter_reg_1_b;
      int8x16_t filter_reg_2_a;
      int8x16_t filter_reg_2_b;
      int8x16_t filter_reg_0_a_shifted;
      int8x16_t filter_reg_1_a_shifted;
      int8x16_t filter_reg_2_a_shifted;

      filter_reg_0_a = vld1q_s8(filter_workspace);
      filter_workspace += 16;
      filter_reg_0_b = vld1q_s8(filter_workspace);
      filter_workspace += 16;
      filter_reg_1_a = vld1q_s8(filter_workspace);
      filter_workspace += 16;
      filter_reg_1_b = vld1q_s8(filter_workspace);
      filter_workspace += 16;
      filter_reg_2_a = vld1q_s8(filter_workspace);
      filter_workspace += 16;
      filter_reg_2_b = vld1q_s8(filter_workspace);
      filter_workspace += 16;

      filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
      filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
      filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);

      // When output_width_micro_repeats < output_width_overall_micro_repeats,
      // 0 < residual_width <= 2, and so residual_width == 1 is then true iff
      // residual_width < 2.
      const int adjusted_width_micro_repeats =
          (output_width_micro_repeats < output_width_overall_micro_repeats) &&
                  (residual_width < 4)
              ? output_width_micro_repeats
              : output_width_overall_micro_repeats;

      if (block_height == 4) {
        for (int s = 0; s < 2; ++s) {
          // Work through one slice, by row, at a time.
          uint8* output_data_base = output_data_depthwise + 4 * s;

          const int8* next_input_data = scratch_block_data;
          uint8* output_data = output_data_base;

          const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
          bias_data += kBiasIncrement;

          int8x16_t input_bank_a_reg;  //  left 0, right 0, left 1, right 1.
          int8x16_t input_bank_b_reg;  //  left 2, right 2, left 3, right 3.
          int8x16_t input_bank_c_reg;  //  left 4, right 4, left 5, right 5.

          // Load first sub-micro block of data into operational banks.
          input_bank_a_reg =
              vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
                                                // uninitialized variable.
          input_bank_a_reg = vld1q_lane_8x4(
              next_input_data + workspace_height_stride, input_bank_a_reg, 2);
          input_bank_b_reg = vld1q_dup_s8x4(
              next_input_data +
              2 * workspace_height_stride);  // Load lane 0, avoiding
                                             // uninitialized variable.
          input_bank_b_reg =
              vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
                             input_bank_b_reg, 2);
          input_bank_c_reg = vld1q_dup_s8x4(
              next_input_data +
              4 * workspace_height_stride);  // Load lane 0, avoiding
                                             // uninitialized variable.
          input_bank_c_reg =
              vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
                             input_bank_c_reg, 2);

          int32x4_t acc0;
          int32x4_t acc1;
          int32x4_t acc2;
          int32x4_t acc3;

          acc0 = adjusted_bias_data;
          acc1 = adjusted_bias_data;
          acc2 = adjusted_bias_data;
          acc3 = adjusted_bias_data;

          acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg, 0);
          acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg, 0);
          acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg, 0);
          acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg, 2);

          int i_width = 0;
          for (; i_width < adjusted_width_micro_repeats; ++i_width) {
            next_input_data += 4;

            // Iterate over input width shifts within 4x4 blocks.
            {
              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
                                         0);
              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
                                         2);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
                                         2);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
                                         2);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
                                         2);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
                                         0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
                                         0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
                                         2);

              // Fixed-point multiplication.
              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc0, -output_shift);
              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc1, -output_shift);
              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc2, -output_shift);
              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc3, -output_shift);
              // Add the output offset.
              int16x8_t acc_s16_0_1 =
                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
              int16x8_t acc_s16_2_3 =
                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
              // Apply the activation function.
              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
                                                  vqmovun_s16(acc_s16_2_3));
              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);

              vst1q_lane_8x4(output_data, acc_u8_all, 0);
              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
                             2);
              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
                             3);

              output_data += output_depth;
            }
            // Load next sub-micro block of data.
            input_bank_a_reg =
                vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
            input_bank_a_reg = vld1q_lane_8x4(
                next_input_data + workspace_height_stride, input_bank_a_reg, 3);
            input_bank_b_reg =
                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
                               input_bank_b_reg, 1);
            input_bank_b_reg =
                vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
                               input_bank_b_reg, 3);
            input_bank_c_reg =
                vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
                               input_bank_c_reg, 1);
            input_bank_c_reg =
                vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
                               input_bank_c_reg, 3);

            {
              acc0 = adjusted_bias_data;
              acc1 = adjusted_bias_data;
              acc2 = adjusted_bias_data;
              acc3 = adjusted_bias_data;

              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
                                         input_bank_a_reg, 0);
              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
                                         input_bank_a_reg, 2);
              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
                                         input_bank_b_reg, 0);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
                                         input_bank_a_reg, 2);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
                                         input_bank_b_reg, 0);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
                                         input_bank_b_reg, 2);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
                                         input_bank_b_reg, 0);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
                                         input_bank_b_reg, 2);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
                                         input_bank_c_reg, 0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
                                         input_bank_b_reg, 2);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
                                         input_bank_c_reg, 0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
                                         input_bank_c_reg, 2);

              // Fixed-point multiplication.
              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc0, -output_shift);
              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc1, -output_shift);
              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc2, -output_shift);
              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc3, -output_shift);
              // Add the output offset.
              int16x8_t acc_s16_0_1 =
                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
              int16x8_t acc_s16_2_3 =
                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
              // Apply the activation function.
              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
                                                  vqmovun_s16(acc_s16_2_3));
              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);

              vst1q_lane_8x4(output_data, acc_u8_all, 0);
              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
                             2);
              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
                             3);

              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);

              output_data += output_depth;
            }

            {
              acc0 = adjusted_bias_data;
              acc1 = adjusted_bias_data;
              acc2 = adjusted_bias_data;
              acc3 = adjusted_bias_data;

              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
                                         0);
              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
                                         2);
              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
                                         0);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
                                         2);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
                                         0);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
                                         2);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
                                         0);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
                                         2);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
                                         0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
                                         2);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
                                         0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
                                         2);

              // Fixed-point multiplication.
              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc0, -output_shift);
              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc1, -output_shift);
              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc2, -output_shift);
              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc3, -output_shift);
              // Add the output offset.
              int16x8_t acc_s16_0_1 =
                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
              int16x8_t acc_s16_2_3 =
                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
              // Apply the activation function.
              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
                                                  vqmovun_s16(acc_s16_2_3));
              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);

              vst1q_lane_8x4(output_data, acc_u8_all, 0);
              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
                             2);
              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
                             3);

              output_data += output_depth;
            }

            {
              acc0 = adjusted_bias_data;
              acc1 = adjusted_bias_data;
              acc2 = adjusted_bias_data;
              acc3 = adjusted_bias_data;

              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a_shifted,
                                         input_bank_a_reg, 0);
              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a_shifted,
                                         input_bank_a_reg, 2);
              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a_shifted,
                                         input_bank_b_reg, 0);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a_shifted,
                                         input_bank_a_reg, 2);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a_shifted,
                                         input_bank_b_reg, 0);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a_shifted,
                                         input_bank_b_reg, 2);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a_shifted,
                                         input_bank_b_reg, 0);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a_shifted,
                                         input_bank_b_reg, 2);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a_shifted,
                                         input_bank_c_reg, 0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a_shifted,
                                         input_bank_b_reg, 2);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a_shifted,
                                         input_bank_c_reg, 0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a_shifted,
                                         input_bank_c_reg, 2);

              // Fixed-point multiplication.
              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc0, -output_shift);
              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc1, -output_shift);
              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc2, -output_shift);
              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc3, -output_shift);
              // Add the output offset.
              int16x8_t acc_s16_0_1 =
                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
              int16x8_t acc_s16_2_3 =
                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
              // Apply the activation function.
              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
                                                  vqmovun_s16(acc_s16_2_3));
              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);

              vst1q_lane_8x4(output_data, acc_u8_all, 0);
              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
                             2);
              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
                             3);

              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 16);
              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 16);
              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 16);

              output_data += output_depth;
              acc0 = adjusted_bias_data;
              acc1 = adjusted_bias_data;
              acc2 = adjusted_bias_data;
              acc3 = adjusted_bias_data;

              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
                                         0);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
                                         0);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
                                         0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
                                         2);
            }
          }

          if (i_width < output_width_overall_micro_repeats) {
            next_input_data += 4;
            const int output_width = residual_width;

            // Load next sub-micro block of data.
            input_bank_a_reg =
                vld1q_lane_8x4(next_input_data, input_bank_a_reg, 1);
            input_bank_a_reg = vld1q_lane_8x4(
                next_input_data + workspace_height_stride, input_bank_a_reg, 3);
            input_bank_b_reg =
                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
                               input_bank_b_reg, 1);
            input_bank_b_reg =
                vld1q_lane_8x4(next_input_data + 3 * workspace_height_stride,
                               input_bank_b_reg, 3);
            input_bank_c_reg =
                vld1q_lane_8x4(next_input_data + 4 * workspace_height_stride,
                               input_bank_c_reg, 1);
            input_bank_c_reg =
                vld1q_lane_8x4(next_input_data + 5 * workspace_height_stride,
                               input_bank_c_reg, 3);

            // Iterate over input width shifts within 4x4 blocks.
            for (int x = 0; x < output_width; ++x) {
              acc0 = vdotq_four_lane_s32(acc0, filter_reg_0_a, input_bank_a_reg,
                                         0);
              acc0 = vdotq_four_lane_s32(acc0, filter_reg_1_a, input_bank_a_reg,
                                         2);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_0_a, input_bank_a_reg,
                                         2);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_2_a, input_bank_b_reg,
                                         2);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_1_a, input_bank_b_reg,
                                         2);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_2_a, input_bank_c_reg,
                                         0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_1_a, input_bank_c_reg,
                                         0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_2_a, input_bank_c_reg,
                                         2);

              // Fixed-point multiplication.
              acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
              acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc0, -output_shift);
              acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
              acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc1, -output_shift);
              acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
              acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc2, -output_shift);
              acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
              acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc3, -output_shift);
              // Add the output offset.
              int16x8_t acc_s16_0_1 =
                  vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
              int16x8_t acc_s16_2_3 =
                  vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
              acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
              acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
              // Apply the activation function.
              uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
                                                  vqmovun_s16(acc_s16_2_3));
              acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
              acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);

              vst1q_lane_8x4(output_data, acc_u8_all, 0);
              vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
              vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
                             2);
              vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
                             3);

              input_bank_a_reg = vshrq_n_u64(input_bank_a_reg, 8);
              input_bank_b_reg = vshrq_n_u64(input_bank_b_reg, 8);
              input_bank_c_reg = vshrq_n_u64(input_bank_c_reg, 8);

              output_data += output_depth;

              acc0 = adjusted_bias_data;
              acc1 = adjusted_bias_data;
              acc2 = adjusted_bias_data;
              acc3 = adjusted_bias_data;

              acc0 = vdotq_four_lane_s32(acc0, filter_reg_2_a, input_bank_b_reg,
                                         0);
              acc1 = vdotq_four_lane_s32(acc1, filter_reg_1_a, input_bank_b_reg,
                                         0);
              acc2 = vdotq_four_lane_s32(acc2, filter_reg_0_a, input_bank_b_reg,
                                         0);
              acc3 = vdotq_four_lane_s32(acc3, filter_reg_0_a, input_bank_b_reg,
                                         2);
            }
          }
          // scratch_block_data += 4 * workspace_height_stride;
          output_data_base += 4 * output_height_stride;

          // Move to next sub-block: advance to second set of filters, to new
          // bias.
          filter_reg_0_a = filter_reg_0_b;
          filter_reg_1_a = filter_reg_1_b;
          filter_reg_2_a = filter_reg_2_b;
          filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
          filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
          filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
        }
      } else {
        // Block height < 4.
        uint8* output_data_base = output_data_depthwise;

        const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
        bias_data += kBiasIncrement;
        const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
        bias_data += kBiasIncrement;

        for (int k_height = 0; k_height < block_height; ++k_height) {
          const int8* next_input_data =
              scratch_block_data + k_height * workspace_height_stride;
          uint8* output_data = output_data_base;

          int8x16_t input_bank_p_reg;  //  left 0, right 0, left 1, right 1.
          int8x16_t input_bank_q_reg;  //  left 2, right 2, left 3, right 3.

          // Load first sub-micro block of data into operational banks.
          input_bank_p_reg =
              vld1q_dup_s8x4(next_input_data);  // Load lane 0, avoiding
                                                // uninitialized variable.
          input_bank_p_reg = vld1q_lane_8x4(
              next_input_data + workspace_height_stride, input_bank_p_reg, 2);
          input_bank_q_reg = vld1q_dup_s8x4(
              next_input_data +
              2 * workspace_height_stride);  // Load lane 0, avoiding
                                             // uninitialized variable.

          for (int i_width = 0; i_width < output_width_overall_micro_repeats;
               ++i_width) {
            next_input_data += 4;
            const int output_width =
                i_width == output_width_micro_repeats ? residual_width : 4;

            // Load next sub-micro block of data.
            input_bank_p_reg =
                vld1q_lane_8x4(next_input_data, input_bank_p_reg, 1);
            input_bank_p_reg = vld1q_lane_8x4(
                next_input_data + workspace_height_stride, input_bank_p_reg, 3);
            input_bank_q_reg =
                vld1q_lane_8x4(next_input_data + 2 * workspace_height_stride,
                               input_bank_q_reg, 1);
            // Iterate over input width shifts within 4x4 blocks.
            for (int x = 0; x < output_width; ++x) {
              int32x4_t acc_a = adjusted_bias_data_a;
              int32x4_t acc_b = adjusted_bias_data_b;
              acc_a = vdotq_four_lane_s32(acc_a, filter_reg_0_a,
                                          input_bank_p_reg, 0);
              acc_a = vdotq_four_lane_s32(acc_a, filter_reg_1_a,
                                          input_bank_p_reg, 2);
              acc_a = vdotq_four_lane_s32(acc_a, filter_reg_2_a,
                                          input_bank_q_reg, 0);
              acc_b = vdotq_four_lane_s32(acc_b, filter_reg_0_b,
                                          input_bank_p_reg, 0);
              acc_b = vdotq_four_lane_s32(acc_b, filter_reg_1_b,
                                          input_bank_p_reg, 2);
              acc_b = vdotq_four_lane_s32(acc_b, filter_reg_2_b,
                                          input_bank_q_reg, 0);

              // Fixed-point multiplication.
              acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
              acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
              acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc_a, -output_shift);
              acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
                  acc_b, -output_shift);
              // Add the output offset.
              int16x8_t acc_s16_0_0 =
                  vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
              acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
              // Apply the activation function.
              uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
              acc_u8_0_0 =
                  vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
              acc_u8_0_0 =
                  vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));

              vst1_u8(output_data, acc_u8_0_0);

              input_bank_p_reg = vshrq_n_u64(input_bank_p_reg, 8);
              input_bank_q_reg = vshrq_n_u64(input_bank_q_reg, 8);

              output_data += output_depth;
            }
          }
          output_data_base += output_height_stride;
        }
      }
      output_data_depthwise += 8;
    }
  }  // NOLINT(readability/fn_size) Manually unrolled.