in tensorflow/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_transitional.h [3109:3715]
// Computes one "macro block" of depthwise convolution using NEON dot-product
// (SDOT, via vdotq_s32) intrinsics.
//
// Inputs:
//   scratch_block_data: pre-shuffled int8 input data laid out by an earlier
//       workspace-fill pass (rows separated by workspace_height_stride,
//       width micro blocks separated by width_micro_stride = 32 bytes).
//   filter_workspace:   filter values pre-transposed into register-shaped
//       16-byte groups; consumed 6 x 16 bytes per depth micro block
//       (3 filter rows x 2 depth-4 sub-blocks, "_a" and "_b").
//   bias_data:          per-channel bias, consumed 4 int32 at a time.
//   output_block_data:  uint8 output, rows separated by output_height_stride,
//       adjacent width positions separated by `depth` bytes (NHWC).
//
// Each depth micro block covers 8 output channels. The kernel requantizes
// with vqrdmulhq + rounding right shift, adds output_offset, and clamps to
// [output_activation_min, output_activation_max].
static inline void KernelMacroBlockIntrinsics(
const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, uint8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
// Unpack geometry / quantization parameters once into locals.
const int workspace_height_stride =
function_params->workspace_height_stride;
const int input_width_overall_micro_repeats =
function_params->input_width_overall_micro_repeats;
const int output_width_micro_repeats =
function_params->output_width_micro_repeats;
const int depth_micro_repeats = function_params->depth_micro_repeats;
const int depth = function_params->input_depth;
const int output_width_overall_micro_repeats =
function_params->output_width_overall_micro_repeats;
const int block_height = function_params->outbound_block_height;
const int residual_width = function_params->output_residual_width;
const int output_height_stride = function_params->output_height_stride;
// Four int32 bias values are consumed per depth-4 sub-block.
constexpr int kBiasIncrement = 4;
TFLITE_DCHECK(depth_micro_repeats > 0);
// One width micro block is 4 input positions x 8 depth bytes = 32 bytes.
const int width_micro_stride = 4 * 8;
const int depth_micro_stride =
width_micro_stride * input_width_overall_micro_repeats;
const int32 output_activation_min =
function_params->quantized_activation_min;
const int32 output_activation_max =
function_params->quantized_activation_max;
const int32 output_multiplier = function_params->output_multiplier;
const int32 output_shift = function_params->output_shift;
const int32 output_offset = function_params->output_offset;
// Sanity-check quantization parameters fit their target ranges.
TFLITE_DCHECK_GE(output_activation_min, 0);
TFLITE_DCHECK_LT(output_activation_min, 256);
TFLITE_DCHECK_GE(output_activation_max, 0);
TFLITE_DCHECK_LT(output_activation_max, 256);
// NOTE(review): -32878 looks like a typo for -32768 (int16 min) — the
// matching upper bound is 32768. Confirm before relying on this check.
TFLITE_DCHECK_GE(output_offset, -32878);
TFLITE_DCHECK_LT(output_offset, 32768);
// Broadcast quantization constants into vector registers.
const int16x8_t output_offset_vec =
vdupq_n_s16(static_cast<int16>(output_offset));
const uint8x16_t output_activation_min_vec =
vdupq_n_u8(static_cast<uint8>(output_activation_min));
const uint8x16_t output_activation_max_vec =
vdupq_n_u8(static_cast<uint8>(output_activation_max));
const int8* input_data_depthwise = scratch_block_data;
uint8* output_data_depthwise = output_block_data;
// One iteration per 8-channel depth micro block.
for (int j_depth = 0; j_depth < depth_micro_repeats; ++j_depth) {
// Simulate NEON-register transposition of subset of filter.
// "_a" registers hold the first depth-4 sub-block, "_b" the second, for
// each of the 3 filter rows. The "_shifted" variants pre-shift each
// 32-bit lane left by 8 bits (one input byte), which realigns the filter
// against the input for odd output columns without reloading input data.
int8x16_t filter_reg_0_a;
int8x16_t filter_reg_0_b;
int8x16_t filter_reg_1_a;
int8x16_t filter_reg_1_b;
int8x16_t filter_reg_2_a;
int8x16_t filter_reg_2_b;
int8x16_t filter_reg_0_a_shifted;
int8x16_t filter_reg_1_a_shifted;
int8x16_t filter_reg_2_a_shifted;
filter_reg_0_a = vld1q_s8(filter_workspace);
filter_workspace += 16;
filter_reg_0_b = vld1q_s8(filter_workspace);
filter_workspace += 16;
filter_reg_1_a = vld1q_s8(filter_workspace);
filter_workspace += 16;
filter_reg_1_b = vld1q_s8(filter_workspace);
filter_workspace += 16;
filter_reg_2_a = vld1q_s8(filter_workspace);
filter_workspace += 16;
filter_reg_2_b = vld1q_s8(filter_workspace);
filter_workspace += 16;
// Lane-wise u32 shift applied to int8x16_t operands — presumably relies
// on overloads/casts defined earlier in this file; confirm there.
filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
if (block_height == 4) {
// Fast path: a full strip of 4 output rows. Handle the two depth-4
// sub-blocks (s = 0: "_a" filters, s = 1: "_b" filters copied into the
// "_a" registers at the bottom of the loop) one after the other.
for (int s = 0; s < 2; ++s) {
// Work through one slice, by row, at a time.
const int8* input_data_base = input_data_depthwise + 2 * 8 * s;
uint8* output_data_base = output_data_depthwise + 4 * s;
const int8* next_input_data = input_data_base;
uint8* output_data = output_data_base;
const int32x4_t adjusted_bias_data = vld1q_s32(bias_data);
bias_data += kBiasIncrement;
// Load first sub-micro block of data into operational banks.
// 6 input rows feed 4 output rows of a 3-row filter.
int8x16_t left_bank_0_reg = vld1q_s8(next_input_data);
int8x16_t left_bank_1_reg =
vld1q_s8(next_input_data + workspace_height_stride);
int8x16_t left_bank_2_reg =
vld1q_s8(next_input_data + 2 * workspace_height_stride);
int8x16_t left_bank_3_reg =
vld1q_s8(next_input_data + 3 * workspace_height_stride);
int8x16_t left_bank_4_reg =
vld1q_s8(next_input_data + 4 * workspace_height_stride);
int8x16_t left_bank_5_reg =
vld1q_s8(next_input_data + 5 * workspace_height_stride);
// acc0..acc3 accumulate output rows 0..3 of the current column.
int32x4_t acc0;
int32x4_t acc1;
int32x4_t acc2;
int32x4_t acc3;
acc0 = adjusted_bias_data;
acc1 = adjusted_bias_data;
acc2 = adjusted_bias_data;
acc3 = adjusted_bias_data;
// Software pipelining: pre-accumulate part of the first column's dot
// products here; the remaining terms are added inside the loop.
acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
// Each width micro block produces 4 output columns: two with the
// unshifted filters and two with the byte-shifted filters, using
// left/right bank double buffering to walk the input window.
for (int i_width = 0; i_width < output_width_micro_repeats;
++i_width) {
next_input_data += width_micro_stride;
// Iterate over input width shifts within 4x4 blocks.
// ---- Output column 0: unshifted filters on the left banks. ----
{
acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
// Fixed-point multiplication.
// output_shift is negated to obtain the rounding right-shift
// amount — presumably stored as a left-shift exponent; confirm
// against the params setup code.
acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc0, -output_shift);
acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc1, -output_shift);
acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc2, -output_shift);
acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc3, -output_shift);
// Add the output offset.
int16x8_t acc_s16_0_1 =
vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
int16x8_t acc_s16_2_3 =
vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
// Apply the activation function.
uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
vqmovun_s16(acc_s16_2_3));
acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
// Store 4 bytes (one depth-4 group) into each of the 4 output rows.
vst1q_lane_8x4(output_data, acc_u8_all, 0);
vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
2);
vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3);
output_data += depth;
}
// Load next sub-micro block of data.
int8x16_t right_bank_0_reg;
int8x16_t right_bank_1_reg;
int8x16_t right_bank_2_reg;
int8x16_t right_bank_3_reg;
int8x16_t right_bank_4_reg;
int8x16_t right_bank_5_reg;
// Loading of next block always valid.
right_bank_0_reg = vld1q_s8(next_input_data);
right_bank_1_reg =
vld1q_s8(next_input_data + workspace_height_stride);
right_bank_2_reg =
vld1q_s8(next_input_data + 2 * workspace_height_stride);
right_bank_3_reg =
vld1q_s8(next_input_data + 3 * workspace_height_stride);
right_bank_4_reg =
vld1q_s8(next_input_data + 4 * workspace_height_stride);
right_bank_5_reg =
vld1q_s8(next_input_data + 5 * workspace_height_stride);
// ---- Output column 1: shifted filters on the same left banks. ----
{
acc0 = adjusted_bias_data;
acc1 = adjusted_bias_data;
acc2 = adjusted_bias_data;
acc3 = adjusted_bias_data;
acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
// Fixed-point multiplication.
acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc0, -output_shift);
acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc1, -output_shift);
acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc2, -output_shift);
acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc3, -output_shift);
// Add the output offset.
int16x8_t acc_s16_0_1 =
vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
int16x8_t acc_s16_2_3 =
vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
// Apply the activation function.
uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
vqmovun_s16(acc_s16_2_3));
acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
vst1q_lane_8x4(output_data, acc_u8_all, 0);
vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
2);
vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3);
// Advance the input window two positions: swap 16-bit halves
// within each 32-bit lane of the left banks, then interleave with
// the right banks. After this, column 2 can reuse the unshifted
// filters on the merged banks.
left_bank_0_reg = vrev32q_u16(left_bank_0_reg);
left_bank_1_reg = vrev32q_u16(left_bank_1_reg);
left_bank_2_reg = vrev32q_u16(left_bank_2_reg);
left_bank_3_reg = vrev32q_u16(left_bank_3_reg);
left_bank_4_reg = vrev32q_u16(left_bank_4_reg);
left_bank_5_reg = vrev32q_u16(left_bank_5_reg);
vtrn1_s8x2_in_place(&left_bank_0_reg, &right_bank_0_reg);
vtrn1_s8x2_in_place(&left_bank_1_reg, &right_bank_1_reg);
vtrn1_s8x2_in_place(&left_bank_2_reg, &right_bank_2_reg);
vtrn1_s8x2_in_place(&left_bank_3_reg, &right_bank_3_reg);
vtrn1_s8x2_in_place(&left_bank_4_reg, &right_bank_4_reg);
vtrn1_s8x2_in_place(&left_bank_5_reg, &right_bank_5_reg);
output_data += depth;
}
// ---- Output column 2: unshifted filters on the merged banks. ----
{
acc0 = adjusted_bias_data;
acc1 = adjusted_bias_data;
acc2 = adjusted_bias_data;
acc3 = adjusted_bias_data;
acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
// Fixed-point multiplication.
acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc0, -output_shift);
acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc1, -output_shift);
acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc2, -output_shift);
acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc3, -output_shift);
// Add the output offset.
int16x8_t acc_s16_0_1 =
vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
int16x8_t acc_s16_2_3 =
vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
// Apply the activation function.
uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
vqmovun_s16(acc_s16_2_3));
acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
vst1q_lane_8x4(output_data, acc_u8_all, 0);
vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
2);
vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3);
output_data += depth;
}
// ---- Output column 3: shifted filters, then promote right banks
// to left and pre-seed the next iteration's column-0 accumulators
// (mirrors the pre-loop seeding above). ----
{
acc0 = adjusted_bias_data;
acc1 = adjusted_bias_data;
acc2 = adjusted_bias_data;
acc3 = adjusted_bias_data;
acc0 = vdotq_s32(acc0, filter_reg_0_a_shifted, left_bank_0_reg);
acc0 = vdotq_s32(acc0, filter_reg_1_a_shifted, left_bank_1_reg);
acc0 = vdotq_s32(acc0, filter_reg_2_a_shifted, left_bank_2_reg);
acc1 = vdotq_s32(acc1, filter_reg_0_a_shifted, left_bank_1_reg);
acc1 = vdotq_s32(acc1, filter_reg_1_a_shifted, left_bank_2_reg);
acc1 = vdotq_s32(acc1, filter_reg_2_a_shifted, left_bank_3_reg);
acc2 = vdotq_s32(acc2, filter_reg_0_a_shifted, left_bank_2_reg);
acc2 = vdotq_s32(acc2, filter_reg_1_a_shifted, left_bank_3_reg);
acc2 = vdotq_s32(acc2, filter_reg_2_a_shifted, left_bank_4_reg);
acc3 = vdotq_s32(acc3, filter_reg_0_a_shifted, left_bank_3_reg);
acc3 = vdotq_s32(acc3, filter_reg_1_a_shifted, left_bank_4_reg);
acc3 = vdotq_s32(acc3, filter_reg_2_a_shifted, left_bank_5_reg);
// Fixed-point multiplication.
acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc0, -output_shift);
acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc1, -output_shift);
acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc2, -output_shift);
acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc3, -output_shift);
// Add the output offset.
int16x8_t acc_s16_0_1 =
vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
int16x8_t acc_s16_2_3 =
vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
// Apply the activation function.
uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
vqmovun_s16(acc_s16_2_3));
acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
vst1q_lane_8x4(output_data, acc_u8_all, 0);
vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
2);
vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3);
// Double-buffer swap: the right banks become next iteration's left.
left_bank_0_reg = right_bank_0_reg;
left_bank_1_reg = right_bank_1_reg;
left_bank_2_reg = right_bank_2_reg;
left_bank_3_reg = right_bank_3_reg;
left_bank_4_reg = right_bank_4_reg;
left_bank_5_reg = right_bank_5_reg;
output_data += depth;
// Pre-seed next column's accumulators (software pipelining).
acc0 = adjusted_bias_data;
acc1 = adjusted_bias_data;
acc2 = adjusted_bias_data;
acc3 = adjusted_bias_data;
acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
}
}
// Residual columns (1..3) after the full width micro blocks.
if (residual_width > 0) {
next_input_data += width_micro_stride;
const int output_width = residual_width;
// Load next sub-micro block of data.
int8x16_t right_bank_0_reg;
int8x16_t right_bank_1_reg;
int8x16_t right_bank_2_reg;
int8x16_t right_bank_3_reg;
int8x16_t right_bank_4_reg;
int8x16_t right_bank_5_reg;
// Logic: (output_width - 1) * stride_val < 2.
const bool no_right_block = output_width < 3;
if (no_right_block) {
// Only needed for sanitizer checks.
right_bank_0_reg = vdupq_n_s8(0);
right_bank_1_reg = vdupq_n_s8(0);
right_bank_2_reg = vdupq_n_s8(0);
right_bank_3_reg = vdupq_n_s8(0);
right_bank_4_reg = vdupq_n_s8(0);
right_bank_5_reg = vdupq_n_s8(0);
} else {
right_bank_0_reg = vld1q_s8(next_input_data);
right_bank_1_reg =
vld1q_s8(next_input_data + workspace_height_stride);
right_bank_2_reg =
vld1q_s8(next_input_data + 2 * workspace_height_stride);
right_bank_3_reg =
vld1q_s8(next_input_data + 3 * workspace_height_stride);
right_bank_4_reg =
vld1q_s8(next_input_data + 4 * workspace_height_stride);
right_bank_5_reg =
vld1q_s8(next_input_data + 5 * workspace_height_stride);
}
// Iterate over input width shifts within 4x4 blocks.
// One column at a time; biregister_rotate_8 advances the window one
// byte per column, so only unshifted filters are needed here.
for (int x = 0; x < output_width; ++x) {
acc0 = vdotq_s32(acc0, filter_reg_0_a, left_bank_0_reg);
acc0 = vdotq_s32(acc0, filter_reg_1_a, left_bank_1_reg);
acc1 = vdotq_s32(acc1, filter_reg_0_a, left_bank_1_reg);
acc1 = vdotq_s32(acc1, filter_reg_2_a, left_bank_3_reg);
acc2 = vdotq_s32(acc2, filter_reg_1_a, left_bank_3_reg);
acc2 = vdotq_s32(acc2, filter_reg_2_a, left_bank_4_reg);
acc3 = vdotq_s32(acc3, filter_reg_1_a, left_bank_4_reg);
acc3 = vdotq_s32(acc3, filter_reg_2_a, left_bank_5_reg);
// Fixed-point multiplication.
acc0 = vqrdmulhq_n_s32(acc0, output_multiplier);
acc0 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc0, -output_shift);
acc1 = vqrdmulhq_n_s32(acc1, output_multiplier);
acc1 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc1, -output_shift);
acc2 = vqrdmulhq_n_s32(acc2, output_multiplier);
acc2 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc2, -output_shift);
acc3 = vqrdmulhq_n_s32(acc3, output_multiplier);
acc3 = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc3, -output_shift);
// Add the output offset.
int16x8_t acc_s16_0_1 =
vcombine_s16(vqmovn_s32(acc0), vqmovn_s32(acc1));
int16x8_t acc_s16_2_3 =
vcombine_s16(vqmovn_s32(acc2), vqmovn_s32(acc3));
acc_s16_0_1 = vqaddq_s16(acc_s16_0_1, output_offset_vec);
acc_s16_2_3 = vqaddq_s16(acc_s16_2_3, output_offset_vec);
// Apply the activation function.
uint8x16_t acc_u8_all = vcombine_u8(vqmovun_s16(acc_s16_0_1),
vqmovun_s16(acc_s16_2_3));
acc_u8_all = vmaxq_u8(acc_u8_all, output_activation_min_vec);
acc_u8_all = vminq_u8(acc_u8_all, output_activation_max_vec);
vst1q_lane_8x4(output_data, acc_u8_all, 0);
vst1q_lane_8x4(output_data + output_height_stride, acc_u8_all, 1);
vst1q_lane_8x4(output_data + 2 * output_height_stride, acc_u8_all,
2);
vst1q_lane_8x4(output_data + 3 * output_height_stride, acc_u8_all,
3);
biregister_rotate_8(&left_bank_0_reg, &right_bank_0_reg);
biregister_rotate_8(&left_bank_1_reg, &right_bank_1_reg);
biregister_rotate_8(&left_bank_2_reg, &right_bank_2_reg);
biregister_rotate_8(&left_bank_3_reg, &right_bank_3_reg);
biregister_rotate_8(&left_bank_4_reg, &right_bank_4_reg);
biregister_rotate_8(&left_bank_5_reg, &right_bank_5_reg);
output_data += depth;
// Pre-seed the next column's accumulators.
acc0 = adjusted_bias_data;
acc1 = adjusted_bias_data;
acc2 = adjusted_bias_data;
acc3 = adjusted_bias_data;
acc0 = vdotq_s32(acc0, filter_reg_2_a, left_bank_2_reg);
acc1 = vdotq_s32(acc1, filter_reg_1_a, left_bank_2_reg);
acc2 = vdotq_s32(acc2, filter_reg_0_a, left_bank_2_reg);
acc3 = vdotq_s32(acc3, filter_reg_0_a, left_bank_3_reg);
}
}
input_data_base += 4 * workspace_height_stride;
output_data_base += 4 * output_height_stride;
// Move to next sub-block: advance to second set of filters, to new
// bias.
filter_reg_0_a = filter_reg_0_b;
filter_reg_1_a = filter_reg_1_b;
filter_reg_2_a = filter_reg_2_b;
filter_reg_0_a_shifted = vshlq_n_u32(filter_reg_0_a, 8);
filter_reg_1_a_shifted = vshlq_n_u32(filter_reg_1_a, 8);
filter_reg_2_a_shifted = vshlq_n_u32(filter_reg_2_a, 8);
}
} else {
// General path: block_height < 4. Process one output row at a time,
// producing all 8 depth channels per position using the "_a" and "_b"
// filter/bank register pairs together.
const int8* input_data_base = input_data_depthwise;
uint8* output_data_base = output_data_depthwise;
const int32x4_t adjusted_bias_data_a = vld1q_s32(bias_data);
bias_data += kBiasIncrement;
const int32x4_t adjusted_bias_data_b = vld1q_s32(bias_data);
bias_data += kBiasIncrement;
for (int k_height = 0; k_height < block_height; ++k_height) {
const int8* next_input_data = input_data_base;
uint8* output_data = output_data_base;
// Load first sub-micro block of data into operational banks.
// 3 input rows feed 1 output row of the 3-row filter; the "+16"
// loads are the second depth-4 sub-block.
int8x16_t left_bank_0_reg_a = vld1q_s8(next_input_data);
int8x16_t left_bank_1_reg_a =
vld1q_s8(next_input_data + workspace_height_stride);
int8x16_t left_bank_2_reg_a =
vld1q_s8(next_input_data + 2 * workspace_height_stride);
int8x16_t left_bank_0_reg_b = vld1q_s8(next_input_data + 16);
int8x16_t left_bank_1_reg_b =
vld1q_s8(next_input_data + workspace_height_stride + 16);
int8x16_t left_bank_2_reg_b =
vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
for (int i_width = 0; i_width < output_width_overall_micro_repeats;
++i_width) {
next_input_data += width_micro_stride;
// The last micro block may be partial.
const int output_width =
i_width == output_width_micro_repeats ? residual_width : 4;
int8x16_t right_bank_0_reg_a;
int8x16_t right_bank_1_reg_a;
int8x16_t right_bank_2_reg_a;
int8x16_t right_bank_0_reg_b;
int8x16_t right_bank_1_reg_b;
int8x16_t right_bank_2_reg_b;
// Logic: (output_width - 1) * stride_val < 2.
const bool no_right_block = output_width < 3;
// Load next sub-micro block of data.
if (no_right_block) {
// Only needed for sanitizer checks.
right_bank_0_reg_a = vdupq_n_s8(0);
right_bank_1_reg_a = vdupq_n_s8(0);
right_bank_2_reg_a = vdupq_n_s8(0);
right_bank_0_reg_b = vdupq_n_s8(0);
right_bank_1_reg_b = vdupq_n_s8(0);
right_bank_2_reg_b = vdupq_n_s8(0);
} else {
right_bank_0_reg_a = vld1q_s8(next_input_data);
right_bank_1_reg_a =
vld1q_s8(next_input_data + workspace_height_stride);
right_bank_2_reg_a =
vld1q_s8(next_input_data + 2 * workspace_height_stride);
right_bank_0_reg_b = vld1q_s8(next_input_data + 16);
right_bank_1_reg_b =
vld1q_s8(next_input_data + workspace_height_stride + 16);
right_bank_2_reg_b =
vld1q_s8(next_input_data + 2 * workspace_height_stride + 16);
}
// Iterate over input width shifts within 4x4 blocks.
// One output position per step; the window advances one byte per
// step via biregister_rotate_8.
for (int x = 0; x < output_width; ++x) {
int32x4_t acc_a = adjusted_bias_data_a;
int32x4_t acc_b = adjusted_bias_data_b;
acc_a = vdotq_s32(acc_a, filter_reg_0_a, left_bank_0_reg_a);
acc_a = vdotq_s32(acc_a, filter_reg_1_a, left_bank_1_reg_a);
acc_a = vdotq_s32(acc_a, filter_reg_2_a, left_bank_2_reg_a);
acc_b = vdotq_s32(acc_b, filter_reg_0_b, left_bank_0_reg_b);
acc_b = vdotq_s32(acc_b, filter_reg_1_b, left_bank_1_reg_b);
acc_b = vdotq_s32(acc_b, filter_reg_2_b, left_bank_2_reg_b);
// Fixed-point multiplication.
acc_a = vqrdmulhq_n_s32(acc_a, output_multiplier);
acc_b = vqrdmulhq_n_s32(acc_b, output_multiplier);
acc_a = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc_a, -output_shift);
acc_b = DivideByPOT<DepthwiseConvOutputRounding::kUpward>::Run(
acc_b, -output_shift);
// Add the output offset.
int16x8_t acc_s16_0_0 =
vcombine_s16(vqmovn_s32(acc_a), vqmovn_s32(acc_b));
acc_s16_0_0 = vqaddq_s16(acc_s16_0_0, output_offset_vec);
// Apply the activation function.
uint8x8_t acc_u8_0_0 = vqmovun_s16(acc_s16_0_0);
acc_u8_0_0 =
vmax_u8(acc_u8_0_0, vget_low_u8(output_activation_min_vec));
acc_u8_0_0 =
vmin_u8(acc_u8_0_0, vget_low_u8(output_activation_max_vec));
// Store all 8 output channels for this position.
vst1_u8(output_data, acc_u8_0_0);
biregister_rotate_8(&left_bank_0_reg_a, &right_bank_0_reg_a);
biregister_rotate_8(&left_bank_1_reg_a, &right_bank_1_reg_a);
biregister_rotate_8(&left_bank_2_reg_a, &right_bank_2_reg_a);
biregister_rotate_8(&left_bank_0_reg_b, &right_bank_0_reg_b);
biregister_rotate_8(&left_bank_1_reg_b, &right_bank_1_reg_b);
biregister_rotate_8(&left_bank_2_reg_b, &right_bank_2_reg_b);
output_data += depth;
}
}
input_data_base += workspace_height_stride;
output_data_base += output_height_stride;
}
}
// Advance to the next 8-channel depth micro block.
input_data_depthwise += depth_micro_stride;
output_data_depthwise += 8;
}
} // NOLINT(readability/fn_size) Manually unrolled.