in tensorflow/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h [112:961]
// Runs an int8, per-channel-quantized 3x3 depthwise convolution over an
// |output_window_height| x |output_window_width| window of outputs for one
// group of 8 channels, using hand-scheduled AArch64 NEON inline assembly.
//
// Horizontally and vertically adjacent outputs read 3x3 input windows that
// are one input pixel apart, so a 2x2 output tile consumes a 4x4 input tile.
//
// Parameters (as used by the code below):
//   output_multiplier_ptr: 8 int32 values, one per channel, applied to the
//       accumulators with sqrdmulh.
//   output_shift_ptr: 8 int32 values, one per channel, applied with sqrshl.
//   input_ptr: first input pixel of the window. Horizontally adjacent pixels
//       are |input_depth| bytes apart and rows are |input_row_size| bytes
//       apart; 8 bytes are read per pixel.
//   filter_ptr: nine 8-byte filter taps, each read |params_ptr->output_depth|
//       bytes after the previous one.
//   bias_ptr: 8 int32 biases used to initialize the accumulators.
//   output_ptr: first output pixel. 8 bytes are written per pixel; pixels are
//       |params_ptr->output_depth| bytes apart and rows are
//       |params_ptr->output_row_size| bytes apart.
//   params_ptr: supplies input offset, output offset, output activation
//       min/max, output depth and output row size. |filter_offset| must be 0.
static inline void Run(const int32* output_multiplier_ptr,
const int32* output_shift_ptr, const int8* input_ptr,
const int8* filter_ptr, const int32* bias_ptr,
int8* output_ptr, int64_t input_depth,
int64_t input_row_size, int32 output_window_height,
int32 output_window_width,
const DepthwiseConvParams* params_ptr) {
// Byte distance the input pointer advances after two output columns.
const int64_t input_width_increment = 2 * input_depth;
// Byte distance the input pointer advances after two output rows.
const int64_t input_height_increment = 2 * input_row_size;
// Byte distance the output pointer advances after two output rows.
const int64_t output_height_increment = 2 * params_ptr->output_row_size;
// The assembly only sign-extends the filter values and never adds a filter
// offset, so a non-zero offset is not supported.
TFLITE_DCHECK_EQ(params_ptr->filter_offset, 0);
// Numeric local assembler labels. They are referenced below with an "f"
// (forward) or "b" (backward) suffix.
#define DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "1"
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "2"
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "3"
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "4"
#define DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "5"
#define DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "6"
#define DEPTHWISECONV_LABEL_HEIGHT_1 "7"
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "8"
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "9"
#define DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "10"
#define DEPTHWISECONV_LABEL_HEIGHT_1_END "11"
asm volatile(
// Performs depthwise convolutions for a window specified by
// |output_window_height| and |output_window_width|. The inner-most loop
// processes 2x2 outputs, and any leftovers at the end.
//
// Algorithm works as follows:
//
// 1. Load filters of 8 depth (8x3x3). Registers v0--v8 hold filter
// values.
// 2. For 2 output heights at a time:
// i. For 2 output widths at a time, load inputs for a 2x1 (2
// height, 1 width) output window (4x3 input window).
// Registers v9--v20 hold input values. Mul-add with
// accumulators v21--v24. Then run activation, downquantize
// and store. Repeat for the next 2x1 output window,
// leveraging overlapping inputs.
// ii. Handle single leftover width if exists.
// 3. Handle single leftover height if exists.
// i. For 2 output widths at a time, load inputs for a 1x2 (1
// height, 2 width) output window (3x4 input window).
// Registers v9--v20 hold input values. Mul-add with
// accumulators v21--v24. Then run activation, downquantize
// and store. Repeat for the next 1x2 output window,
// leveraging overlapping inputs.
// ii. Handle single leftover width if exists.
//
// Loads are placed as soon as the register is no longer needed and
// interleaved with arithmetic operations to take advantage of
// dual-issue pipelines. We also add input offsets as far from the loads
// as possible to give loads enough cycles to fetch data from memory.
//
// This logic is copied and modified from the non-per-channel quantized
// part.
// However, the challenges are how to plan the registers allocation
// wisely: 25 NEON registers are already reserved for inputs, filters,
// and outputs; also, 2 registers (v30, v31) are used for output
// min/max, while another 2 registers (v26, v29) are used for input
// offset & output offset, so that's total 25 + 2 + 2 = 29 already.
// But we need 4 more registers to hold the output multiplier & output
// right shift (we only have 3).
//
// So here's the plan:
// v27 (which held duplicated output multiplier previously) will hold
// the first 4 values of the output_multiplier_ptr (we have 8 in total);
// v30 (which held duplicated output right shift previously) will hold
// the first 4 values of the output_shift_ptr (we have 8 in total);
// lastly, v28 will hold the last 4 values of output_multiplier and v31
// (previously occupied by activations) will hold the last 4 values of
// output_shift. Then v25 will be used for output activation min while
// output activation max will just reuse other registers, like v24.
//
// Resulting NEON register map in this function:
//   v0--v8   filter taps, sign-extended to 16 bits
//   v9--v20  input values, widened to 16 bits with the input offset added
//   v21--v24 32-bit accumulators (v24 is also the temporary that holds
//            the duplicated output activation max just before clamping)
//   v25      output activation min
//   v26      input offset
//   v27/v28  output multipliers for channels 0--3 / 4--7
//   v29      output offset
//   v30/v31  output shifts for channels 0--3 / 4--7
//
// General-purpose register map:
//   w0  output activation max      w4  output activation min
//   x1  output row size            w5  remaining output width
//   w2  output offset              x6  output pointer, top row
//   x3  output depth               x7  output pointer, bottom row
//   w9  input offset               x10 bias_ptr + 16 (biases 4--7)
//   x11 input pointer for the current column position
//   x12--x15 input pointers for the four input rows
//
// Set "constant" registers. These registers may be replaced with temp
// values from time to time when there are not enough NEON registers.
// We use x9--x15 general purpose registers as they are caller-saved
// temporary registers (see
// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf). // NOLINT
// x0--x7 are used as well; all of them are listed in the clobbers below.
"ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
"ldr x3, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
// The flags set here are consumed by the "blt" after the filter loads;
// none of the instructions in between modify the condition flags.
"cmp %w[output_window_height], #2\n"
"dup v26.8h, w9\n"
"ldr w2, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
"dup v29.8h, w2\n"
"ldr w4, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
"ldr w0, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
// x10 points at the second group of four int32 biases.
"add x10, %[bias_ptr], #16\n"
"ldr x1, [%[params_ptr], #" STR(OFFSET_OUTPUT_ROW_SIZE) "]\n"
"dup v25.16b, w4\n"
// Deal with output multiplier & output shift.
"ld1 {v27.4s, v28.4s}, [%[output_multiplier_ptr]]\n"
"ld1 {v30.4s, v31.4s}, [%[output_shift_ptr]]\n"
// Load the nine filter taps (stepping by the output depth in x3) and
// sign-extend them to 16 bits. No filter offset is added.
"ld1 {v0.8b}, [%[filter_ptr]], x3\n"
"ld1 {v1.8b}, [%[filter_ptr]], x3\n"
"sshll v0.8h, v0.8b, #0\n"
"ld1 {v2.8b}, [%[filter_ptr]], x3\n"
"sshll v1.8h, v1.8b, #0\n"
"ld1 {v3.8b}, [%[filter_ptr]], x3\n"
"sshll v2.8h, v2.8b, #0\n"
"ld1 {v4.8b}, [%[filter_ptr]], x3\n"
"sshll v3.8h, v3.8b, #0\n"
"ld1 {v5.8b}, [%[filter_ptr]], x3\n"
"sshll v4.8h, v4.8b, #0\n"
"ld1 {v6.8b}, [%[filter_ptr]], x3\n"
"sshll v5.8h, v5.8b, #0\n"
"ld1 {v7.8b}, [%[filter_ptr]], x3\n"
"sshll v6.8h, v6.8b, #0\n"
"ld1 {v8.8b}, [%[filter_ptr]], x3\n"
"sshll v7.8h, v7.8b, #0\n"
"sshll v8.8h, v8.8b, #0\n"
// Fewer than two output rows requested: skip the two-row loop.
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP "f\n"
//"loop_%=:\n"
DEPTHWISECONV_LABEL_HEIGHT_2_LOOP ":\n"
// This loop processes 2x2 outputs. To avoid register exhaustion,
// inputs for the left 2 outputs are loaded first, then the right
// two outputs.
// x12--x15 walk the four input rows; each row contributes three
// columns (v9--v11, v12--v14, v15--v17, v18--v20).
"mov x11, %[input_ptr]\n"
"mov x12, x11\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x13, x11, %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"add x14, x13, %[input_row_size]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x14, %[input_row_size]\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"mov w5, %w[output_window_width]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x1\n"
"ld1 {v15.8b}, [x14], %[input_depth]\n"
// The height 2 / width 2 loop loads an extra 2x1 outputs (2 height,
// 1 width) in anticipation for the next iteration. Make sure
// |output_window_width| is large enough to handle the additional
// loads, otherwise jump to the appropriate label to handle
// smaller widths.
// The flags from this compare are consumed by the "beq" below, after
// the remaining loads and input-offset additions.
"cmp w5, #2\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"ld1 {v16.8b}, [x14], %[input_depth]\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"ld1 {v18.8b}, [x15], %[input_depth]\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"ld1 {v19.8b}, [x15], %[input_depth]\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"saddw v14.8h, v26.8h, v14.8b\n"
// Initialize the accumulators with the biases: v21/v22 for the top
// output, v23/v24 for the bottom output.
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v22.4s}, [x10]\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER "f\n"
"cmp w5, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
//"loop_%=:\n"
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP ":\n"
// Mul-add left outputs.
// While accumulating, the fourth input column of each row is loaded
// into v9/v12/v15/v18 as soon as their current contents are consumed.
"smlal v21.4s, v0.4h, v9.4h\n"
"subs w5, w5, #2\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
// Continue looping only while at least 3 columns remain, because the
// loop body pre-loads inputs for the following iteration. These flags
// are consumed by the "bge" at the bottom of the loop.
"cmp w5, #3\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"ld1 {v9.8b}, [x12]\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"ld1 {v12.8b}, [x13]\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"ld1 {v15.8b}, [x14]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"ld1 {v18.8b}, [x15]\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
// Requantize: per-channel multiply, per-channel shift, narrow to
// 16 bits, add the output offset, narrow to 8 bits, then clamp to
// [activation min, activation max].
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
// v24 temporarily holds the activation max; it is reloaded with the
// bias right after the clamp.
"dup v24.16b, w0\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v9.8h, v26.8h, v9.8b\n"
// Low half of v21 is the top-row output, high half the bottom-row
// output.
"st1 {v21.8b}, [x6], x3\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
// Mul-add right outputs.
// The input pointers advance by two columns here and the inputs for
// the next iteration are loaded into registers as they become free.
"smlal v21.4s, v0.4h, v10.4h\n"
"add x11, x11, %[input_width_increment]\n"
"smlal2 v22.4s, v0.8h, v10.8h\n"
"mov x12, x11\n"
"smlal v23.4s, v0.4h, v13.4h\n"
"add x13, x11, %[input_row_size]\n"
"smlal2 v24.4s, v0.8h, v13.8h\n"
"add x14, x13, %[input_row_size]\n"
"smlal v21.4s, v1.4h, v11.4h\n"
"add x15, x14, %[input_row_size]\n"
"smlal2 v22.4s, v1.8h, v11.8h\n"
"smlal v23.4s, v1.4h, v14.4h\n"
"smlal2 v24.4s, v1.8h, v14.8h\n"
"smlal v21.4s, v2.4h, v9.4h\n"
"smlal2 v22.4s, v2.8h, v9.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"smlal v21.4s, v5.4h, v12.4h\n"
"smlal2 v22.4s, v5.8h, v12.8h\n"
"ld1 {v12.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v5.4h, v15.4h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal2 v24.4s, v5.8h, v15.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v21.4s, v6.4h, v16.4h\n"
"smlal2 v22.4s, v6.8h, v16.8h\n"
"smlal v23.4s, v6.4h, v19.4h\n"
"smlal2 v24.4s, v6.8h, v19.8h\n"
"smlal v21.4s, v7.4h, v17.4h\n"
"smlal2 v22.4s, v7.8h, v17.8h\n"
"smlal v23.4s, v7.4h, v20.4h\n"
"smlal2 v24.4s, v7.8h, v20.8h\n"
"smlal v21.4s, v8.4h, v15.4h\n"
"smlal2 v22.4s, v8.8h, v15.8h\n"
"ld1 {v15.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v8.4h, v18.4h\n"
"ld1 {v16.8b}, [x14], %[input_depth]\n"
"smlal2 v24.4s, v8.8h, v18.8h\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
// Requantize the right outputs, interleaved with the remaining loads
// for the next iteration.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"ld1 {v18.8b}, [x15], %[input_depth]\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"ld1 {v19.8b}, [x15], %[input_depth]\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"ld1 {v20.8b}, [x15], %[input_depth]\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x3\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"saddw v20.8h, v26.8h, v20.8b\n"
// Uses the flags from "cmp w5, #3" near the top of the loop body.
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP "b\n"
// At this point, there will be one of 2 width or 1 width leftover,
// not both.
"cmp w5, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER "f\n"
// Handle last 2 columns if exists.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER ":\n"
// Mul-add left outputs.
// Same computation as the loop body, but without advancing the input
// pointers or pre-loading inputs for a further iteration.
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"ld1 {v9.8b}, [x12]\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"ld1 {v12.8b}, [x13]\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"ld1 {v15.8b}, [x14]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"ld1 {v18.8b}, [x15]\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
// Requantize, clamp and store the left outputs.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"st1 {v23.8b}, [x7], x3\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
// Mul-add right outputs.
"smlal v21.4s, v0.4h, v10.4h\n"
"smlal2 v22.4s, v0.8h, v10.8h\n"
"smlal v23.4s, v0.4h, v13.4h\n"
"smlal2 v24.4s, v0.8h, v13.8h\n"
"smlal v21.4s, v1.4h, v11.4h\n"
"smlal2 v22.4s, v1.8h, v11.8h\n"
"smlal v23.4s, v1.4h, v14.4h\n"
"smlal2 v24.4s, v1.8h, v14.8h\n"
"smlal v21.4s, v2.4h, v9.4h\n"
"smlal2 v22.4s, v2.8h, v9.8h\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v16.4h\n"
"smlal2 v24.4s, v3.8h, v16.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v17.4h\n"
"smlal2 v24.4s, v4.8h, v17.8h\n"
"smlal v21.4s, v5.4h, v12.4h\n"
"smlal2 v22.4s, v5.8h, v12.8h\n"
"smlal v23.4s, v5.4h, v15.4h\n"
"smlal2 v24.4s, v5.8h, v15.8h\n"
"smlal v21.4s, v6.4h, v16.4h\n"
"smlal2 v22.4s, v6.8h, v16.8h\n"
"smlal v23.4s, v6.4h, v19.4h\n"
"smlal2 v24.4s, v6.8h, v19.8h\n"
"smlal v21.4s, v7.4h, v17.4h\n"
"smlal2 v22.4s, v7.8h, v17.8h\n"
"smlal v23.4s, v7.4h, v20.4h\n"
"smlal2 v24.4s, v7.8h, v20.8h\n"
"smlal v21.4s, v8.4h, v15.4h\n"
"smlal2 v22.4s, v8.8h, v15.8h\n"
"smlal v23.4s, v8.4h, v18.4h\n"
"smlal2 v24.4s, v8.8h, v18.8h\n"
// Requantize, clamp and store the right outputs.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP "f\n"
// Handle a single leftover column (one top and one bottom output).
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v23.4s, v0.4h, v12.4h\n"
"smlal2 v24.4s, v0.8h, v12.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v13.4h\n"
"smlal2 v24.4s, v1.8h, v13.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v14.4h\n"
"smlal2 v24.4s, v2.8h, v14.8h\n"
"smlal v21.4s, v3.4h, v12.4h\n"
"smlal2 v22.4s, v3.8h, v12.8h\n"
"smlal v23.4s, v3.4h, v15.4h\n"
"smlal2 v24.4s, v3.8h, v15.8h\n"
"smlal v21.4s, v4.4h, v13.4h\n"
"smlal2 v22.4s, v4.8h, v13.8h\n"
"smlal v23.4s, v4.4h, v16.4h\n"
"smlal2 v24.4s, v4.8h, v16.8h\n"
"smlal v21.4s, v5.4h, v14.4h\n"
"smlal2 v22.4s, v5.8h, v14.8h\n"
"smlal v23.4s, v5.4h, v17.4h\n"
"smlal2 v24.4s, v5.8h, v17.8h\n"
"smlal v21.4s, v6.4h, v15.4h\n"
"smlal2 v22.4s, v6.8h, v15.8h\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v16.4h\n"
"smlal2 v22.4s, v7.8h, v16.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v17.4h\n"
"smlal2 v22.4s, v8.8h, v17.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"st1 {v21.8b}, [x6], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [x7], x3\n"
// Falls through to the end-of-row-pair bookkeeping below.
DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP ":\n"
// Two output rows are done: advance the input and output pointers by
// two rows and repeat while at least two rows remain.
"subs %w[output_window_height], %w[output_window_height], #2\n"
"add %[input_ptr], %[input_ptr], %[input_height_increment]\n"
"cmp %w[output_window_height], #2\n"
"add %[output_ptr], %[output_ptr], %[output_height_increment]\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_2_LOOP "b\n"
DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP ":\n"
"cmp %w[output_window_height], #1\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// Single leftover output row. Only three input rows are read here
// (through x12, x13 and x14); v9--v11, v13--v15 and v17--v19 hold the
// first three columns and v12/v16/v20 the fourth column of each row.
// x15 and x7 are still computed below but are not referenced again in
// this path.
DEPTHWISECONV_LABEL_HEIGHT_1 ":\n"
"mov x12, %[input_ptr]\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"add x13, %[input_ptr], %[input_row_size]\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"add x14, x13, %[input_row_size]\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x15, x14, %[input_row_size]\n"
"mov w5, %w[output_window_width]\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"mov x6, %[output_ptr]\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"add x7, %[output_ptr], x1\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
// The height 1 / width 2 loop loads an extra 1x1 output in anticipation
// for the next iteration. Make sure |output_window_width| is large
// enough to handle the additional load, otherwise jump to the
// appropriate label to handle smaller widths.
// The flags from this compare are consumed by the "beq" below.
"cmp w5, #2\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
// Here v21/v22 accumulate the left output and v23/v24 the right
// output.
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"ld1 {v22.4s}, [x10]\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v9.8h, v26.8h, v9.8b\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER "f\n"
"cmp w5, #1\n"
"beq " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
//"loop_%=:\n"
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP ":\n"
// Load inputs for 3x4 input window which corresponds to a 1x2 output
// window.
// The fourth column (v12, v16, v20) is loaded first; the first three
// columns for the next iteration are loaded as registers free up.
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v16.8b}, [x13]\n"
"smlal v23.4s, v0.4h, v10.4h\n"
"ld1 {v20.8b}, [x14]\n"
"smlal2 v24.4s, v0.8h, v10.8h\n"
"subs w5, w5, #2\n"
"smlal v21.4s, v1.4h, v10.4h\n"
// Flags from this compare are consumed by the "bge" at the bottom of
// the loop.
"cmp w5, #3\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
// In this path |input_ptr| itself is advanced by two columns.
"add %[input_ptr], %[input_ptr], %[input_width_increment]\n"
"smlal v23.4s, v1.4h, v11.4h\n"
"mov x12, %[input_ptr]\n"
"smlal2 v24.4s, v1.8h, v11.8h\n"
"ld1 {v9.8b}, [x12], %[input_depth]\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"ld1 {v10.8b}, [x12], %[input_depth]\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"ld1 {v11.8b}, [x12], %[input_depth]\n"
"add x13, %[input_ptr], %[input_row_size]\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"add x14, x13, %[input_row_size]\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"add x15, x14, %[input_row_size]\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"ld1 {v13.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v3.4h, v14.4h\n"
"smlal2 v24.4s, v3.8h, v14.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"ld1 {v14.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v4.4h, v15.4h\n"
"smlal2 v24.4s, v4.8h, v15.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"ld1 {v15.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v5.4h, v16.4h\n"
"smlal2 v24.4s, v5.8h, v16.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"ld1 {v17.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"ld1 {v18.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"ld1 {v19.8b}, [x14], %[input_depth]\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
// Requantize, clamp and store both outputs of this row.
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"ld1 {v22.4s}, [x10]\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"ld1 {v24.4s}, [x10]\n"
"saddw v9.8h, v26.8h, v9.8b\n"
// Low half of v21 is the left output, high half the right output;
// both go to the same row through |output_ptr|.
"st1 {v21.8b}, [%[output_ptr]], x3\n"
"saddw v10.8h, v26.8h, v10.8b\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [%[output_ptr]], x3\n"
"saddw v11.8h, v26.8h, v11.8b\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"saddw v13.8h, v26.8h, v13.8b\n"
"saddw v14.8h, v26.8h, v14.8b\n"
"saddw v15.8h, v26.8h, v15.8b\n"
"ld1 {v21.4s}, [%[bias_ptr]]\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"ld1 {v23.4s}, [%[bias_ptr]]\n"
"saddw v17.8h, v26.8h, v17.8b\n"
"saddw v18.8h, v26.8h, v18.8b\n"
"saddw v19.8h, v26.8h, v19.8b\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"bge " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP "b\n"
// At this point, there will be one of 2 width or 1 width leftover,
// not both.
"cmp w5, #2\n"
"blt " DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER "f\n"
// Handle last two horizontal outputs if exists.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"ld1 {v12.8b}, [x12], %[input_depth]\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"ld1 {v16.8b}, [x13], %[input_depth]\n"
"smlal v23.4s, v0.4h, v10.4h\n"
"ld1 {v20.8b}, [x14], %[input_depth]\n"
"smlal2 v24.4s, v0.8h, v10.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v23.4s, v1.4h, v11.4h\n"
"smlal2 v24.4s, v1.8h, v11.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"saddw v12.8h, v26.8h, v12.8b\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v23.4s, v2.4h, v12.4h\n"
"smlal2 v24.4s, v2.8h, v12.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v23.4s, v3.4h, v14.4h\n"
"smlal2 v24.4s, v3.8h, v14.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v23.4s, v4.4h, v15.4h\n"
"smlal2 v24.4s, v4.8h, v15.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"saddw v16.8h, v26.8h, v16.8b\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"smlal v23.4s, v5.4h, v16.4h\n"
"smlal2 v24.4s, v5.8h, v16.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"smlal v23.4s, v6.4h, v18.4h\n"
"smlal2 v24.4s, v6.8h, v18.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"smlal v23.4s, v7.4h, v19.4h\n"
"smlal2 v24.4s, v7.8h, v19.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"saddw v20.8h, v26.8h, v20.8b\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"smlal v23.4s, v8.4h, v20.4h\n"
"smlal2 v24.4s, v8.8h, v20.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrdmulh v23.4s, v23.4s, v27.4s\n"
"sqrdmulh v24.4s, v24.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqrshl v23.4s, v23.4s, v30.4s\n"
"sqrshl v24.4s, v24.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqxtn v23.4h, v23.4s\n"
"sqxtn2 v23.8h, v24.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqadd v23.8h, v23.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"sqxtn2 v21.16b, v23.8h\n"
"dup v24.16b, w0\n"
"smax v21.16b, v21.16b, v25.16b\n"
"smin v21.16b, v21.16b, v24.16b\n"
"st1 {v21.8b}, [%[output_ptr]], x3\n"
"mov v23.d[0], v21.d[1]\n"
"st1 {v23.8b}, [%[output_ptr]], x3\n"
"b " DEPTHWISECONV_LABEL_HEIGHT_1_END "f\n"
// Handle bottom right output if exists.
// Only one output is produced, so only v21/v22 are accumulated and the
// clamp and store operate on 8 bytes.
DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER ":\n"
"smlal v21.4s, v0.4h, v9.4h\n"
"smlal2 v22.4s, v0.8h, v9.8h\n"
"smlal v21.4s, v1.4h, v10.4h\n"
"smlal2 v22.4s, v1.8h, v10.8h\n"
"smlal v21.4s, v2.4h, v11.4h\n"
"smlal2 v22.4s, v2.8h, v11.8h\n"
"smlal v21.4s, v3.4h, v13.4h\n"
"smlal2 v22.4s, v3.8h, v13.8h\n"
"smlal v21.4s, v4.4h, v14.4h\n"
"smlal2 v22.4s, v4.8h, v14.8h\n"
"smlal v21.4s, v5.4h, v15.4h\n"
"smlal2 v22.4s, v5.8h, v15.8h\n"
"smlal v21.4s, v6.4h, v17.4h\n"
"smlal2 v22.4s, v6.8h, v17.8h\n"
"smlal v21.4s, v7.4h, v18.4h\n"
"smlal2 v22.4s, v7.8h, v18.8h\n"
"smlal v21.4s, v8.4h, v19.4h\n"
"smlal2 v22.4s, v8.8h, v19.8h\n"
"sqrdmulh v21.4s, v21.4s, v27.4s\n"
"sqrdmulh v22.4s, v22.4s, v28.4s\n"
"sqrshl v21.4s, v21.4s, v30.4s\n"
"sqrshl v22.4s, v22.4s, v31.4s\n"
"sqxtn v21.4h, v21.4s\n"
"sqxtn2 v21.8h, v22.4s\n"
"sqadd v21.8h, v21.8h, v29.8h\n"
"sqxtn v21.8b, v21.8h\n"
"dup v24.16b, w0\n"
"smax v21.8b, v21.8b, v25.8b\n"
"smin v21.8b, v21.8b, v24.8b\n"
"st1 {v21.8b}, [%[output_ptr]]\n"
DEPTHWISECONV_LABEL_HEIGHT_1_END ":\n"
:
// Outputs.
// These operands are both read and modified by the assembly.
// NOTE(review): they are written while the input operands below are
// still in use, yet they are declared "+r" rather than early-clobber
// ("+&r"). Confirm the compiler can never place one of the inputs in
// the same register as one of these operands.
[filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
[output_ptr] "+r"(output_ptr),
[output_window_height] "+r"(output_window_height)
:
// Inputs.
[output_multiplier_ptr] "r"(output_multiplier_ptr),
[output_shift_ptr] "r"(output_shift_ptr),
[bias_ptr] "r"(bias_ptr), [input_row_size] "r"(input_row_size),
[input_depth] "r"(input_depth),
[output_window_width] "r"(output_window_width),
[input_width_increment] "r"(input_width_increment),
[input_height_increment] "r"(input_height_increment),
[output_height_increment] "r"(output_height_increment),
[params_ptr] "r"(params_ptr)
:
// Clobbers.
"cc", "memory",
// We use these NEON registers.
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
// We use these general-purpose registers.
"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
"x9", "x10", "x11", "x12", "x13", "x14", "x15");
#undef DEPTHWISECONV_LABEL_HEIGHT_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_2_WIDTH_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_2_AFTER_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LOOP
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_1_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_WIDTH_2_LEFTOVER
#undef DEPTHWISECONV_LABEL_HEIGHT_1_END
}