in tensorflow/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h [8927:9280]
// Hand-scheduled AArch64 NEON kernel that produces one macro block of
// quantized depthwise-convolution output using dot-product (SDOT)
// instructions.
//
// Arguments:
//   scratch_block_data: input rows, read 32 bits at a time; consecutive rows
//     are separated by the stride loaded into x10 (see parameter loading).
//   filter_workspace:   pre-arranged filter data; 96 bytes (six q registers)
//     are consumed per outer-loop iteration.
//   bias_data:          int32 bias; 32 bytes (eight values) are consumed per
//     outer-loop iteration.
//   output_block_data:  uint8 output; outer-loop iteration i starts writing at
//     output_block_data + 8 * i.
//   function_params:    parameter block read through the DP_OFFSET_* byte
//     offsets.
//
// Structure of the assembly:
//   * The outer loop (labels 13 / 1 / 12) runs depth_micro_repeats times.
//   * When outbound_block_height == 2, loops 3 and 5 are used; each iteration
//     stores results both at the output pointer and at the output pointer plus
//     output_height_stride (presumably two output rows -- confirm against the
//     caller).
//   * Otherwise loop 8 is used, which stores 8 bytes per output position for a
//     single row and conditionally emits a second position per iteration.
//   * Every accumulator goes through the same requantization sequence:
//     sqrdmulh (output multiplier), sqrshl (output shift), sqxtn/sqxtn2
//     (narrow to 16 bits), sqadd (output offset), sqxtun (narrow to unsigned
//     8 bits), umax / umin (activation clamp).
//
// NOTE(review): the instruction stream appears to be derived from compiler
// output (see the BB206_* annotations) and depends on exact instruction order;
// do not reorder it. The SDOT instructions are emitted as raw .word encodings,
// presumably so the file assembles with toolchains that lack the dot-product
// mnemonics -- confirm before replacing them with mnemonics.
static inline void KernelMacroBlockNeon(
    const int8* scratch_block_data, const int8* filter_workspace,
    const int32* bias_data, uint8* output_block_data,
    const DepthwiseConvDotProdParams* function_params) {
  // Note that argument registers may be reused after parameter loading.
  // x0 %[scratch_block_data]
  // x1 %[filter_workspace]
  // x2 %[bias_data]
  // x3 %[output_block_data]
  // x4 %[function_params]
  //
  // Numeric local labels used by the assembly below. They are macros so that
  // the "<n>f" / "<n>b" branch targets can be spliced into the string
  // literals; they are #undef'd again after the asm statement.
#define DC_KERNEL_MULT_STRIDE_1 "1"
#define DC_KERNEL_MULT_STRIDE_2 "2"
#define DC_KERNEL_MULT_STRIDE_3 "3"
#define DC_KERNEL_MULT_STRIDE_4 "4"
#define DC_KERNEL_MULT_STRIDE_5 "5"
#define DC_KERNEL_MULT_STRIDE_6 "6"
#define DC_KERNEL_MULT_STRIDE_7 "7"
#define DC_KERNEL_MULT_STRIDE_8 "8"
#define DC_KERNEL_MULT_STRIDE_9 "9"
#define DC_KERNEL_MULT_STRIDE_10 "10"
#define DC_KERNEL_MULT_STRIDE_11 "11"
#define DC_KERNEL_MULT_STRIDE_12 "12"
#define DC_KERNEL_MULT_STRIDE_13 "13"
  asm volatile(
      // ---- Parameter loading -------------------------------------------
      // w15 = output_residual_width.
      // w11 = output_width_overall_micro_repeats, w6 = the 32-bit field that
      //       follows it in the parameter block.
      // x9  = output_height_stride, x10 = the 32-bit field that follows it
      //       (used below as the byte stride between scratch rows).
      // x12 = depth_micro_repeats, x13 = output_depth,
      // w14 = outbound_block_height.
      "ldr w15, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
      "ldp w11, w6, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
      "ldpsw x9, x10, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
      "ldrsw x12, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
      "ldrsw x13, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n"
      "ldr w14, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
      // Addresses of the quantization parameters, broadcast just below.
      "add x17, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "\n" // =40
      "add x5, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "\n" // =44
      "add x7, %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n" // =32
      "add x19, %[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT) "\n" // =36
      // The %[function_params] register is overwritten from here on, which is
      // why it is declared as a read-write ("+r") operand.
      "add %[function_params], %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n" // =28
      "sxtw x11, w11\n"
      // v0 = output offset (16-bit lanes), v1 = output multiplier,
      // v2 = output shift, v3 = activation min, v4 = activation max.
      "ld1r { v0.8h }, [%[function_params]]\n"
      "ld1r { v1.4s }, [x7]\n"
      "ld1r { v2.4s }, [x19]\n"
      "ld1r { v3.8b }, [x17]\n"
      "ld1r { v4.8b }, [x5]\n"
      // w6 = (residual_width < 2) ? min(w6, w11) : w11. This is the bound of
      // loop 3. x5 = 4 * (the value w6 had before the select), compared
      // against the running byte offset x20 in loop 8.
      "cmp w15, #2\n" // =2
      "ccmp w6, w11, #0, lt\n"
      "lsl x5, x6, #2\n"
      "csel w6, w6, w11, lt\n"
      // x8 = outer (depth) loop counter.
      "mov x8, xzr\n"
      // x16 = scratch + 4: base for the per-iteration refill loads.
      "add x16, %[scratch_block_data], #4\n" // =4
      // x17 = 2 * row stride, %[function_params] = 3 * row stride.
      "lsl x17, x10, #1\n"
      "add %[function_params], x10, x10, lsl #1\n"
      "sxtw x6, w6\n"
      // x7 = output_height_stride + output_depth.
      "add x7, x9, x13\n"
      "b " DC_KERNEL_MULT_STRIDE_13 "f\n"

      // ---- Outer loop body: one depth micro-repeat -----------------------
    DC_KERNEL_MULT_STRIDE_1 ":\n" // in Loop: Header=BB206_13 Depth=1
      // Load the first 32-bit word of scratch rows 0, 1 and 2, the six filter
      // vectors (v5, v6, v7, v16, v17, v18) and the two bias vectors
      // (v19, v20). Lanes 1 and 3 of v21 are placeholders that the inner
      // loops overwrite before use.
      "ldr w20, [%[scratch_block_data]]\n"
      "add x21, %[scratch_block_data], x10\n"
      "ldp q5, q6, [%[filter_workspace]]\n"
      "ldp q7, q16, [%[filter_workspace], #32]\n"
      "fmov s21, w20\n"
      "mov v21.s[1], w20\n"
      "ld1 { v21.s }[2], [x21]\n"
      "ldp q17, q18, [%[filter_workspace], #64]\n"
      "ldp q19, q20, [%[bias_data]], #32\n"
      "ldr s22, [%[scratch_block_data], x17]\n"
      // x19 = output_block_data + 8 * x8 (low 29 bits of x8, shifted left 3).
      "ubfiz x19, x8, #3, #29\n"
      "add %[filter_workspace], %[filter_workspace], #96\n" // =96
      "add x19, %[output_block_data], x19\n"
      // Select the path: outbound_block_height == 2 falls through, anything
      // else goes to label 7.
      "cmp w14, #2\n" // =2
      "mov v21.s[3], w20\n"
      // x20 = running byte offset into the scratch rows.
      "mov x20, xzr\n"
      "b.ne " DC_KERNEL_MULT_STRIDE_7 "f\n"
      // %bb.2: // in Loop: Header=BB206_13 Depth=1
      // Height-2 path: additionally load scratch rows 3 (into v22 lane 2) and
      // 4 (broadcast into v23). x21 = inner loop counter.
      "dup v22.4s, v22.s[0]\n"
      "add x21, %[scratch_block_data], %[function_params]\n"
      "add x22, %[scratch_block_data], x10, lsl #2\n"
      "ld1 { v22.s }[2], [x21]\n"
      "ld1r { v23.4s }, [x22]\n"
      "mov x21, xzr\n"
      "b " DC_KERNEL_MULT_STRIDE_4 "f\n"

      // ---- Loop 3: height-2 path, two output positions per iteration ----
    DC_KERNEL_MULT_STRIDE_3 ":\n" // in Loop: Header=BB206_4 Depth=2
      // Refill lanes 1 and 3 of v21/v22 and lane 1 of v23 with the next
      // 32-bit word of each of the five scratch rows.
      "and x22, x20, #0xfffffffc\n"
      "add x23, x16, x22\n"
      "lsl x24, x10, #2\n"
      "mov x22, x23\n"
      "ld1 { v21.s }[1], [x22], x24\n"
      "add x24, x23, x17\n"
      "ld1 { v22.s }[1], [x24]\n"
      "add x24, x23, x10\n"
      "ld1 { v21.s }[3], [x24]\n"
      "add x23, x23, %[function_params]\n"
      "ld1 { v22.s }[3], [x23]\n"
      // v25..v28 accumulate the second output position from copies of the
      // input shifted right by 16 bits (v29); v24/v29 then accumulate the
      // first position from the unshifted input. All start from the bias.
      "mov v25.16b, v19.16b\n"
      "mov v27.16b, v20.16b\n"
      "ld1 { v23.s }[1], [x22]\n"
      "ushr v29.2d, v21.2d, #16\n"
      ".word 0x4f9de0b9 // sdot v25.4s, v5.16b, v29.4b[0]\n"
      ".word 0x4f9de0db // sdot v27.4s, v6.16b, v29.4b[0]\n"
      "mov v26.16b, v19.16b\n"
      "mov v28.16b, v20.16b\n"
      ".word 0x4f9de8f9 // sdot v25.4s, v7.16b, v29.4b[2]\n"
      ".word 0x4f9dea1b // sdot v27.4s, v16.16b, v29.4b[2]\n"
      "ushr v29.2d, v22.2d, #16\n"
      ".word 0x4f9de0ba // sdot v26.4s, v5.16b, v29.4b[0]\n"
      ".word 0x4f9de0dc // sdot v28.4s, v6.16b, v29.4b[0]\n"
      "mov v24.16b, v19.16b\n"
      ".word 0x4f9de8fa // sdot v26.4s, v7.16b, v29.4b[2]\n"
      ".word 0x4f9dea1c // sdot v28.4s, v16.16b, v29.4b[2]\n"
      ".word 0x4f9de239 // sdot v25.4s, v17.16b, v29.4b[0]\n"
      ".word 0x4f9de25b // sdot v27.4s, v18.16b, v29.4b[0]\n"
      "ushr v29.2d, v23.2d, #16\n"
      ".word 0x4f9de23a // sdot v26.4s, v17.16b, v29.4b[0]\n"
      ".word 0x4f9de25c // sdot v28.4s, v18.16b, v29.4b[0]\n"
      "mov v29.16b, v19.16b\n"
      ".word 0x4f95e0b8 // sdot v24.4s, v5.16b, v21.4b[0]\n"
      ".word 0x4f96e0bd // sdot v29.4s, v5.16b, v22.4b[0]\n"
      ".word 0x4f95e8f8 // sdot v24.4s, v7.16b, v21.4b[2]\n"
      ".word 0x4f96e8fd // sdot v29.4s, v7.16b, v22.4b[2]\n"
      ".word 0x4f96e238 // sdot v24.4s, v17.16b, v22.4b[0]\n"
      ".word 0x4f97e23d // sdot v29.4s, v17.16b, v23.4b[0]\n"
      // Requantize and store bytes 0-3 of the first position: low half to
      // [x19], high half to [x19 + output_height_stride].
      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
      "sqrshl v24.4s, v24.4s, v2.4s\n"
      "sqrshl v29.4s, v29.4s, v2.4s\n"
      "sqxtn v24.4h, v24.4s\n"
      "sqxtn2 v24.8h, v29.4s\n"
      "sqadd v24.8h, v24.8h, v0.8h\n"
      "sqxtun v24.8b, v24.8h\n"
      "umax v24.8b, v24.8b, v3.8b\n"
      "add x22, x19, x9\n"
      "mov v29.16b, v20.16b\n"
      "umin v24.8b, v24.8b, v4.8b\n"
      "str s24, [x19]\n"
      "st1 { v24.s }[1], [x22]\n"
      // Bytes 4-7 of the first position (filter vectors v6, v16, v18).
      "mov v24.16b, v20.16b\n"
      ".word 0x4f95e0dd // sdot v29.4s, v6.16b, v21.4b[0]\n"
      ".word 0x4f96e0d8 // sdot v24.4s, v6.16b, v22.4b[0]\n"
      ".word 0x4f95ea1d // sdot v29.4s, v16.16b, v21.4b[2]\n"
      ".word 0x4f96ea18 // sdot v24.4s, v16.16b, v22.4b[2]\n"
      ".word 0x4f96e25d // sdot v29.4s, v18.16b, v22.4b[0]\n"
      ".word 0x4f97e258 // sdot v24.4s, v18.16b, v23.4b[0]\n"
      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
      "sqrshl v29.4s, v29.4s, v2.4s\n"
      "sqrshl v24.4s, v24.4s, v2.4s\n"
      "sqxtn v29.4h, v29.4s\n"
      "sqxtn2 v29.8h, v24.4s\n"
      "sqadd v24.8h, v29.8h, v0.8h\n"
      "sqxtun v24.8b, v24.8h\n"
      "sqrdmulh v25.4s, v25.4s, v1.4s\n"
      "umax v24.8b, v24.8b, v3.8b\n"
      "sqrdmulh v26.4s, v26.4s, v1.4s\n"
      "sqrshl v25.4s, v25.4s, v2.4s\n"
      "add x22, x22, #4\n" // =4
      "umin v24.8b, v24.8b, v4.8b\n"
      "sqrshl v26.4s, v26.4s, v2.4s\n"
      "sqxtn v25.4h, v25.4s\n"
      "str s24, [x19, #4]\n"
      "st1 { v24.s }[1], [x22]\n"
      // Second position, bytes 0-3: stored output_depth bytes after the
      // first position (x23), and at x19 + x7 for the other half (x24).
      "sqxtn2 v25.8h, v26.4s\n"
      "sqadd v24.8h, v25.8h, v0.8h\n"
      "sqrdmulh v27.4s, v27.4s, v1.4s\n"
      "sqxtun v24.8b, v24.8h\n"
      "sqrdmulh v28.4s, v28.4s, v1.4s\n"
      "sqrshl v27.4s, v27.4s, v2.4s\n"
      "umax v24.8b, v24.8b, v3.8b\n"
      "add x23, x19, x13\n"
      "add x24, x19, x7\n"
      "sqrshl v28.4s, v28.4s, v2.4s\n"
      "sqxtn v27.4h, v27.4s\n"
      "umin v24.8b, v24.8b, v4.8b\n"
      "str s24, [x23]\n"
      "st1 { v24.s }[1], [x24]\n"
      // Second position, bytes 4-7.
      "sqxtn2 v27.8h, v28.4s\n"
      "sqadd v24.8h, v27.8h, v0.8h\n"
      "sqxtun v24.8b, v24.8h\n"
      "umax v24.8b, v24.8b, v3.8b\n"
      "add x25, x24, #4\n" // =4
      "umin v24.8b, v24.8b, v4.8b\n"
      // Advance: counter += 1, input shifted by 32 bits, output pointer moved
      // by 2 * output_depth, scratch byte offset += 4.
      "add x21, x21, #1\n" // =1
      "ushr v21.2d, v21.2d, #32\n"
      "ushr v22.2d, v22.2d, #32\n"
      "ushr v23.2d, v23.2d, #32\n"
      "add x19, x23, x13\n"
      "str s24, [x23, #4]\n"
      "st1 { v24.s }[1], [x25]\n"
      "add x20, x20, #4\n" // =4
    DC_KERNEL_MULT_STRIDE_4 ":\n" // Parent Loop BB206_13 Depth=1
      // => This Inner Loop Header: Depth=2
      // Loop 3 runs while x21 < x6; afterwards loop 5 handles the remainder.
      "cmp x21, x6\n"
      "b.lt " DC_KERNEL_MULT_STRIDE_3 "b\n"
      "b " DC_KERNEL_MULT_STRIDE_6 "f\n"

      // ---- Loop 5: height-2 path, one output position per iteration -----
    DC_KERNEL_MULT_STRIDE_5 ":\n" // in Loop: Header=BB206_6 Depth=2
      // Same refill pattern as loop 3.
      "and x22, x20, #0xfffffffc\n"
      "add x22, x16, x22\n"
      "lsl x23, x10, #2\n"
      "mov x25, x22\n"
      "add x24, x22, x17\n"
      "ld1 { v21.s }[1], [x25], x23\n"
      "ld1 { v22.s }[1], [x24]\n"
      "add x23, x22, x10\n"
      "add x22, x22, %[function_params]\n"
      "ld1 { v21.s }[3], [x23]\n"
      "ld1 { v22.s }[3], [x22]\n"
      "mov v24.16b, v19.16b\n"
      "ld1 { v23.s }[1], [x25]\n"
      "mov v25.16b, v19.16b\n"
      ".word 0x4f95e0b8 // sdot v24.4s, v5.16b, v21.4b[0]\n"
      ".word 0x4f96e0b9 // sdot v25.4s, v5.16b, v22.4b[0]\n"
      ".word 0x4f95e8f8 // sdot v24.4s, v7.16b, v21.4b[2]\n"
      ".word 0x4f96e8f9 // sdot v25.4s, v7.16b, v22.4b[2]\n"
      ".word 0x4f96e238 // sdot v24.4s, v17.16b, v22.4b[0]\n"
      ".word 0x4f97e239 // sdot v25.4s, v17.16b, v23.4b[0]\n"
      // Requantize and store bytes 0-3 to [x19] and
      // [x19 + output_height_stride].
      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
      "sqrdmulh v25.4s, v25.4s, v1.4s\n"
      "sqrshl v24.4s, v24.4s, v2.4s\n"
      "sqrshl v25.4s, v25.4s, v2.4s\n"
      "sqxtn v24.4h, v24.4s\n"
      "sqxtn2 v24.8h, v25.4s\n"
      "sqadd v24.8h, v24.8h, v0.8h\n"
      "sqxtun v24.8b, v24.8h\n"
      "umax v24.8b, v24.8b, v3.8b\n"
      "add x22, x19, x9\n"
      "mov v25.16b, v20.16b\n"
      "umin v24.8b, v24.8b, v4.8b\n"
      "str s24, [x19]\n"
      "st1 { v24.s }[1], [x22]\n"
      // Bytes 4-7.
      "mov v24.16b, v20.16b\n"
      ".word 0x4f95e0d9 // sdot v25.4s, v6.16b, v21.4b[0]\n"
      ".word 0x4f96e0d8 // sdot v24.4s, v6.16b, v22.4b[0]\n"
      ".word 0x4f95ea19 // sdot v25.4s, v16.16b, v21.4b[2]\n"
      ".word 0x4f96ea18 // sdot v24.4s, v16.16b, v22.4b[2]\n"
      ".word 0x4f96e259 // sdot v25.4s, v18.16b, v22.4b[0]\n"
      ".word 0x4f97e258 // sdot v24.4s, v18.16b, v23.4b[0]\n"
      "sqrdmulh v25.4s, v25.4s, v1.4s\n"
      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
      "sqrshl v25.4s, v25.4s, v2.4s\n"
      "sqrshl v24.4s, v24.4s, v2.4s\n"
      "sqxtn v25.4h, v25.4s\n"
      "sqxtn2 v25.8h, v24.4s\n"
      "sqadd v24.8h, v25.8h, v0.8h\n"
      "sqxtun v24.8b, v24.8h\n"
      "umax v24.8b, v24.8b, v3.8b\n"
      "add x22, x22, #4\n" // =4
      "umin v24.8b, v24.8b, v4.8b\n"
      // Advance: counter += 1, input shifted by 16 bits, output pointer moved
      // by output_depth, scratch byte offset += 4.
      "add x21, x21, #1\n" // =1
      "ushr v21.2d, v21.2d, #16\n"
      "ushr v22.2d, v22.2d, #16\n"
      "ushr v23.2d, v23.2d, #16\n"
      "str s24, [x19, #4]\n"
      "st1 { v24.s }[1], [x22]\n"
      "add x19, x19, x13\n"
      "add x20, x20, #4\n" // =4
    DC_KERNEL_MULT_STRIDE_6 ":\n" // Parent Loop BB206_13 Depth=1
      // => This Inner Loop Header: Depth=2
      // Loop 5 runs while x21 < output_width_overall_micro_repeats.
      "cmp x21, x11\n"
      "b.lt " DC_KERNEL_MULT_STRIDE_5 "b\n"
      "b " DC_KERNEL_MULT_STRIDE_12 "f\n"

      // ---- Path for outbound_block_height != 2 ---------------------------
    DC_KERNEL_MULT_STRIDE_7 ":\n" // in Loop: Header=BB206_13 Depth=1
      "mov x21, xzr\n"
      "dup v22.4s, v22.s[0]\n"
      "b " DC_KERNEL_MULT_STRIDE_11 "f\n"
    DC_KERNEL_MULT_STRIDE_8 ":\n" // in Loop: Header=BB206_11 Depth=2
      // Refill lanes 1 and 3 of v21 and lane 1 of v22 from scratch rows
      // 0, 1 and 2.
      "and x22, x20, #0xfffffffc\n"
      "add x22, x16, x22\n"
      "mov x23, x22\n"
      "ld1 { v21.s }[1], [x23], x17\n"
      "add x22, x22, x10\n"
      "mov v23.16b, v19.16b\n"
      "mov v24.16b, v20.16b\n"
      "ld1 { v22.s }[1], [x23]\n"
      "ld1 { v21.s }[3], [x22]\n"
      // Flags for the b.eq further down: Z is set only when
      // output_residual_width != 2 and x20 == x5. No instruction in between
      // modifies NZCV.
      "cmp w15, #2\n" // =2
      "ccmp x5, x20, #0, ne\n"
      ".word 0x4f96e237 // sdot v23.4s, v17.16b, v22.4b[0]\n"
      ".word 0x4f96e258 // sdot v24.4s, v18.16b, v22.4b[0]\n"
      ".word 0x4f95e0b7 // sdot v23.4s, v5.16b, v21.4b[0]\n"
      ".word 0x4f95e0d8 // sdot v24.4s, v6.16b, v21.4b[0]\n"
      ".word 0x4f95e8f7 // sdot v23.4s, v7.16b, v21.4b[2]\n"
      ".word 0x4f95ea18 // sdot v24.4s, v16.16b, v21.4b[2]\n"
      // Requantize and store all 8 bytes of the first position.
      "sqrdmulh v23.4s, v23.4s, v1.4s\n"
      "sqrdmulh v24.4s, v24.4s, v1.4s\n"
      "sqrshl v23.4s, v23.4s, v2.4s\n"
      "sqrshl v24.4s, v24.4s, v2.4s\n"
      "sqxtn v25.4h, v23.4s\n"
      "sqxtn2 v25.8h, v24.4s\n"
      "sqadd v24.8h, v25.8h, v0.8h\n"
      "sqxtun v24.8b, v24.8h\n"
      "umax v24.8b, v24.8b, v3.8b\n"
      "umin v24.8b, v24.8b, v4.8b\n"
      // v23/v24 now hold the input shifted by 16 bits, for the second
      // position (or for the next iteration if that position is skipped).
      "ushr v23.2d, v21.2d, #16\n"
      "str d24, [x19]\n"
      "ushr v24.2d, v22.2d, #16\n"
      "add x19, x19, x13\n"
      // Skip the second position when the flags computed above say so.
      "b.eq " DC_KERNEL_MULT_STRIDE_10 "f\n"
      // %bb.9: // in Loop: Header=BB206_11 Depth=2
      // Second position of this iteration.
      "mov v25.16b, v19.16b\n"
      "mov v26.16b, v20.16b\n"
      ".word 0x4f98e239 // sdot v25.4s, v17.16b, v24.4b[0]\n"
      ".word 0x4f98e25a // sdot v26.4s, v18.16b, v24.4b[0]\n"
      ".word 0x4f97e0b9 // sdot v25.4s, v5.16b, v23.4b[0]\n"
      ".word 0x4f97e0da // sdot v26.4s, v6.16b, v23.4b[0]\n"
      ".word 0x4f97e8f9 // sdot v25.4s, v7.16b, v23.4b[2]\n"
      ".word 0x4f97ea1a // sdot v26.4s, v16.16b, v23.4b[2]\n"
      // Input shifted by 32 bits for the next iteration; v21/v22 are then
      // reused as scratch for the requantization.
      "ushr v23.2d, v21.2d, #32\n"
      "sqrdmulh v21.4s, v25.4s, v1.4s\n"
      "ushr v24.2d, v22.2d, #32\n"
      "sqrdmulh v22.4s, v26.4s, v1.4s\n"
      "sqrshl v21.4s, v21.4s, v2.4s\n"
      "sqrshl v22.4s, v22.4s, v2.4s\n"
      "sqxtn v21.4h, v21.4s\n"
      "sqxtn2 v21.8h, v22.4s\n"
      "sqadd v21.8h, v21.8h, v0.8h\n"
      "sqxtun v21.8b, v21.8h\n"
      "umax v21.8b, v21.8b, v3.8b\n"
      "umin v21.8b, v21.8b, v4.8b\n"
      "str d21, [x19]\n"
      "add x19, x19, x13\n"
    DC_KERNEL_MULT_STRIDE_10 ":\n" // in Loop: Header=BB206_11 Depth=2
      // Advance the counter and scratch offset; move the shifted input back
      // into v21/v22.
      "add x21, x21, #1\n" // =1
      "add x20, x20, #4\n" // =4
      "mov v22.16b, v24.16b\n"
      "mov v21.16b, v23.16b\n"
    DC_KERNEL_MULT_STRIDE_11 ":\n" // Parent Loop BB206_13 Depth=1
      // => This Inner Loop Header: Depth=2
      // Loop 8 runs while x21 < output_width_overall_micro_repeats.
      "cmp x21, x11\n"
      "b.lt " DC_KERNEL_MULT_STRIDE_8 "b\n"

      // ---- Outer loop latch ----------------------------------------------
    DC_KERNEL_MULT_STRIDE_12 ":\n" // in Loop: Header=BB206_13 Depth=1
      "add x8, x8, #1\n" // =1
    DC_KERNEL_MULT_STRIDE_13 ":\n" // =>This Loop Header: Depth=1
      // Child Loop BB206_11 Depth 2
      // Child Loop BB206_4 Depth 2
      // Child Loop BB206_6 Depth 2
      // Continue while x8 < depth_micro_repeats.
      "cmp x8, x12\n"
      "b.lt " DC_KERNEL_MULT_STRIDE_1 "b\n"
      :
      // Outputs.
      [ scratch_block_data ] "+r"(scratch_block_data),
      [ filter_workspace ] "+r"(filter_workspace),
      [ bias_data ] "+r"(bias_data),
      [ output_block_data ] "+r"(output_block_data),
      // The assembly overwrites this register (it is reused to hold the
      // address of the output offset and later 3 * row stride), so it must be
      // a read-write operand: input-only operands may not be modified.
      [ function_params ] "+r"(function_params)
      :
      // Inputs: none (every operand is read-write).
      :
      // Clobbers.
      "cc", "memory",
      // We use these NEON registers.
      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
      "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
      // We use these general-purpose registers.
      "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
      "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25");

  // Keep the label macros from leaking past this function.
#undef DC_KERNEL_MULT_STRIDE_1
#undef DC_KERNEL_MULT_STRIDE_2
#undef DC_KERNEL_MULT_STRIDE_3
#undef DC_KERNEL_MULT_STRIDE_4
#undef DC_KERNEL_MULT_STRIDE_5
#undef DC_KERNEL_MULT_STRIDE_6
#undef DC_KERNEL_MULT_STRIDE_7
#undef DC_KERNEL_MULT_STRIDE_8
#undef DC_KERNEL_MULT_STRIDE_9
#undef DC_KERNEL_MULT_STRIDE_10
#undef DC_KERNEL_MULT_STRIDE_11
#undef DC_KERNEL_MULT_STRIDE_12
#undef DC_KERNEL_MULT_STRIDE_13
}