in tensorflow/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h [7875:8258]
// AArch64 NEON inline-assembly kernel that consumes a pre-shuffled int8
// scratch block and filter workspace and writes requantized uint8 output.
//
// Parameters:
//   scratch_block_data: base of the int8 input scratch block.
//   filter_workspace:   base of the int8 filter workspace; 96 bytes are
//                       consumed per outer-loop iteration.
//   bias_data:          int32 bias vector; advanced by 32 bytes per
//                       outer-loop iteration.
//   output_block_data:  base of the uint8 output block.
//   function_params:    parameter struct, read at the DP_OFFSET_* byte offsets.
//
// Structure of the asm (as visible below):
//   * An outer loop (label 19) counts x8 up to the value loaded from
//     DP_OFFSET_DEPTH_MICRO_REPEATS.
//   * Each iteration branches on the value loaded from
//     DP_OFFSET_OUTBOUND_BLOCK_HEIGHT: when it is 2 the path at labels 3-9 is
//     taken (two output rows are stored per step, x9 bytes apart), otherwise
//     the path at labels 11-17 is taken (one 8-byte store per step).
//   * Accumulators start from the bias, are updated with SDOT (emitted as
//     .word encodings), scaled with sqrdmulh by v3 (output multiplier) and
//     sqrshl by v4 (output shift), narrowed, offset by v0 (output offset),
//     narrowed to unsigned 8-bit and clamped to [v1, v2] (activation min/max).
//
// The asm temporarily lowers sp by 64 bytes for spills and restores it before
// leaving. Nothing is emitted unless __linux__ is defined.
//
// NOTE(review): the DC_KERNEL_NO_MULT_STRIDE_* label macros are not #undef'd
// inside this function; confirm they are cleaned up elsewhere if that matters.
static inline void KernelMacroBlockNeon(
const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, uint8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
// Note that argument registers may be reused after parameter loading.
// x0 %[scratch_block_data]
// x1 %[filter_workspace]
// x2 %[bias_data]
// x3 %[output_block_data]
// x4 %[function_params]
// The xN names above are (presumably) the registers of the compiled code this
// asm was derived from. The compiler is free to place the named operands in
// other registers, so the asm must always refer to them by operand name.
#define DC_KERNEL_NO_MULT_STRIDE_1 "1"
#define DC_KERNEL_NO_MULT_STRIDE_2 "2"
#define DC_KERNEL_NO_MULT_STRIDE_3 "3"
#define DC_KERNEL_NO_MULT_STRIDE_4 "4"
#define DC_KERNEL_NO_MULT_STRIDE_5 "5"
#define DC_KERNEL_NO_MULT_STRIDE_6 "6"
#define DC_KERNEL_NO_MULT_STRIDE_7 "7"
#define DC_KERNEL_NO_MULT_STRIDE_8 "8"
#define DC_KERNEL_NO_MULT_STRIDE_9 "9"
#define DC_KERNEL_NO_MULT_STRIDE_10 "10"
#define DC_KERNEL_NO_MULT_STRIDE_11 "11"
#define DC_KERNEL_NO_MULT_STRIDE_12 "12"
#define DC_KERNEL_NO_MULT_STRIDE_13 "13"
#define DC_KERNEL_NO_MULT_STRIDE_14 "14"
#define DC_KERNEL_NO_MULT_STRIDE_15 "15"
#define DC_KERNEL_NO_MULT_STRIDE_16 "16"
#define DC_KERNEL_NO_MULT_STRIDE_17 "17"
#define DC_KERNEL_NO_MULT_STRIDE_18 "18"
#define DC_KERNEL_NO_MULT_STRIDE_19 "19"
#ifdef __linux__
asm volatile(
// Compiled code used block of 48 for spill out of total stack of 208.
// However, an 8-byte spill was sneaked in to #120.
// Spillage increased to 64 and these are mapped to #48.
"sub sp, sp, #64\n" // =208
// Load scalar parameters from the params struct.
"ldp w13, w14, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
"ldrsw x15, [%[function_params], #" STR(DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"ldp w11, w16, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"ldr x7, [%[function_params]]\n"
"ldpsw x9, x10, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
"ldrsw x26, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
"ldr w27, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
// Form the addresses of the quantization fields so that they can be
// broadcast with ld1r below.
"add x17, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "\n" // =40
"add x12, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "\n" // =44
"add x5, %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n" // =32
"add x6, %[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT) "\n" // =36
// From here on the %[function_params] register no longer holds the struct
// base: it first becomes the address of the output-offset field and is later
// overwritten with a sign-extended scalar.
"add %[function_params], %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n" // =28
"sxtw x11, w11\n"
// Broadcast: v0 = output offset, v1 = activation min, v2 = activation max,
// v3 = output multiplier, v4 = output shift.
"ld1r { v0.8h }, [%[function_params]]\n"
"ld1r { v1.8b }, [x17]\n"
"ld1r { v2.8b }, [x12]\n"
"ld1r { v3.4s }, [x5]\n"
"ld1r { v4.4s }, [x6]\n"
"cmp w13, #1\n" // =1
"lsl x28, x15, #5\n"
"lsl w15, w7, #1\n"
"ccmp w16, w11, #0, eq\n"
// %[function_params] = sign-extended 32-bit value loaded from offset 0 of the
// struct; x7 = twice that value.
"sxtw %[function_params], w7\n"
"sxtw x7, w15\n"
"csel w15, w16, w11, lt\n"
"mov x8, xzr\n"
// x17, x5 and x6 are 2x, 3x and 4x the row stride held in x10.
"lsl x17, x10, #1\n"
"add x5, x10, x10, lsl #1\n"
"lsl x6, x10, #2\n"
"sxtw x19, w15\n"
"lsl x21, x16, #5\n"
// implicit-def: $q19
// implicit-def: $q20
// implicit-def: $q21
// implicit-def: $q22
// implicit-def: $q23
// implicit-def: $q5
// implicit-def: $q6
// implicit-def: $q16
// implicit-def: $q7
// implicit-def: $q17
// implicit-def: $q18
// Spill the values that the labels 3-9 path reuses as temporaries; they are
// reloaded at the end of that path.
"str %[filter_workspace], [sp, #48]\n" // 8-byte Folded Spill
"stp %[scratch_block_data], %[output_block_data], [sp, #32]\n" // 16-byte Folded Spill
"str x26, [sp, #24]\n" // 8-byte Folded Spill
"str w27, [sp, #20]\n" // 4-byte Folded Spill
"str x28, [sp, #8]\n" // 8-byte Folded Spill
"b " DC_KERNEL_NO_MULT_STRIDE_19 "f\n"
DC_KERNEL_NO_MULT_STRIDE_1 ":\n" // in Loop: Header=BB227_19 Depth=1
// x22 = output base + 8 * x8; x23 = filter base + 96 * x8.
"and x15, x8, #0x1fffffff\n"
"add w16, w8, w8, lsl #1\n"
"add x22, %[output_block_data], x15, lsl #3\n"
"lsl w15, w16, #5\n"
"cmp w27, #2\n" // =2
"add x23, %[filter_workspace], x15\n"
"mov x15, xzr\n"
"b.ne " DC_KERNEL_NO_MULT_STRIDE_11 "f\n"
// %bb.2: // in Loop: Header=BB227_19 Depth=1
"sxtw x16, w8\n"
"ubfiz x12, x8, #3, #29\n"
"mov x25, xzr\n"
"madd x26, x28, x16, %[scratch_block_data]\n"
"add x27, %[output_block_data], x12\n"
"add x28, x22, x9\n"
"mov x29, %[bias_data]\n"
"b " DC_KERNEL_NO_MULT_STRIDE_9 "f\n"
DC_KERNEL_NO_MULT_STRIDE_3 ":\n" // in Loop: Header=BB227_9 Depth=2
"add %[scratch_block_data], x26, x25, lsl #4\n"
"add x16, x23, x25, lsl #4\n"
"ldr q8, [%[scratch_block_data], x6]\n"
"ldr q24, [x29]\n"
"ldr q25, [x16]\n"
"ldr q26, [x16, #32]\n"
"ldr q27, [x16, #64]\n"
"ldr q30, [%[scratch_block_data]]\n"
"ldr q29, [%[scratch_block_data], x10]\n"
"ldr q28, [%[scratch_block_data], x17]\n"
"ldr q31, [%[scratch_block_data], x5]\n"
"mov x30, xzr\n"
"add %[filter_workspace], %[scratch_block_data], #32\n" // =32
"add %[scratch_block_data], x27, x25, lsl #2\n"
"mov x24, x15\n"
"mov v9.16b, v8.16b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_5 "f\n"
DC_KERNEL_NO_MULT_STRIDE_4 ":\n" // in Loop: Header=BB227_5 Depth=3
"mov v23.16b, v24.16b\n"
"mov v10.16b, v24.16b\n"
".word 0x4e9e9737 // sdot v23.4s, v25.16b, v30.16b\n"
".word 0x4e9c972a // sdot v10.4s, v25.16b, v28.16b\n"
".word 0x4e9d9757 // sdot v23.4s, v26.16b, v29.16b\n"
".word 0x4e9f974a // sdot v10.4s, v26.16b, v31.16b\n"
".word 0x4e9c9777 // sdot v23.4s, v27.16b, v28.16b\n"
".word 0x4e88976a // sdot v10.4s, v27.16b, v8.16b\n"
"sqrdmulh v23.4s, v23.4s, v3.4s\n"
"ubfiz x12, x30, #5, #27\n"
"rev32 v13.8h, v28.8h\n"
"sqrdmulh v28.4s, v10.4s, v3.4s\n"
"sqrshl v23.4s, v23.4s, v4.4s\n"
"add x12, %[filter_workspace], x12\n"
"sqrshl v28.4s, v28.4s, v4.4s\n"
"sqxtn v23.4h, v23.4s\n"
"ldr q19, [x12]\n"
"ldr q20, [x12, x10]\n"
"ldr q21, [x12, x17]\n"
"ldr q22, [x12, x5]\n"
"ldr q8, [x12, x6]\n"
"sqxtn2 v23.8h, v28.4s\n"
"sqadd v23.8h, v23.8h, v0.8h\n"
"sqxtun v23.8b, v23.8h\n"
"madd x16, x30, x7, %[scratch_block_data]\n"
"rev32 v11.8h, v30.8h\n"
"umax v23.8b, v23.8b, v1.8b\n"
"rev32 v12.8h, v29.8h\n"
"mov v28.16b, v24.16b\n"
"add x12, x16, x9\n"
"umin v23.8b, v23.8b, v2.8b\n"
"trn1 v29.8h, v11.8h, v19.8h\n"
"rev32 v14.8h, v31.8h\n"
"str s23, [x16]\n"
"st1 { v23.s }[1], [x12]\n"
"mov v23.16b, v24.16b\n"
"trn1 v30.8h, v12.8h, v20.8h\n"
"trn1 v31.8h, v13.8h, v21.8h\n"
".word 0x4e9d973c // sdot v28.4s, v25.16b, v29.16b\n"
"rev32 v9.8h, v9.8h\n"
"trn1 v10.8h, v14.8h, v22.8h\n"
".word 0x4e9f9737 // sdot v23.4s, v25.16b, v31.16b\n"
".word 0x4e9e975c // sdot v28.4s, v26.16b, v30.16b\n"
"trn1 v9.8h, v9.8h, v8.8h\n"
".word 0x4e8a9757 // sdot v23.4s, v26.16b, v10.16b\n"
".word 0x4e9f977c // sdot v28.4s, v27.16b, v31.16b\n"
".word 0x4e899777 // sdot v23.4s, v27.16b, v9.16b\n"
"sqrdmulh v28.4s, v28.4s, v3.4s\n"
"sqrdmulh v23.4s, v23.4s, v3.4s\n"
"sqrshl v28.4s, v28.4s, v4.4s\n"
"sqrshl v23.4s, v23.4s, v4.4s\n"
"sqxtn v28.4h, v28.4s\n"
"sqxtn2 v28.8h, v23.4s\n"
"sqadd v23.8h, v28.8h, v0.8h\n"
"sqxtun v23.8b, v23.8h\n"
"add x12, x16, %[function_params]\n"
"umax v23.8b, v23.8b, v1.8b\n"
"add x16, x12, x9\n"
"umin v23.8b, v23.8b, v2.8b\n"
"add x30, x30, #1\n" // =1
"str s23, [x12]\n"
"st1 { v23.s }[1], [x16]\n"
"add x24, x24, x7\n"
"mov v30.16b, v19.16b\n"
"mov v29.16b, v20.16b\n"
"mov v28.16b, v21.16b\n"
"mov v31.16b, v22.16b\n"
"mov v9.16b, v8.16b\n"
"mov v23.16b, v8.16b\n"
DC_KERNEL_NO_MULT_STRIDE_5 ":\n" // Parent Loop BB227_19 Depth=1
// Parent Loop BB227_9 Depth=2
// => This Inner Loop Header: Depth=3
"cmp x30, x19\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_4 "b\n"
"b " DC_KERNEL_NO_MULT_STRIDE_7 "f\n"
DC_KERNEL_NO_MULT_STRIDE_6 ":\n" // in Loop: Header=BB227_7 Depth=3
"mov v8.16b, v24.16b\n"
"mov v10.16b, v24.16b\n"
".word 0x4e9e9728 // sdot v8.4s, v25.16b, v30.16b\n"
".word 0x4e9d9748 // sdot v8.4s, v26.16b, v29.16b\n"
".word 0x4e9c972a // sdot v10.4s, v25.16b, v28.16b\n"
".word 0x4e9c9768 // sdot v8.4s, v27.16b, v28.16b\n"
".word 0x4e9f974a // sdot v10.4s, v26.16b, v31.16b\n"
".word 0x4e89976a // sdot v10.4s, v27.16b, v9.16b\n"
"sqrdmulh v8.4s, v8.4s, v3.4s\n"
"sqrdmulh v10.4s, v10.4s, v3.4s\n"
"sqrshl v8.4s, v8.4s, v4.4s\n"
"sqrshl v10.4s, v10.4s, v4.4s\n"
"sqxtn v8.4h, v8.4s\n"
"sqxtn2 v8.8h, v10.4s\n"
"sqadd v8.8h, v8.8h, v0.8h\n"
"sqxtun v8.8b, v8.8h\n"
"umax v8.8b, v8.8b, v1.8b\n"
"add x12, x28, x24\n"
"rev32 v30.8h, v30.8h\n"
"rev32 v29.8h, v29.8h\n"
"rev32 v28.8h, v28.8h\n"
"rev32 v31.8h, v31.8h\n"
"rev32 v9.8h, v9.8h\n"
"umin v8.8b, v8.8b, v2.8b\n"
"add x30, x30, #1\n" // =1
"trn1 v30.8h, v30.8h, v19.8h\n"
"trn1 v29.8h, v29.8h, v20.8h\n"
"trn1 v31.8h, v31.8h, v22.8h\n"
"trn1 v28.8h, v28.8h, v21.8h\n"
"trn1 v9.8h, v9.8h, v23.8h\n"
"str s8, [x22, x24]\n"
"st1 { v8.s }[1], [x12]\n"
"add x24, x24, x7\n"
DC_KERNEL_NO_MULT_STRIDE_7 ":\n" // Parent Loop BB227_19 Depth=1
// Parent Loop BB227_9 Depth=2
// => This Inner Loop Header: Depth=3
"cmp x30, x11\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_6 "b\n"
// %bb.8: // in Loop: Header=BB227_9 Depth=2
"add x29, x29, #16\n" // =16
"add x25, x25, #1\n" // =1
"add x15, x15, #4\n" // =4
DC_KERNEL_NO_MULT_STRIDE_9 ":\n" // Parent Loop BB227_19 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB227_5 Depth 3
// Child Loop BB227_7 Depth 3
"cmp x25, #2\n" // =2
"b.ne " DC_KERNEL_NO_MULT_STRIDE_3 "b\n"
// %bb.10: // in Loop: Header=BB227_19 Depth=1
// Restore the operands and loop parameters that were used as temporaries.
"ldr %[filter_workspace], [sp, #48]\n" // 8-byte Folded Reload
"ldp %[scratch_block_data], %[output_block_data], [sp, #32]\n" // 16-byte Folded Reload
"ldr x26, [sp, #24]\n" // 8-byte Folded Reload
"ldr w27, [sp, #20]\n" // 4-byte Folded Reload
"ldr x28, [sp, #8]\n" // 8-byte Folded Reload
"b " DC_KERNEL_NO_MULT_STRIDE_18 "f\n"
DC_KERNEL_NO_MULT_STRIDE_11 ":\n" // in Loop: Header=BB227_19 Depth=1
"mul w12, w28, w8\n"
"add x12, %[scratch_block_data], w12, sxtw\n"
"add x16, x12, x10\n"
"ldp q8, q9, [x16]\n"
"add x16, x12, x17\n"
"ldp q24, q25, [x23]\n"
"ldp q26, q27, [x23, #32]\n"
"ldp q28, q29, [x23, #64]\n"
"ldp q10, q12, [x16]\n"
"ldp q30, q31, [%[bias_data]]\n"
"ldp q13, q11, [x12]\n"
"mov x24, xzr\n"
"add x23, x12, #32\n" // =32
"b " DC_KERNEL_NO_MULT_STRIDE_17 "f\n"
DC_KERNEL_NO_MULT_STRIDE_12 ":\n" // in Loop: Header=BB227_17 Depth=2
"cmp w11, w14\n"
"ccmp x21, x15, #0, eq\n"
"b.eq " DC_KERNEL_NO_MULT_STRIDE_14 "f\n"
// %bb.13: // in Loop: Header=BB227_17 Depth=2
"and x12, x15, #0xffffffe0\n"
"add x12, x23, x12\n"
"add x16, x12, x10\n"
"add x25, x12, x17\n"
"ldp q5, q7, [x12]\n"
"ldp q6, q17, [x16]\n"
"ldp q16, q18, [x25]\n"
DC_KERNEL_NO_MULT_STRIDE_14 ":\n" // in Loop: Header=BB227_17 Depth=2
"mov v14.16b, v30.16b\n"
"mov v15.16b, v31.16b\n"
".word 0x4e8d970e // sdot v14.4s, v24.16b, v13.16b\n"
".word 0x4e88974e // sdot v14.4s, v26.16b, v8.16b\n"
".word 0x4e8b972f // sdot v15.4s, v25.16b, v11.16b\n"
".word 0x4e8a978e // sdot v14.4s, v28.16b, v10.16b\n"
".word 0x4e89976f // sdot v15.4s, v27.16b, v9.16b\n"
".word 0x4e8c97af // sdot v15.4s, v29.16b, v12.16b\n"
"sqrdmulh v14.4s, v14.4s, v3.4s\n"
"sqrdmulh v15.4s, v15.4s, v3.4s\n"
"sqrshl v14.4s, v14.4s, v4.4s\n"
"sqrshl v15.4s, v15.4s, v4.4s\n"
"sqxtn v14.4h, v14.4s\n"
"sqxtn2 v14.8h, v15.4s\n"
"sqadd v14.8h, v14.8h, v0.8h\n"
"sqxtun v14.8b, v14.8h\n"
"rev32 v13.8h, v13.8h\n"
"rev32 v8.8h, v8.8h\n"
"rev32 v10.8h, v10.8h\n"
"rev32 v11.8h, v11.8h\n"
"rev32 v9.8h, v9.8h\n"
"rev32 v12.8h, v12.8h\n"
"cmp w13, #1\n" // =1
"umax v14.8b, v14.8b, v1.8b\n"
"trn1 v13.8h, v13.8h, v5.8h\n"
"trn1 v11.8h, v11.8h, v7.8h\n"
"ccmp x21, x15, #0, le\n"
"trn1 v8.8h, v8.8h, v6.8h\n"
"trn1 v9.8h, v9.8h, v17.8h\n"
"trn1 v10.8h, v10.8h, v16.8h\n"
"umin v14.8b, v14.8b, v2.8b\n"
"trn1 v12.8h, v12.8h, v18.8h\n"
"str d14, [x22]\n"
"b.eq " DC_KERNEL_NO_MULT_STRIDE_16 "f\n"
// %bb.15: // in Loop: Header=BB227_17 Depth=2
"mov v14.16b, v30.16b\n"
".word 0x4e8d970e // sdot v14.4s, v24.16b, v13.16b\n"
"mov v13.16b, v31.16b\n"
".word 0x4e8b972d // sdot v13.4s, v25.16b, v11.16b\n"
".word 0x4e88974e // sdot v14.4s, v26.16b, v8.16b\n"
".word 0x4e89976d // sdot v13.4s, v27.16b, v9.16b\n"
".word 0x4e8a978e // sdot v14.4s, v28.16b, v10.16b\n"
".word 0x4e8c97ad // sdot v13.4s, v29.16b, v12.16b\n"
"sqrdmulh v8.4s, v14.4s, v3.4s\n"
"sqrdmulh v9.4s, v13.4s, v3.4s\n"
"sqrshl v8.4s, v8.4s, v4.4s\n"
"sqrshl v9.4s, v9.4s, v4.4s\n"
"sqxtn v8.4h, v8.4s\n"
"sqxtn2 v8.8h, v9.4s\n"
"sqadd v8.8h, v8.8h, v0.8h\n"
"sqxtun v8.8b, v8.8h\n"
"umax v8.8b, v8.8b, v1.8b\n"
"umin v8.8b, v8.8b, v2.8b\n"
"str d8, [x22, %[function_params]]\n"
"mov v13.16b, v5.16b\n"
"mov v8.16b, v6.16b\n"
"mov v10.16b, v16.16b\n"
"mov v11.16b, v7.16b\n"
"mov v9.16b, v17.16b\n"
"mov v12.16b, v18.16b\n"
DC_KERNEL_NO_MULT_STRIDE_16 ":\n" // in Loop: Header=BB227_17 Depth=2
"add x24, x24, #1\n" // =1
"add x22, x22, x7\n"
"add x15, x15, #32\n" // =32
DC_KERNEL_NO_MULT_STRIDE_17 ":\n" // Parent Loop BB227_19 Depth=1
// => This Inner Loop Header: Depth=2
"cmp x24, x11\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_12 "b\n"
DC_KERNEL_NO_MULT_STRIDE_18 ":\n" // in Loop: Header=BB227_19 Depth=1
// Advance the bias pointer by 32 bytes for the next outer iteration. The
// source must be the named operand: the compiler chooses the register for
// %[bias_data], so a literal x2 is not guaranteed to hold it.
"add %[bias_data], %[bias_data], #32\n" // =32
"add x8, x8, #1\n" // =1
DC_KERNEL_NO_MULT_STRIDE_19 ":\n" // =>This Loop Header: Depth=1
// Child Loop BB227_17 Depth 2
// Child Loop BB227_9 Depth 2
// Child Loop BB227_5 Depth 3
// Child Loop BB227_7 Depth 3
"cmp x8, x26\n"
"b.lt " DC_KERNEL_NO_MULT_STRIDE_1 "b\n"
// %bb.20:
// Compiled intrinsics total stack 208, now 64 for spillage only.
"add sp, sp, #64\n" // =208
:
// Outputs.
[ scratch_block_data ] "+r"(scratch_block_data),
[ filter_workspace ] "+r"(filter_workspace),
[ bias_data ] "+r"(bias_data),
[ output_block_data ] "+r"(output_block_data),
// The asm overwrites this register (it is advanced to the output-offset field
// and later reused to hold a scalar), so it must be declared read-write
// rather than input-only.
[ function_params ] "+r"(function_params)
:
// Inputs.
// (none)
:
// Clobbers.
"cc", "memory",
// We use these NEON registers.
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31",
// We use these general-purpose registers.
"x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x16", "x17", "x19", "x21", "x22", "x23", "x24", "x25", "x26", "x27",
"x28", "x29", "x30");
#endif // __linux__
} // NOLINT(readability/fn_size) Manually unrolled.