in tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h [9967:10776]
// AArch64 inline-assembly kernel. The "Folded Spill"/"Folded Reload"
// annotations indicate the instruction stream was derived from compiler output.
//
// What the assembly visibly does:
//  * Reads its loop counts, strides and quantization settings from
//    *function_params at the DP_OFFSET_* byte offsets.
//  * Runs an outer loop DP_OFFSET_DEPTH_MICRO_REPEATS times (returns at once if
//    that count is < 1). Each iteration loads 96 bytes from the filter pointer
//    (six 16-byte vectors) and consumes 32 bytes of bias.
//  * Accumulates with SDOT (emitted as .word encodings), scales the sums with
//    sqrdmulh / sqrshl using vectors loaded via the per-channel multiplier and
//    shift pointers, adds the output offset, narrows with saturation, clamps to
//    [quantized_activation_min, quantized_activation_max] and stores the bytes.
//  * Uses a dedicated path when DP_OFFSET_OUTBOUND_BLOCK_HEIGHT == 4 and a
//    row-by-row path otherwise.
//
// Parameters:
//   scratch_block_data - base address of the int8 input data read by the kernel.
//   filter_workspace   - base address of the int8 filter data.
//   bias_data          - base address of the int32 bias values.
//   output_block_data  - base address the int8 results are stored to.
//   function_params    - parameter block; only read through DP_OFFSET_* offsets.
//
// All five operand registers are reused as scratch inside the assembly, so
// every operand is declared read-write. The pointer arguments are by-value
// locals; their values after the asm statement are not meaningful.
static inline void KernelMacroBlockNeon(
const int8* scratch_block_data, const int8* filter_workspace,
const int32* bias_data, int8* output_block_data,
const DepthwiseConvDotProdParams* function_params) {
// Note that argument registers may be reused after parameter loading.
// x0 %[scratch_block_data]
// x1 %[filter_workspace]
// x2 %[bias_data]
// x3 %[output_block_data]
// x4 %[function_params]
//
// NOTE(review): the assembly below also refers to x0, w1, x3 and w3 by name
// (e.g. "mov w1, #4", "add %[output_block_data], x3, #96"), so it depends on
// the compiler assigning the operands to exactly the registers listed above.
// The constraints do not enforce that assignment and x0-x4 are not in the
// clobber list -- confirm this holds for every supported toolchain.
#define DC_KERNEL_NO_MULT_1 "1"
#define DC_KERNEL_NO_MULT_2 "2"
#define DC_KERNEL_NO_MULT_3 "3"
#define DC_KERNEL_NO_MULT_4 "4"
#define DC_KERNEL_NO_MULT_5 "5"
#define DC_KERNEL_NO_MULT_6 "6"
#define DC_KERNEL_NO_MULT_7 "7"
#define DC_KERNEL_NO_MULT_8 "8"
#define DC_KERNEL_NO_MULT_9 "9"
#define DC_KERNEL_NO_MULT_10 "10"
#define DC_KERNEL_NO_MULT_11 "11"
#define DC_KERNEL_NO_MULT_12 "12"
#define DC_KERNEL_NO_MULT_13 "13"
#define DC_KERNEL_NO_MULT_14 "14"
#define DC_KERNEL_NO_MULT_15 "15"
#define DC_KERNEL_NO_MULT_16 "16"
#define DC_KERNEL_NO_MULT_17 "17"
#define DC_KERNEL_NO_MULT_18 "18"
#define DC_KERNEL_NO_MULT_19 "19"
#define DC_KERNEL_NO_MULT_20 "20"
#define DC_KERNEL_NO_MULT_21 "21"
#define DC_KERNEL_NO_MULT_22 "22"
#define DC_KERNEL_NO_MULT_23 "23"
#define DC_KERNEL_NO_MULT_24 "24"
#define DC_KERNEL_NO_MULT_25 "25"
#define DC_KERNEL_NO_MULT_26 "26"
#define DC_KERNEL_NO_MULT_27 "27"
#define DC_KERNEL_NO_MULT_28 "28"
#define DC_KERNEL_NO_MULT_29 "29"
#define DC_KERNEL_NO_MULT_30 "30"
#define DC_KERNEL_NO_MULT_31 "31"
#define DC_KERNEL_NO_MULT_32 "32"
#define DC_KERNEL_NO_MULT_33 "33"
asm volatile(
// Compiled code used block of 384 for spill out of total stack of 528.
"sub sp, sp, #384\n" // =528
"ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
"str %[scratch_block_data], [sp, #376]\n" // 8-byte Folded Spill
"cmp w8, #1\n" // =1
"str x8, [sp, #56]\n" // 8-byte Folded Spill
// Nothing to do when the depth micro-repeat count is below 1.
"b.lt " DC_KERNEL_NO_MULT_33 "f\n"
// %bb.1:
// One-time setup: load parameters, broadcast the clamp bounds and output
// offset, and precompute the row/column address offsets used by the loops.
"stp xzr, xzr, [sp, #72]\n" // 16-byte Folded Spill
"ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"str xzr, [sp, #88]\n" // 8-byte Folded Spill
"ldpsw x22, x5, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
"ldr x11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_MULTPLIPLIER_PER_CHANNEL) "]\n"
"str w8, [sp, #340]\n" // 4-byte Folded Spill
"ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_MICRO_REPEATS) "]\n"
"ldrb w9, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "]\n"
"str x11, [sp, #40]\n" // 8-byte Folded Spill
"ldr x11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT_PER_CHANNEL) "]\n"
"str w8, [sp, #344]\n" // 4-byte Folded Spill
"ldr w8, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
"ldrsw x7, [%[function_params]]\n"
"str x11, [sp, #32]\n" // 8-byte Folded Spill
"ldrsw x11, [%[function_params], #" STR(DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
"str w8, [sp, #348]\n" // 4-byte Folded Spill
"ldrb w8, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "]\n"
"ldr x26, [sp, #376]\n" // 8-byte Folded Reload
"mov x23, %[output_block_data]\n"
"add x10, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n" // =28
"dup v5.16b, w8\n"
"fmov s3, w8\n"
"lsl x8, x11, #5\n"
"dup v6.16b, w9\n"
"fmov s4, w9\n"
"str x8, [sp, #48]\n" // 8-byte Folded Spill
"add x8, x5, x26\n"
"lsl x9, x7, #1\n"
"ld1r { v0.8h }, [x10]\n"
"add x13, x5, x5, lsl #1\n"
"add x10, x22, x7\n"
"add x28, x8, #32\n" // =32
"add x8, x23, x9\n"
"str x13, [sp, #312]\n" // 8-byte Folded Spill
"add x13, x13, x26\n"
"str x8, [sp, #360]\n" // 8-byte Folded Spill
"add x8, x23, x10\n"
"str x8, [sp, #352]\n" // 8-byte Folded Spill
"add x8, x13, #32\n" // =32
"ldr w6, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
"lsl x12, x5, #2\n"
"add x11, x5, x5, lsl #2\n"
"add x24, x22, x22, lsl #1\n"
"str x8, [sp, #368]\n" // 8-byte Folded Spill
"lsl x8, x5, #1\n"
// From here on %[output_block_data] carries the filter pointer; the output
// base address lives in x23.
"mov %[output_block_data], %[filter_workspace]\n"
"lsl %[filter_workspace], x22, #1\n"
"stp x11, x12, [sp, #296]\n" // 16-byte Folded Spill
"add x11, x11, x26\n"
"add x12, x12, x26\n"
"add x14, x9, x7\n"
"add x15, x9, x24\n"
"stp x8, x5, [sp, #320]\n" // 16-byte Folded Spill
"add x8, x8, x26\n"
"add x10, x11, #32\n" // =32
"add x11, x12, #32\n" // =32
"add x19, x8, #32\n" // =32
"add x12, x14, x24\n"
"add x13, x14, %[filter_workspace]\n"
"add x8, x14, x22\n"
"add x25, x23, x14\n"
"add x14, x23, x15\n"
"add x17, x9, x22\n"
"mov %[scratch_block_data], x19\n"
"mov x19, x14\n"
"add x14, x24, x7\n"
"add x21, x23, x17\n"
"mov w17, w6\n"
"add x15, x23, x14\n"
"add x14, %[filter_workspace], x7\n"
"add x6, x23, x12\n"
"add x12, x23, x13\n"
// The parameter-block pointer is no longer needed; its register is reused to
// hold an output address.
"add %[function_params], x23, x14\n"
"mov x14, x12\n"
"and w12, w17, #0xfffffffe\n"
"str w12, [sp, #20]\n" // 4-byte Folded Spill
"lsl x12, x7, #2\n"
"str x12, [sp, #152]\n" // 8-byte Folded Spill
"add x12, x23, x22\n"
"str x12, [sp, #144]\n" // 8-byte Folded Spill
"add x12, x23, x7\n"
"add x16, x9, %[filter_workspace]\n"
"str x12, [sp, #136]\n" // 8-byte Folded Spill
"add x12, x23, %[filter_workspace]\n"
"dup v7.8b, v3.b[0]\n"
"dup v14.8b, v4.b[0]\n"
"add x20, x23, x16\n"
"mov x13, x15\n"
"add x15, x23, x8\n"
"mov x5, %[filter_workspace]\n"
"str x12, [sp, #128]\n" // 8-byte Folded Spill
"mov x8, x24\n"
"add x12, x23, x24\n"
"mov w1, #4\n"
"stp x23, x12, [sp, #112]\n" // 16-byte Folded Spill
"str x26, [sp, #264]\n" // 8-byte Folded Spill
"str x22, [sp, #200]\n" // 8-byte Folded Spill
"str w17, [sp, #108]\n" // 4-byte Folded Spill
"str %[scratch_block_data], [sp, #96]\n" // 8-byte Folded Spill
"str x23, [sp, #24]\n" // 8-byte Folded Spill
"stp d14, d7, [sp, #160]\n" // 16-byte Folded Spill
"b " DC_KERNEL_NO_MULT_4 "f\n"
DC_KERNEL_NO_MULT_2 ":\n" // in Loop: Header=BB111_4 Depth=1
"mov %[bias_data], x9\n"
DC_KERNEL_NO_MULT_3 ":\n" // in Loop: Header=BB111_4 Depth=1
// Outer-loop latch: advance the spilled input/output cursors and the
// iteration counter, then exit once the counter equals the value at [sp, #56].
"ldr %[output_block_data], [sp, #48]\n" // 8-byte Folded Reload
"ldr x12, [sp, #264]\n" // 8-byte Folded Reload
"ldr x17, [sp, #88]\n" // 8-byte Folded Reload
"add x12, x12, %[output_block_data]\n"
"str x12, [sp, #264]\n" // 8-byte Folded Spill
"ldr x12, [sp, #112]\n" // 8-byte Folded Reload
"add x17, x17, #1\n" // =1
"add x12, x12, #8\n" // =8
"str x12, [sp, #112]\n" // 8-byte Folded Spill
"ldr x12, [sp, #72]\n" // 8-byte Folded Reload
"add x12, x12, %[output_block_data]\n"
"str x12, [sp, #72]\n" // 8-byte Folded Spill
"ldp x12, %[output_block_data], [sp, #56]\n" // 16-byte Folded Reload
"cmp x17, x12\n"
"ldr x12, [sp, #80]\n" // 8-byte Folded Reload
"add x12, x12, #8\n" // =8
"stp x12, x17, [sp, #80]\n" // 16-byte Folded Spill
"ldr w17, [sp, #108]\n" // 4-byte Folded Reload
"b.eq " DC_KERNEL_NO_MULT_33 "f\n"
DC_KERNEL_NO_MULT_4 ":\n" // =>This Loop Header: Depth=1
// Child Loop BB111_29 Depth 2
// Child Loop BB111_32 Depth 2
// Child Loop BB111_20 Depth 2
// Child Loop BB111_22 Depth 3
// Child Loop BB111_25 Depth 4
// Child Loop BB111_7 Depth 2
// Child Loop BB111_9 Depth 3
// Child Loop BB111_15 Depth 3
// Load six filter vectors (96 bytes) and remember the next filter address.
"ldp q16, q15, [%[output_block_data]]\n"
"ldp q17, q3, [%[output_block_data], #32]\n"
"ldp q18, q4, [%[output_block_data], #64]\n"
"cmp w17, #4\n" // =4
"add %[output_block_data], x3, #96\n" // =96
"str %[output_block_data], [sp, #64]\n" // 8-byte Folded Spill
// A block height other than 4 is handled by the row-by-row path at label 16.
"b.ne " DC_KERNEL_NO_MULT_16 "f\n"
// %bb.5: // in Loop: Header=BB111_4 Depth=1
"ldp x24, x12, [sp, #80]\n" // 16-byte Folded Reload
"ldr x17, [sp, #32]\n" // 8-byte Folded Reload
"ldr x26, [sp, #72]\n" // 8-byte Folded Reload
"mov x9, xzr\n"
"lsl w12, w12, #3\n"
"lsl x12, x12, #2\n"
"add x16, x17, x12\n"
"ldr x17, [sp, #40]\n" // 8-byte Folded Reload
"stp q4, q3, [sp, #224]\n" // 32-byte Folded Spill
"str q15, [sp, #176]\n" // 16-byte Folded Spill
"add x12, x17, x12\n"
"stp x12, x16, [sp, #208]\n" // 16-byte Folded Spill
"b " DC_KERNEL_NO_MULT_7 "f\n"
DC_KERNEL_NO_MULT_6 ":\n" // in Loop: Header=BB111_7 Depth=2
// Switch to the second set of filter vectors; this loop runs twice (x9 = 0, 1).
"ldp q18, q17, [sp, #224]\n" // 32-byte Folded Reload
"add x9, x9, #1\n" // =1
"add x26, x26, #16\n" // =16
"cmp x9, #2\n" // =2
"add x24, x24, #4\n" // =4
"mov v16.16b, v15.16b\n"
"b.eq " DC_KERNEL_NO_MULT_3 "b\n"
DC_KERNEL_NO_MULT_7 ":\n" // Parent Loop BB111_4 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB111_9 Depth 3
// Child Loop BB111_15 Depth 3
// v19 = bias, v20 = per-channel shift, v21 = per-channel multiplier,
// v22..v27 = six input vectors for the first column block.
"ldr q19, [%[bias_data]], #16\n"
"ldr x16, [sp, #264]\n" // 8-byte Folded Reload
"lsl x12, x9, #4\n"
"ldr w17, [sp, #344]\n" // 4-byte Folded Reload
"mov v31.16b, v19.16b\n"
"add %[output_block_data], x16, x12\n"
"ldr x16, [sp, #216]\n" // 8-byte Folded Reload
"ldr q22, [%[output_block_data]]\n"
"mov v8.16b, v19.16b\n"
"mov v9.16b, v19.16b\n"
"ldr q20, [x16, x12]\n"
"ldr x16, [sp, #208]\n" // 8-byte Folded Reload
"mov v10.16b, v19.16b\n"
"cmp w17, #1\n" // =1
"ldr q21, [x16, x12]\n"
"ldr x12, [sp, #328]\n" // 8-byte Folded Reload
"ldr q27, [%[output_block_data], x12]\n"
"ldr x12, [sp, #320]\n" // 8-byte Folded Reload
"ldr q26, [%[output_block_data], x12]\n"
"ldr x12, [sp, #312]\n" // 8-byte Folded Reload
".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n"
"ldr q25, [%[output_block_data], x12]\n"
"ldr x12, [sp, #304]\n" // 8-byte Folded Reload
".word 0x4e9a9628 // sdot v8.4s, v17.16b, v26.16b\n"
".word 0x4e9a9609 // sdot v9.4s, v16.16b, v26.16b\n"
".word 0x4e99960a // sdot v10.4s, v16.16b, v25.16b\n"
"ldr q24, [%[output_block_data], x12]\n"
"ldr x12, [sp, #296]\n" // 8-byte Folded Reload
"ldr q23, [%[output_block_data], x12]\n"
"b.lt " DC_KERNEL_NO_MULT_11 "f\n"
// %bb.8: // in Loop: Header=BB111_7 Depth=2
"stp x24, x9, [sp, #280]\n" // 16-byte Folded Spill
"ldr w12, [sp, #344]\n" // 4-byte Folded Reload
"mov x17, x24\n"
"str x26, [sp, #272]\n" // 8-byte Folded Spill
"mov x22, x26\n"
"ldp x27, x24, [sp, #144]\n" // 16-byte Folded Reload
"ldp x26, %[filter_workspace], [sp, #128]\n" // 16-byte Folded Reload
"ldr x16, [sp, #120]\n" // 8-byte Folded Reload
"shl v28.4s, v16.4s, #8\n"
"shl v29.4s, v17.4s, #8\n"
"shl v30.4s, v18.4s, #8\n"
"mov v11.16b, v23.16b\n"
"mov v12.16b, v24.16b\n"
"mov v13.16b, v27.16b\n"
"mov v14.16b, v22.16b\n"
DC_KERNEL_NO_MULT_9 ":\n" // Parent Loop BB111_4 Depth=1
// Parent Loop BB111_7 Depth=2
// => This Inner Loop Header: Depth=3
// Full-width loop body (w12 iterations): four groups of 4x4 output bytes are
// computed and stored per iteration.
".word 0x4e8e961f // sdot v31.4s, v16.16b, v14.16b\n"
".word 0x4e8d9608 // sdot v8.4s, v16.16b, v13.16b\n"
".word 0x4e999629 // sdot v9.4s, v17.16b, v25.16b\n"
".word 0x4e8d963f // sdot v31.4s, v17.16b, v13.16b\n"
".word 0x4e8c962a // sdot v10.4s, v17.16b, v12.16b\n"
".word 0x4e999648 // sdot v8.4s, v18.16b, v25.16b\n"
".word 0x4e8c9649 // sdot v9.4s, v18.16b, v12.16b\n"
"sqrdmulh v31.4s, v31.4s, v21.4s\n"
".word 0x4e8b964a // sdot v10.4s, v18.16b, v11.16b\n"
"sqrdmulh v8.4s, v8.4s, v21.4s\n"
"sqrdmulh v9.4s, v9.4s, v21.4s\n"
"sqrshl v31.4s, v31.4s, v20.4s\n"
"sqrdmulh v10.4s, v10.4s, v21.4s\n"
"sqrshl v8.4s, v8.4s, v20.4s\n"
"sqrshl v9.4s, v9.4s, v20.4s\n"
"sqxtn v31.4h, v31.4s\n"
"sqrshl v10.4s, v10.4s, v20.4s\n"
"sqxtn v9.4h, v9.4s\n"
"sqxtn2 v31.8h, v8.4s\n"
"sqxtn2 v9.8h, v10.4s\n"
"sqadd v31.8h, v31.8h, v0.8h\n"
"sqadd v8.8h, v9.8h, v0.8h\n"
"sqxtn v31.8b, v31.8h\n"
"sqxtn2 v31.16b, v8.8h\n"
"smax v31.16b, v31.16b, v5.16b\n"
"add %[output_block_data], x27, x17\n"
"smin v31.16b, v31.16b, v6.16b\n"
"str s31, [x23, x17]\n"
"st1 { v31.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x26, x17\n"
"st1 { v31.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x16, x17\n"
"st1 { v31.s }[3], [%[output_block_data]]\n"
"ldr %[output_block_data], [sp, #376]\n" // 8-byte Folded Reload
"mov v10.16b, v19.16b\n"
"mov v31.16b, v19.16b\n"
"mov v8.16b, v19.16b\n"
"ldr x9, [sp, #352]\n" // 8-byte Folded Reload
".word 0x4e99978a // sdot v10.4s, v28.16b, v25.16b\n"
".word 0x4e8e979f // sdot v31.4s, v28.16b, v14.16b\n"
".word 0x4e8d9788 // sdot v8.4s, v28.16b, v13.16b\n"
".word 0x4e8c97aa // sdot v10.4s, v29.16b, v12.16b\n"
"mov v9.16b, v19.16b\n"
".word 0x4e8d97bf // sdot v31.4s, v29.16b, v13.16b\n"
".word 0x4e9a97a8 // sdot v8.4s, v29.16b, v26.16b\n"
".word 0x4e8b97ca // sdot v10.4s, v30.16b, v11.16b\n"
"add %[output_block_data], x3, x22\n"
"rev32 v2.8h, v26.8h\n"
".word 0x4e9a9789 // sdot v9.4s, v28.16b, v26.16b\n"
".word 0x4e9a97df // sdot v31.4s, v30.16b, v26.16b\n"
".word 0x4e9997c8 // sdot v8.4s, v30.16b, v25.16b\n"
"sqrdmulh v26.4s, v10.4s, v21.4s\n"
"rev32 v15.8h, v22.8h\n"
"ldr q22, [%[output_block_data], #32]\n"
"add %[output_block_data], x9, x17\n"
"rev32 v4.8h, v24.8h\n"
".word 0x4e9997a9 // sdot v9.4s, v29.16b, v25.16b\n"
"sqrdmulh v24.4s, v8.4s, v21.4s\n"
"sqrshl v8.4s, v26.4s, v20.4s\n"
"ldr q26, [%[scratch_block_data], x22]\n"
"mov x9, %[scratch_block_data]\n"
"ldr %[scratch_block_data], [sp, #368]\n" // 8-byte Folded Reload
// v5/v6 (clamp min/max) are parked in v6/v7 for the rest of this iteration
// because v5 is needed as a data register; they are moved back further down.
"mov v7.16b, v6.16b\n"
"mov v6.16b, v5.16b\n"
"rev32 v5.8h, v23.8h\n"
".word 0x4e8c97c9 // sdot v9.4s, v30.16b, v12.16b\n"
"sqrdmulh v23.4s, v31.4s, v21.4s\n"
"rev32 v3.8h, v25.8h\n"
"sqrdmulh v25.4s, v9.4s, v21.4s\n"
"sqrshl v23.4s, v23.4s, v20.4s\n"
"sqrshl v31.4s, v24.4s, v20.4s\n"
"sqrshl v24.4s, v25.4s, v20.4s\n"
"sqxtn v9.4h, v23.4s\n"
"rev32 v1.8h, v27.8h\n"
"sqxtn v10.4h, v24.4s\n"
"ldr q27, [x28, x22]\n"
"ldr q25, [%[scratch_block_data], x22]\n"
"ldr q24, [x11, x22]\n"
"ldr q23, [x10, x22]\n"
"sqxtn2 v9.8h, v31.4s\n"
"sqxtn2 v10.8h, v8.4s\n"
"sqadd v31.8h, v9.8h, v0.8h\n"
"sqadd v8.8h, v10.8h, v0.8h\n"
"sqxtn v31.8b, v31.8h\n"
"sqxtn2 v31.16b, v8.8h\n"
"smax v31.16b, v31.16b, v6.16b\n"
"smin v31.16b, v31.16b, v7.16b\n"
"str s31, [%[filter_workspace], x17]\n"
"st1 { v31.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], %[function_params], x17\n"
"st1 { v31.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x13, x17\n"
"mov v8.16b, v19.16b\n"
"st1 { v31.s }[3], [%[output_block_data]]\n"
"trn1 v31.8h, v15.8h, v22.8h\n"
"mov v9.16b, v19.16b\n"
"mov v10.16b, v19.16b\n"
"trn1 v1.8h, v1.8h, v27.8h\n"
"trn1 v2.8h, v2.8h, v26.8h\n"
".word 0x4e9f9608 // sdot v8.4s, v16.16b, v31.16b\n"
"mov v11.16b, v19.16b\n"
"trn1 v3.8h, v3.8h, v25.8h\n"
".word 0x4e819609 // sdot v9.4s, v16.16b, v1.16b\n"
".word 0x4e82960a // sdot v10.4s, v16.16b, v2.16b\n"
".word 0x4e819628 // sdot v8.4s, v17.16b, v1.16b\n"
"trn1 v4.8h, v4.8h, v24.8h\n"
".word 0x4e83960b // sdot v11.4s, v16.16b, v3.16b\n"
".word 0x4e829629 // sdot v9.4s, v17.16b, v2.16b\n"
".word 0x4e83962a // sdot v10.4s, v17.16b, v3.16b\n"
".word 0x4e829648 // sdot v8.4s, v18.16b, v2.16b\n"
"trn1 v5.8h, v5.8h, v23.8h\n"
".word 0x4e84962b // sdot v11.4s, v17.16b, v4.16b\n"
".word 0x4e839649 // sdot v9.4s, v18.16b, v3.16b\n"
".word 0x4e84964a // sdot v10.4s, v18.16b, v4.16b\n"
"sqrdmulh v8.4s, v8.4s, v21.4s\n"
".word 0x4e85964b // sdot v11.4s, v18.16b, v5.16b\n"
"sqrdmulh v9.4s, v9.4s, v21.4s\n"
"sqrdmulh v10.4s, v10.4s, v21.4s\n"
"sqrshl v8.4s, v8.4s, v20.4s\n"
"sqrdmulh v11.4s, v11.4s, v21.4s\n"
"sqrshl v9.4s, v9.4s, v20.4s\n"
"sqrshl v10.4s, v10.4s, v20.4s\n"
"sqxtn v8.4h, v8.4s\n"
"sqrshl v11.4s, v11.4s, v20.4s\n"
"sqxtn v10.4h, v10.4s\n"
"sqxtn2 v8.8h, v9.4s\n"
"sqxtn2 v10.8h, v11.4s\n"
"sqadd v8.8h, v8.8h, v0.8h\n"
"sqadd v9.8h, v10.8h, v0.8h\n"
"sqxtn v8.8b, v8.8h\n"
"sqxtn2 v8.16b, v9.8h\n"
"mov v9.16b, v19.16b\n"
"ldr %[scratch_block_data], [sp, #360]\n" // 8-byte Folded Reload
"mov v10.16b, v19.16b\n"
"mov v11.16b, v19.16b\n"
".word 0x4e9f9789 // sdot v9.4s, v28.16b, v31.16b\n"
"mov v12.16b, v19.16b\n"
".word 0x4e81978a // sdot v10.4s, v28.16b, v1.16b\n"
".word 0x4e82978b // sdot v11.4s, v28.16b, v2.16b\n"
".word 0x4e8197a9 // sdot v9.4s, v29.16b, v1.16b\n"
"smax v8.16b, v8.16b, v6.16b\n"
".word 0x4e83978c // sdot v12.4s, v28.16b, v3.16b\n"
".word 0x4e8297aa // sdot v10.4s, v29.16b, v2.16b\n"
".word 0x4e8397ab // sdot v11.4s, v29.16b, v3.16b\n"
".word 0x4e8297c9 // sdot v9.4s, v30.16b, v2.16b\n"
"add %[output_block_data], x21, x17\n"
"smin v8.16b, v8.16b, v7.16b\n"
".word 0x4e8497ac // sdot v12.4s, v29.16b, v4.16b\n"
".word 0x4e8397ca // sdot v10.4s, v30.16b, v3.16b\n"
".word 0x4e8497cb // sdot v11.4s, v30.16b, v4.16b\n"
"sqrdmulh v1.4s, v9.4s, v21.4s\n"
"str s8, [%[scratch_block_data], x17]\n"
"st1 { v8.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x20, x17\n"
".word 0x4e8597cc // sdot v12.4s, v30.16b, v5.16b\n"
"sqrdmulh v2.4s, v10.4s, v21.4s\n"
"sqrdmulh v3.4s, v11.4s, v21.4s\n"
"sqrshl v1.4s, v1.4s, v20.4s\n"
"st1 { v8.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x19, x17\n"
"sqrdmulh v4.4s, v12.4s, v21.4s\n"
"sqrshl v2.4s, v2.4s, v20.4s\n"
"sqrshl v3.4s, v3.4s, v20.4s\n"
"sqxtn v1.4h, v1.4s\n"
"st1 { v8.s }[3], [%[output_block_data]]\n"
"sqrshl v4.4s, v4.4s, v20.4s\n"
"sqxtn v3.4h, v3.4s\n"
"sqxtn2 v1.8h, v2.4s\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqadd v1.8h, v1.8h, v0.8h\n"
"sqadd v2.8h, v3.8h, v0.8h\n"
"sqxtn v1.8b, v1.8h\n"
// Restore the clamp minimum to v5 (the maximum returns to v6 below).
"mov v5.16b, v6.16b\n"
"sqxtn2 v1.16b, v2.8h\n"
"smax v1.16b, v1.16b, v5.16b\n"
"add %[output_block_data], x15, x17\n"
"smin v1.16b, v1.16b, v7.16b\n"
"str s1, [x25, x17]\n"
"st1 { v1.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x14, x17\n"
"mov v31.16b, v19.16b\n"
"mov v8.16b, v19.16b\n"
"mov v9.16b, v19.16b\n"
"mov v10.16b, v19.16b\n"
"mov %[scratch_block_data], x9\n"
"mov v6.16b, v7.16b\n"
"st1 { v1.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x6, x17\n"
"subs w12, w12, #1\n" // =1
"add x22, x22, #32\n" // =32
".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n"
".word 0x4e9a9628 // sdot v8.4s, v17.16b, v26.16b\n"
".word 0x4e9a9609 // sdot v9.4s, v16.16b, v26.16b\n"
".word 0x4e99960a // sdot v10.4s, v16.16b, v25.16b\n"
"add x17, x17, x24\n"
"mov v11.16b, v23.16b\n"
"mov v12.16b, v24.16b\n"
"mov v13.16b, v27.16b\n"
"mov v14.16b, v22.16b\n"
"st1 { v1.s }[3], [%[output_block_data]]\n"
"b.ne " DC_KERNEL_NO_MULT_9 "b\n"
// %bb.10: // in Loop: Header=BB111_7 Depth=2
// Restore the registers the full-width loop borrowed, then handle the
// residual width (value spilled at [sp, #348]) if it is positive.
"ldr x12, [sp, #376]\n" // 8-byte Folded Reload
"ldp d14, d7, [sp, #160]\n" // 16-byte Folded Reload
"ldr q15, [sp, #176]\n" // 16-byte Folded Reload
"ldp x24, x9, [sp, #280]\n" // 16-byte Folded Reload
"add %[output_block_data], x12, x22\n"
"ldr x22, [sp, #200]\n" // 8-byte Folded Reload
"ldr x26, [sp, #272]\n" // 8-byte Folded Reload
"add x12, x23, x17\n"
"mov w1, #4\n"
"ldr w17, [sp, #348]\n" // 4-byte Folded Reload
"cmp w17, #0\n" // =0
"b.gt " DC_KERNEL_NO_MULT_12 "f\n"
"b " DC_KERNEL_NO_MULT_6 "b\n"
DC_KERNEL_NO_MULT_11 ":\n" // in Loop: Header=BB111_7 Depth=2
"ldr x12, [sp, #112]\n" // 8-byte Folded Reload
"add x12, x12, x9, lsl #2\n"
"ldr w17, [sp, #348]\n" // 4-byte Folded Reload
"cmp w17, #0\n" // =0
"b.le " DC_KERNEL_NO_MULT_6 "b\n"
DC_KERNEL_NO_MULT_12 ":\n" // in Loop: Header=BB111_7 Depth=2
// Residual columns: the "next" input vectors default to zero and are only
// loaded from memory when the residual width is at least 3.
"ldr w17, [sp, #348]\n" // 4-byte Folded Reload
"movi v28.16b, #0\n"
"movi v29.16b, #0\n"
"movi v30.16b, #0\n"
"cmp w17, #3\n" // =3
"movi v11.16b, #0\n"
"movi v12.16b, #0\n"
"movi v13.16b, #0\n"
"b.lt " DC_KERNEL_NO_MULT_14 "f\n"
// %bb.13: // in Loop: Header=BB111_7 Depth=2
"add x17, %[output_block_data], #32\n" // =32
"ldp x16, %[output_block_data], [sp, #320]\n" // 16-byte Folded Reload
"ldr q13, [x17]\n"
"ldr %[scratch_block_data], [sp, #96]\n" // 8-byte Folded Reload
"ldr q12, [x17, %[output_block_data]]\n"
"ldr %[output_block_data], [sp, #312]\n" // 8-byte Folded Reload
"ldr q11, [x17, x16]\n"
"ldr q30, [x17, %[output_block_data]]\n"
"ldr %[output_block_data], [sp, #304]\n" // 8-byte Folded Reload
"ldr q29, [x17, %[output_block_data]]\n"
"ldr %[output_block_data], [sp, #296]\n" // 8-byte Folded Reload
"ldr q28, [x17, %[output_block_data]]\n"
DC_KERNEL_NO_MULT_14 ":\n" // in Loop: Header=BB111_7 Depth=2
"ldr w17, [sp, #348]\n" // 4-byte Folded Reload
DC_KERNEL_NO_MULT_15 ":\n" // Parent Loop BB111_4 Depth=1
// Parent Loop BB111_7 Depth=2
// => This Inner Loop Header: Depth=3
// One output column per iteration; afterwards each input vector is shifted
// right by one byte per 32-bit lane and refilled from the "next" vectors.
".word 0x4e96961f // sdot v31.4s, v16.16b, v22.16b\n"
".word 0x4e9b9608 // sdot v8.4s, v16.16b, v27.16b\n"
".word 0x4e999629 // sdot v9.4s, v17.16b, v25.16b\n"
".word 0x4e9b963f // sdot v31.4s, v17.16b, v27.16b\n"
".word 0x4e98962a // sdot v10.4s, v17.16b, v24.16b\n"
".word 0x4e999648 // sdot v8.4s, v18.16b, v25.16b\n"
".word 0x4e989649 // sdot v9.4s, v18.16b, v24.16b\n"
"sqrdmulh v1.4s, v31.4s, v21.4s\n"
".word 0x4e97964a // sdot v10.4s, v18.16b, v23.16b\n"
"sqrdmulh v2.4s, v8.4s, v21.4s\n"
"sqrdmulh v3.4s, v9.4s, v21.4s\n"
"sqrshl v1.4s, v1.4s, v20.4s\n"
"sqrdmulh v4.4s, v10.4s, v21.4s\n"
"sqrshl v2.4s, v2.4s, v20.4s\n"
"sqrshl v3.4s, v3.4s, v20.4s\n"
"sqxtn v1.4h, v1.4s\n"
"sqrshl v4.4s, v4.4s, v20.4s\n"
"sqxtn v3.4h, v3.4s\n"
"sqxtn2 v1.8h, v2.4s\n"
"sqxtn2 v3.8h, v4.4s\n"
"sqadd v1.8h, v1.8h, v0.8h\n"
"sqadd v2.8h, v3.8h, v0.8h\n"
"sqxtn v1.8b, v1.8h\n"
"sqxtn2 v1.16b, v2.8h\n"
"smax v1.16b, v1.16b, v5.16b\n"
"add %[output_block_data], x12, x22\n"
"smin v1.16b, v1.16b, v6.16b\n"
"ushr v26.4s, v26.4s, #8\n"
"ushr v25.4s, v25.4s, #8\n"
"str s1, [x12]\n"
"st1 { v1.s }[1], [%[output_block_data]]\n"
"add %[output_block_data], x12, x5\n"
"ushr v22.4s, v22.4s, #8\n"
"ushr v27.4s, v27.4s, #8\n"
"sli v26.4s, v11.4s, #24\n"
"ushr v24.4s, v24.4s, #8\n"
"ushr v23.4s, v23.4s, #8\n"
"sli v25.4s, v30.4s, #24\n"
"mov v31.16b, v19.16b\n"
"mov v8.16b, v19.16b\n"
"mov v9.16b, v19.16b\n"
"mov v10.16b, v19.16b\n"
"st1 { v1.s }[2], [%[output_block_data]]\n"
"add %[output_block_data], x12, x8\n"
"subs w17, w17, #1\n" // =1
"sli v22.4s, v13.4s, #24\n"
"ushr v13.4s, v13.4s, #8\n"
"ushr v11.4s, v11.4s, #8\n"
"sli v27.4s, v12.4s, #24\n"
"ushr v12.4s, v12.4s, #8\n"
"ushr v30.4s, v30.4s, #8\n"
"sli v24.4s, v29.4s, #24\n"
"ushr v29.4s, v29.4s, #8\n"
"sli v23.4s, v28.4s, #24\n"
"ushr v28.4s, v28.4s, #8\n"
".word 0x4e9a965f // sdot v31.4s, v18.16b, v26.16b\n"
".word 0x4e9a9628 // sdot v8.4s, v17.16b, v26.16b\n"
".word 0x4e9a9609 // sdot v9.4s, v16.16b, v26.16b\n"
"add x12, x12, x7\n"
".word 0x4e99960a // sdot v10.4s, v16.16b, v25.16b\n"
"st1 { v1.s }[3], [%[output_block_data]]\n"
"b.ne " DC_KERNEL_NO_MULT_15 "b\n"
"b " DC_KERNEL_NO_MULT_6 "b\n"
DC_KERNEL_NO_MULT_16 ":\n" // in Loop: Header=BB111_4 Depth=1
// Row-by-row path (block height != 4). x9 = bias pointer for the next outer
// iteration. A block height below 1 skips straight to the outer-loop latch.
"cmp w17, #1\n" // =1
"add x9, %[bias_data], #32\n" // =32
"b.lt " DC_KERNEL_NO_MULT_2 "b\n"
// %bb.17: // in Loop: Header=BB111_4 Depth=1
"ldr w12, [sp, #340]\n" // 4-byte Folded Reload
"cmp w12, #1\n" // =1
"b.lt " DC_KERNEL_NO_MULT_27 "f\n"
// %bb.18: // in Loop: Header=BB111_4 Depth=1
// v19/v20 = bias, v21/v22 = per-channel shift, v23/v24 = per-channel
// multiplier for the eight channels of this depth micro block.
"ldr x12, [sp, #88]\n" // 8-byte Folded Reload
"ldp x17, %[output_block_data], [sp, #32]\n" // 16-byte Folded Reload
"str x9, [sp, #288]\n" // 8-byte Folded Spill
"ldp q19, q20, [%[bias_data]]\n"
"lsl w12, w12, #3\n"
"lsl x12, x12, #2\n"
"add x17, x17, x12\n"
"add x12, %[output_block_data], x12\n"
"ldp q21, q22, [x17]\n"
"ldp q23, q24, [x12]\n"
"ldr x9, [sp, #264]\n" // 8-byte Folded Reload
"ldr x27, [sp, #112]\n" // 8-byte Folded Reload
"mov w26, wzr\n"
"b " DC_KERNEL_NO_MULT_20 "f\n"
DC_KERNEL_NO_MULT_19 ":\n" // in Loop: Header=BB111_20 Depth=2
// Row-loop latch: next output row; stop after the block-height row count.
"ldr w12, [sp, #108]\n" // 4-byte Folded Reload
"ldr x22, [sp, #200]\n" // 8-byte Folded Reload
"add w26, w26, #1\n" // =1
"cmp w26, w12\n"
"add x27, x27, x22\n"
"b.eq " DC_KERNEL_NO_MULT_26 "f\n"
DC_KERNEL_NO_MULT_20 ":\n" // Parent Loop BB111_4 Depth=1
// => This Loop Header: Depth=2
// Child Loop BB111_22 Depth 3
// Child Loop BB111_25 Depth 4
"ldp x16, %[output_block_data], [sp, #320]\n" // 16-byte Folded Reload
"ldp q25, q26, [x9]\n"
"mov w12, wzr\n"
"mov x17, x9\n"
"add %[scratch_block_data], x9, %[output_block_data]\n"
"add %[output_block_data], x9, x16\n"
"ldp q27, q28, [%[scratch_block_data]]\n"
"ldp q29, q30, [%[output_block_data]]\n"
"mov x9, %[scratch_block_data]\n"
"mov x22, x27\n"
"b " DC_KERNEL_NO_MULT_22 "f\n"
DC_KERNEL_NO_MULT_21 ":\n" // in Loop: Header=BB111_22 Depth=3
"ldr w16, [sp, #340]\n" // 4-byte Folded Reload
"add w12, w12, #1\n" // =1
"mov x17, %[scratch_block_data]\n"
"cmp w12, w16\n"
"b.eq " DC_KERNEL_NO_MULT_19 "b\n"
DC_KERNEL_NO_MULT_22 ":\n" // Parent Loop BB111_4 Depth=1
// Parent Loop BB111_20 Depth=2
// => This Loop Header: Depth=3
// Child Loop BB111_25 Depth 4
// w3 = number of output columns for this micro block: the residual width
// when w12 equals the value at [sp, #344], otherwise 4 (held in w1).
"ldr w16, [sp, #344]\n" // 4-byte Folded Reload
"add %[scratch_block_data], x17, #32\n" // =32
"cmp w12, w16\n"
"ldr w16, [sp, #348]\n" // 4-byte Folded Reload
"csel w3, w16, w1, eq\n"
"cmp w3, #3\n" // =3
"b.ge " DC_KERNEL_NO_MULT_24 "f\n"
// %bb.23: // in Loop: Header=BB111_22 Depth=3
"movi v31.16b, #0\n"
"cmp w3, #1\n" // =1
"movi v8.16b, #0\n"
"movi v9.16b, #0\n"
"movi v11.16b, #0\n"
"movi v12.16b, #0\n"
"movi v10.16b, #0\n"
"b.ge " DC_KERNEL_NO_MULT_25 "f\n"
"b " DC_KERNEL_NO_MULT_21 "b\n"
DC_KERNEL_NO_MULT_24 ":\n" // in Loop: Header=BB111_22 Depth=3
// Load the "next" input vectors. The register moves below shuffle several
// general-purpose registers and then move them back; x10, x11, x13-x15 and
// x19-x21 hold their original values again at label 25.
"ldr x24, [sp, #328]\n" // 8-byte Folded Reload
"mov x16, x11\n"
"mov x11, x10\n"
"mov x10, %[scratch_block_data]\n"
"add x24, %[scratch_block_data], x24\n"
"ldr %[scratch_block_data], [sp, #320]\n" // 8-byte Folded Reload
"ldp q10, q9, [x17, #32]\n"
"ldp q12, q8, [x24]\n"
"mov x23, x15\n"
"add %[scratch_block_data], x10, x0\n"
"ldp q11, q31, [%[scratch_block_data]]\n"
"mov x15, x14\n"
"mov x14, x6\n"
"mov %[bias_data], x13\n"
"mov x13, x21\n"
"mov x21, x20\n"
"mov x20, x19\n"
"mov x19, x25\n"
"mov x19, x20\n"
"mov x20, x21\n"
"mov x21, x13\n"
"mov x13, %[bias_data]\n"
"mov x14, x15\n"
"mov x15, x23\n"
"mov %[scratch_block_data], x10\n"
"mov x10, x11\n"
"mov x11, x16\n"
DC_KERNEL_NO_MULT_25 ":\n" // Parent Loop BB111_4 Depth=1
// Parent Loop BB111_20 Depth=2
// Parent Loop BB111_22 Depth=3
// => This Inner Loop Header: Depth=4
// One 8-byte store per iteration (w3 iterations), clamped with the 8-byte
// copies of the activation bounds in v7 (min) and v14 (max).
"mov v1.16b, v19.16b\n"
"mov v2.16b, v20.16b\n"
".word 0x4e999601 // sdot v1.4s, v16.16b, v25.16b\n"
".word 0x4e9a95e2 // sdot v2.4s, v15.16b, v26.16b\n"
".word 0x4e9b9621 // sdot v1.4s, v17.16b, v27.16b\n"
".word 0x4e9c9462 // sdot v2.4s, v3.16b, v28.16b\n"
".word 0x4e9d9641 // sdot v1.4s, v18.16b, v29.16b\n"
".word 0x4e9e9482 // sdot v2.4s, v4.16b, v30.16b\n"
"sqrdmulh v1.4s, v1.4s, v23.4s\n"
"sqrdmulh v2.4s, v2.4s, v24.4s\n"
"sqrshl v1.4s, v1.4s, v21.4s\n"
"sqrshl v2.4s, v2.4s, v22.4s\n"
"sqxtn v1.4h, v1.4s\n"
"sqxtn2 v1.8h, v2.4s\n"
"sqadd v1.8h, v1.8h, v0.8h\n"
"sqxtn v1.8b, v1.8h\n"
"smax v1.8b, v1.8b, v7.8b\n"
"ushr v25.4s, v25.4s, #8\n"
"ushr v26.4s, v26.4s, #8\n"
"ushr v27.4s, v27.4s, #8\n"
"ushr v28.4s, v28.4s, #8\n"
"ushr v29.4s, v29.4s, #8\n"
"ushr v30.4s, v30.4s, #8\n"
"smin v1.8b, v1.8b, v14.8b\n"
"subs w3, w3, #1\n" // =1
"sli v25.4s, v10.4s, #24\n"
"ushr v10.4s, v10.4s, #8\n"
"sli v26.4s, v9.4s, #24\n"
"ushr v9.4s, v9.4s, #8\n"
"sli v27.4s, v12.4s, #24\n"
"ushr v12.4s, v12.4s, #8\n"
"sli v28.4s, v8.4s, #24\n"
"ushr v8.4s, v8.4s, #8\n"
"sli v29.4s, v11.4s, #24\n"
"ushr v11.4s, v11.4s, #8\n"
"sli v30.4s, v31.4s, #24\n"
"ushr v31.4s, v31.4s, #8\n"
"str d1, [x22]\n"
"add x22, x22, x7\n"
"b.ne " DC_KERNEL_NO_MULT_25 "b\n"
"b " DC_KERNEL_NO_MULT_21 "b\n"
DC_KERNEL_NO_MULT_26 ":\n" // in Loop: Header=BB111_4 Depth=1
// Row-by-row path finished: restore the registers it borrowed.
"ldr %[bias_data], [sp, #288]\n" // 8-byte Folded Reload
"ldr x23, [sp, #24]\n" // 8-byte Folded Reload
"ldr %[scratch_block_data], [sp, #96]\n" // 8-byte Folded Reload
"b " DC_KERNEL_NO_MULT_3 "b\n"
DC_KERNEL_NO_MULT_27 ":\n" // in Loop: Header=BB111_4 Depth=1
// Reached when the value spilled at [sp, #340] is below 1. The loops below
// have empty bodies: they only count w12 down and perform no loads or stores.
"ldr w12, [sp, #20]\n" // 4-byte Folded Reload
"cmp w17, #2\n" // =2
"b.hs " DC_KERNEL_NO_MULT_29 "f\n"
// %bb.28: // in Loop: Header=BB111_4 Depth=1
"mov w12, wzr\n"
"b " DC_KERNEL_NO_MULT_31 "f\n"
DC_KERNEL_NO_MULT_29 ":\n" // Parent Loop BB111_4 Depth=1
// => This Inner Loop Header: Depth=2
"subs w12, w12, #2\n" // =2
"b.ne " DC_KERNEL_NO_MULT_29 "b\n"
// %bb.30: // in Loop: Header=BB111_4 Depth=1
"ldr w12, [sp, #20]\n" // 4-byte Folded Reload
"cmp w17, w12\n"
"b.eq " DC_KERNEL_NO_MULT_2 "b\n"
DC_KERNEL_NO_MULT_31 ":\n" // in Loop: Header=BB111_4 Depth=1
"sub w12, w17, w12\n"
DC_KERNEL_NO_MULT_32 ":\n" // Parent Loop BB111_4 Depth=1
// => This Inner Loop Header: Depth=2
"subs w12, w12, #1\n" // =1
"b.ne " DC_KERNEL_NO_MULT_32 "b\n"
"b " DC_KERNEL_NO_MULT_2 "b\n"
DC_KERNEL_NO_MULT_33 ":\n"
// Compiled intrinsics total stack 528, now 384 for spillage only.
"add sp, sp, #384\n" // =528
:
// Outputs.
[ scratch_block_data ] "+r"(scratch_block_data),
[ filter_workspace ] "+r"(filter_workspace),
[ bias_data ] "+r"(bias_data),
[ output_block_data ] "+r"(output_block_data),
// Read-write rather than input-only: the assembly overwrites this register
// ("add %[function_params], x23, x14") once the parameters have been loaded,
// and input-only operands must not be modified.
[ function_params ] "+r"(function_params)
:
// Inputs: none, every operand is read-write.
:
// Clobbers.
"cc", "memory",
// We use these NEON registers.
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
"v31",
// We use these general-purpose registers.
"x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
"x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
"x27", "x28");
#undef DC_KERNEL_NO_MULT_1
#undef DC_KERNEL_NO_MULT_2
#undef DC_KERNEL_NO_MULT_3
#undef DC_KERNEL_NO_MULT_4
#undef DC_KERNEL_NO_MULT_5
#undef DC_KERNEL_NO_MULT_6
#undef DC_KERNEL_NO_MULT_7
#undef DC_KERNEL_NO_MULT_8
#undef DC_KERNEL_NO_MULT_9
#undef DC_KERNEL_NO_MULT_10
#undef DC_KERNEL_NO_MULT_11
#undef DC_KERNEL_NO_MULT_12
#undef DC_KERNEL_NO_MULT_13
#undef DC_KERNEL_NO_MULT_14
#undef DC_KERNEL_NO_MULT_15
#undef DC_KERNEL_NO_MULT_16
#undef DC_KERNEL_NO_MULT_17
#undef DC_KERNEL_NO_MULT_18
#undef DC_KERNEL_NO_MULT_19
#undef DC_KERNEL_NO_MULT_20
#undef DC_KERNEL_NO_MULT_21
#undef DC_KERNEL_NO_MULT_22
#undef DC_KERNEL_NO_MULT_23
#undef DC_KERNEL_NO_MULT_24
#undef DC_KERNEL_NO_MULT_25
#undef DC_KERNEL_NO_MULT_26
#undef DC_KERNEL_NO_MULT_27
#undef DC_KERNEL_NO_MULT_28
#undef DC_KERNEL_NO_MULT_29
#undef DC_KERNEL_NO_MULT_30
#undef DC_KERNEL_NO_MULT_31
#undef DC_KERNEL_NO_MULT_32
#undef DC_KERNEL_NO_MULT_33
} // NOLINT(readability/fn_size) Manually unrolled.