in tensorflow/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h [7169:7833]
// Processes one macro block of a 3x3-filter depthwise convolution with a single
// hand-scheduled AArch64 inline-assembly routine (the DC_KERNEL_NO_MULT label
// prefix suggests this is the "no depth multiplication" variant).
//
// Arguments:
//   scratch_block_data  Input rows. Rows are read at multiples of the value
//                       loaded from DP_OFFSET_OUTPUT_HEIGHT_STRIDE + 4 (x21).
//   filter_workspace    Filter data; 96 bytes are consumed per outer-loop
//                       iteration.
//   bias_data           Bias values; 32 bytes are consumed per outer-loop
//                       iteration.
//   output_block_data   Destination; 8 bytes of depth are produced per
//                       outer-loop iteration.
//   function_params     Parameter block, read through the DP_OFFSET_* offsets.
//
// Structure of the assembly:
//   * The outer loop (label 26) runs once per depth micro repeat.
//   * When the outbound block height is exactly 4, labels 3..13 compute four
//     output rows at a time in two passes over 4-channel halves. Otherwise
//     labels 14..24 produce one output row at a time.
//   * Each accumulator is seeded with the bias, accumulated with sdot (emitted
//     as .word so that assemblers without dot-product support accept it),
//     scaled with sqrdmulh/sqrshl, offset, narrowed to 8 bits and clamped to
//     the quantized activation min/max before being stored.
//
// The operand registers are recycled as scratch registers once the parameters
// have been loaded, so all five operands are declared read-write. The code was
// derived from compiler output in which the operands happened to live in
// x0..x4; every reference is made through the operand name so that no
// particular register assignment is required.
//
// The body is only compiled when __linux__ is defined; elsewhere the function
// is empty.
static inline void KernelMacroBlockNeon(
    const int8* scratch_block_data, const int8* filter_workspace,
    const int32* bias_data, uint8* output_block_data,
    const DepthwiseConvDotProdParams* function_params) {
// Numeric local labels used by the assembly below.
#define DC_KERNEL_NO_MULT_1 "1"
#define DC_KERNEL_NO_MULT_2 "2"
#define DC_KERNEL_NO_MULT_3 "3"
#define DC_KERNEL_NO_MULT_4 "4"
#define DC_KERNEL_NO_MULT_5 "5"
#define DC_KERNEL_NO_MULT_6 "6"
#define DC_KERNEL_NO_MULT_7 "7"
#define DC_KERNEL_NO_MULT_8 "8"
#define DC_KERNEL_NO_MULT_9 "9"
#define DC_KERNEL_NO_MULT_10 "10"
#define DC_KERNEL_NO_MULT_11 "11"
#define DC_KERNEL_NO_MULT_12 "12"
#define DC_KERNEL_NO_MULT_13 "13"
#define DC_KERNEL_NO_MULT_14 "14"
#define DC_KERNEL_NO_MULT_15 "15"
#define DC_KERNEL_NO_MULT_16 "16"
#define DC_KERNEL_NO_MULT_17 "17"
#define DC_KERNEL_NO_MULT_18 "18"
#define DC_KERNEL_NO_MULT_19 "19"
#define DC_KERNEL_NO_MULT_20 "20"
#define DC_KERNEL_NO_MULT_21 "21"
#define DC_KERNEL_NO_MULT_22 "22"
#define DC_KERNEL_NO_MULT_23 "23"
#define DC_KERNEL_NO_MULT_24 "24"
#define DC_KERNEL_NO_MULT_25 "25"
#define DC_KERNEL_NO_MULT_26 "26"
#ifdef __linux__
  asm volatile(
      // Compiled code used block of 288 for spill out of total stack of 448.
      // However, two 4-byte spills were sneaked in to #360 and #364.
      // Spillage increased to 304 and these are mapped to #288 and #292.
      "sub sp, sp, #304\n" // =448
      // Parameter loading and precomputation of row/column output addresses.
      "ldp w9, w14, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
      "ldpsw x12, x21, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
      "ldrsw x8, [%[function_params], #" STR(DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
      "ldrsw x16, [%[function_params]]\n"
      "str w9, [sp, #292]\n" // 4-byte Folded Spill
      "ldr w9, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
      "ldrb w10, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "]\n"
      "lsl x8, x8, #5\n"
      "str x8, [sp, #8]\n" // 8-byte Folded Spill
      "str w9, [sp, #20]\n" // 4-byte Folded Spill
      "ldr w9, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
      "add x8, x12, x12, lsl #1\n"
      "ldr w5, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
      "add x11, %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n" // =32
      "str w9, [sp, #288]\n" // 4-byte Folded Spill
      "ldrb w9, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "]\n"
      "add x15, %[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT) "\n" // =36
      "add x13, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n" // =28
      "ld1r { v0.8h }, [x13]\n"
      "dup v3.16b, w9\n"
      "dup v5.8b, w9\n"
      "add x9, x16, x16, lsl #1\n"
      "add x7, x9, x8\n"
      "add x28, x9, x12, lsl #1\n"
      // From here on the function_params register is used as scratch.
      "add %[function_params], x9, x12\n"
      "add x9, %[output_block_data], x9\n"
      "add x13, x12, x16, lsl #1\n"
      "str x9, [sp, #112]\n" // 8-byte Folded Spill
      "add x9, x8, x16, lsl #1\n"
      "str q3, [sp, #272]\n" // 16-byte Folded Spill
      "dup v3.16b, w10\n"
      "dup v6.8b, w10\n"
      "lsl x10, x16, #1\n"
      "add x13, %[output_block_data], x13\n"
      "add x29, %[output_block_data], x9\n"
      "add x9, x21, x21, lsl #1\n"
      "ld1r { v1.4s }, [x11]\n"
      "ld1r { v2.4s }, [x15]\n"
      "add x15, x16, x12, lsl #1\n"
      "add x10, x10, x12, lsl #1\n"
      "str x13, [sp, #200]\n" // 8-byte Folded Spill
      "add x13, x8, x16\n"
      "add x22, %[output_block_data], x8\n"
      "add x8, %[scratch_block_data], x21\n"
      "str x9, [sp, #96]\n" // 8-byte Folded Spill
      "add x9, %[scratch_block_data], x9\n"
      "add x17, x12, x16\n"
      "add x15, %[output_block_data], x15\n"
      "add x25, x8, #32\n" // =32
      "add x30, %[output_block_data], x10\n"
      "add x8, x21, x21, lsl #2\n"
      "add x10, x9, #32\n" // =32
      "lsl x9, x21, #1\n"
      "mov x6, %[filter_workspace]\n"
      "mov %[filter_workspace], xzr\n"
      "mov w27, wzr\n"
      "add x11, %[scratch_block_data], x21, lsl #1\n"
      "add x23, %[scratch_block_data], x21, lsl #2\n"
      "str x15, [sp, #192]\n" // 8-byte Folded Spill
      "add x15, %[output_block_data], x17\n"
      "str x8, [sp, #104]\n" // 8-byte Folded Spill
      "add x8, %[scratch_block_data], x8\n"
      "str x9, [sp, #176]\n" // 8-byte Folded Spill
      "lsl x9, x21, #2\n"
      "mov x19, xzr\n"
      "str x15, [sp, #184]\n" // 8-byte Folded Spill
      "add x23, x23, #32\n" // =32
      "add x24, x11, #32\n" // =32
      "add x26, %[output_block_data], x7\n"
      "mov w7, wzr\n"
      "add x27, %[output_block_data], x28\n"
      "add x28, %[output_block_data], %[function_params]\n"
      "add x15, %[output_block_data], x13\n"
      "mov x13, xzr\n"
      "add x8, x8, #32\n" // =32
      "stp x12, %[scratch_block_data], [sp, #120]\n" // 16-byte Folded Spill
      "add x11, %[scratch_block_data], #32\n" // =32
      // The filter_workspace register now carries the input row stride (x21).
      "mov %[filter_workspace], x21\n"
      "str x9, [sp, #88]\n" // 8-byte Folded Spill
      "lsl %[function_params], x16, #2\n"
      "add x9, %[output_block_data], x16, lsl #1\n"
      "add x21, %[output_block_data], x16\n"
      "add x17, %[output_block_data], x12, lsl #1\n"
      "add x12, %[output_block_data], x12\n"
      "str q3, [sp, #256]\n" // 16-byte Folded Spill
      "str %[output_block_data], [sp, #64]\n" // 8-byte Folded Spill
      "str %[output_block_data], [sp, #136]\n" // 8-byte Folded Spill
      "stp d6, d5, [sp, #72]\n" // 16-byte Folded Spill
      "b " DC_KERNEL_NO_MULT_26 "f\n"
      DC_KERNEL_NO_MULT_1 ":\n" // in Loop: Header=BB225_26 Depth=1
      "str w7, [sp, #36]\n" // 4-byte Folded Spill
      "ldr %w[scratch_block_data], [sp, #288]\n" // 4-byte Folded Reload
      "ldp q18, q7, [x6]\n"
      "ldp q19, q16, [x6, #32]\n"
      "ldp q20, q17, [x6, #64]\n"
      // Four-row fast path only when the outbound block height is exactly 4.
      "cmp %w[scratch_block_data], #4\n" // =4
      "add x6, x6, #96\n" // =96
      "stp x19, %[bias_data], [sp, #48]\n" // 16-byte Folded Spill
      "str x13, [sp, #40]\n" // 8-byte Folded Spill
      "str x6, [sp, #24]\n" // 8-byte Folded Spill
      "b.ne " DC_KERNEL_NO_MULT_14 "f\n"
      // %bb.2: // in Loop: Header=BB225_26 Depth=1
      "mov %[scratch_block_data], xzr\n"
      "mov %[output_block_data], x13\n"
      "str %[bias_data], [sp, #168]\n" // 8-byte Folded Spill
      "b " DC_KERNEL_NO_MULT_13 "f\n"
      DC_KERNEL_NO_MULT_3 ":\n" // in Loop: Header=BB225_13 Depth=2
      "ldr x13, [sp, #128]\n" // 8-byte Folded Reload
      "str %[scratch_block_data], [sp, #160]\n" // 8-byte Folded Spill
      "ldr x6, [sp, #136]\n" // 8-byte Folded Reload
      "shl v3.4s, v18.4s, #8\n"
      "add x13, x13, %[scratch_block_data], lsl #4\n"
      "ldr %[scratch_block_data], [sp, #168]\n" // 8-byte Folded Reload
      "ldr q14, [x13]\n"
      "ldr q23, [x13, %[filter_workspace]]\n"
      "str q3, [sp, #240]\n" // 16-byte Folded Spill
      "ldr q21, [%[scratch_block_data]]\n"
      "ldr %[scratch_block_data], [sp, #176]\n" // 8-byte Folded Reload
      "shl v3.4s, v19.4s, #8\n"
      // The bias_data register doubles as the width loop counter; the bias
      // pointer itself lives in the spill slot at [sp, #168].
      "mov %w[bias_data], wzr\n"
      "mov v31.16b, v21.16b\n"
      "ldr q24, [x13, %[scratch_block_data]]\n"
      "ldr %[scratch_block_data], [sp, #96]\n" // 8-byte Folded Reload
      "mov v8.16b, v21.16b\n"
      "mov v9.16b, v21.16b\n"
      "mov v10.16b, v21.16b\n"
      "ldr q25, [x13, %[scratch_block_data]]\n"
      "ldr %[scratch_block_data], [sp, #88]\n" // 8-byte Folded Reload
      "str q3, [sp, #224]\n" // 16-byte Folded Spill
      "shl v3.4s, v20.4s, #8\n"
      ".word 0x4e98969f // sdot v31.4s, v20.16b, v24.16b\n"
      "ldr q26, [x13, %[scratch_block_data]]\n"
      "ldp %[scratch_block_data], x7, [sp, #104]\n" // 16-byte Folded Reload
      ".word 0x4e989668 // sdot v8.4s, v19.16b, v24.16b\n"
      ".word 0x4e989649 // sdot v9.4s, v18.16b, v24.16b\n"
      ".word 0x4e99964a // sdot v10.4s, v18.16b, v25.16b\n"
      "ldr q27, [x13, %[scratch_block_data]]\n"
      "mov x13, x19\n"
      "mov %[scratch_block_data], %[output_block_data]\n"
      "str q3, [sp, #208]\n" // 16-byte Folded Spill
      "stp %[output_block_data], x19, [sp, #144]\n" // 16-byte Folded Spill
      "b " DC_KERNEL_NO_MULT_5 "f\n"
      DC_KERNEL_NO_MULT_4 ":\n" // in Loop: Header=BB225_5 Depth=3
      ".word 0x4e8e965f // sdot v31.4s, v18.16b, v14.16b\n"
      ".word 0x4e979648 // sdot v8.4s, v18.16b, v23.16b\n"
      ".word 0x4e999669 // sdot v9.4s, v19.16b, v25.16b\n"
      ".word 0x4e97967f // sdot v31.4s, v19.16b, v23.16b\n"
      ".word 0x4e9a966a // sdot v10.4s, v19.16b, v26.16b\n"
      ".word 0x4e999688 // sdot v8.4s, v20.16b, v25.16b\n"
      ".word 0x4e9a9689 // sdot v9.4s, v20.16b, v26.16b\n"
      "sqrdmulh v31.4s, v31.4s, v1.4s\n"
      ".word 0x4e9b968a // sdot v10.4s, v20.16b, v27.16b\n"
      "sqrdmulh v8.4s, v8.4s, v1.4s\n"
      "sqrdmulh v9.4s, v9.4s, v1.4s\n"
      "sqrshl v31.4s, v31.4s, v2.4s\n"
      "sqrdmulh v10.4s, v10.4s, v1.4s\n"
      "sqrshl v8.4s, v8.4s, v2.4s\n"
      "sqrshl v9.4s, v9.4s, v2.4s\n"
      "sqxtn v31.4h, v31.4s\n"
      "sqrshl v10.4s, v10.4s, v2.4s\n"
      "sqxtn v9.4h, v9.4s\n"
      "sqxtn2 v31.8h, v8.4s\n"
      "sqxtn2 v9.8h, v10.4s\n"
      "sqadd v31.8h, v31.8h, v0.8h\n"
      "sqadd v8.8h, v9.8h, v0.8h\n"
      "sqxtun v31.8b, v31.8h\n"
      "sqxtun2 v31.16b, v8.8h\n"
      // q28 = activation max ([sp, #256]), q6 = activation min ([sp, #272]).
      "ldp q28, q6, [sp, #256]\n" // 32-byte Folded Reload
      "add %[output_block_data], x12, %[scratch_block_data]\n"
      "ldr q5, [sp, #208]\n" // 16-byte Folded Reload
      "mov v8.16b, v21.16b\n"
      "umax v31.16b, v31.16b, v6.16b\n"
      "umin v31.16b, v31.16b, v28.16b\n"
      "str s31, [x6, %[scratch_block_data]]\n"
      "st1 { v31.s }[1], [%[output_block_data]]\n"
      "add %[output_block_data], x17, %[scratch_block_data]\n"
      "st1 { v31.s }[2], [%[output_block_data]]\n"
      "add %[output_block_data], x22, %[scratch_block_data]\n"
      "st1 { v31.s }[3], [%[output_block_data]]\n"
      "ldp q30, q29, [sp, #224]\n" // 32-byte Folded Reload
      "mov v9.16b, v21.16b\n"
      "mov v10.16b, v21.16b\n"
      "mov v11.16b, v21.16b\n"
      ".word 0x4e8e97a8 // sdot v8.4s, v29.16b, v14.16b\n"
      ".word 0x4e9797a9 // sdot v9.4s, v29.16b, v23.16b\n"
      ".word 0x4e9897aa // sdot v10.4s, v29.16b, v24.16b\n"
      ".word 0x4e9797c8 // sdot v8.4s, v30.16b, v23.16b\n"
      ".word 0x4e9997ab // sdot v11.4s, v29.16b, v25.16b\n"
      ".word 0x4e9897c9 // sdot v9.4s, v30.16b, v24.16b\n"
      ".word 0x4e9997ca // sdot v10.4s, v30.16b, v25.16b\n"
      ".word 0x4e9894a8 // sdot v8.4s, v5.16b, v24.16b\n"
      ".word 0x4e9a97cb // sdot v11.4s, v30.16b, v26.16b\n"
      ".word 0x4e9994a9 // sdot v9.4s, v5.16b, v25.16b\n"
      ".word 0x4e9a94aa // sdot v10.4s, v5.16b, v26.16b\n"
      "sqrdmulh v22.4s, v8.4s, v1.4s\n"
      "rev32 v12.8h, v23.8h\n"
      "rev32 v13.8h, v24.8h\n"
      ".word 0x4e9b94ab // sdot v11.4s, v5.16b, v27.16b\n"
      "sqrdmulh v23.4s, v9.4s, v1.4s\n"
      "sqrdmulh v24.4s, v10.4s, v1.4s\n"
      "sqrshl v22.4s, v22.4s, v2.4s\n"
      "rev32 v4.8h, v25.8h\n"
      "sqrdmulh v25.4s, v11.4s, v1.4s\n"
      "sqrshl v8.4s, v23.4s, v2.4s\n"
      "sqrshl v23.4s, v24.4s, v2.4s\n"
      "sqxtn v10.4h, v22.4s\n"
      "ldr %[output_block_data], [sp, #184]\n" // 8-byte Folded Reload
      "rev32 v15.8h, v26.8h\n"
      "rev32 v3.8h, v27.8h\n"
      "sqrshl v9.4s, v25.4s, v2.4s\n"
      "sqxtn v11.4h, v23.4s\n"
      // Load the next 16 bytes of each of the six input rows.
      "ldr q22, [x11, x13]\n"
      "ldr q23, [x25, x13]\n"
      "ldr q24, [x24, x13]\n"
      "ldr q25, [x10, x13]\n"
      "ldr q26, [x23, x13]\n"
      "ldr q27, [x8, x13]\n"
      "sqxtn2 v10.8h, v8.4s\n"
      "sqxtn2 v11.8h, v9.4s\n"
      "sqadd v8.8h, v10.8h, v0.8h\n"
      "sqadd v9.8h, v11.8h, v0.8h\n"
      "sqxtun v8.8b, v8.8h\n"
      "sqxtun2 v8.16b, v9.8h\n"
      "umax v8.16b, v8.16b, v6.16b\n"
      "add %[output_block_data], %[output_block_data], %[scratch_block_data]\n"
      "umin v8.16b, v8.16b, v28.16b\n"
      "str s8, [x21, %[scratch_block_data]]\n"
      "st1 { v8.s }[1], [%[output_block_data]]\n"
      "ldr %[output_block_data], [sp, #192]\n" // 8-byte Folded Reload
      "rev32 v31.8h, v14.8h\n"
      "mov v9.16b, v21.16b\n"
      "trn1 v31.8h, v31.8h, v22.8h\n"
      "add %[output_block_data], %[output_block_data], %[scratch_block_data]\n"
      "st1 { v8.s }[2], [%[output_block_data]]\n"
      "add %[output_block_data], x15, %[scratch_block_data]\n"
      "mov v10.16b, v21.16b\n"
      "mov v11.16b, v21.16b\n"
      "trn1 v12.8h, v12.8h, v23.8h\n"
      "trn1 v13.8h, v13.8h, v24.8h\n"
      ".word 0x4e9f9649 // sdot v9.4s, v18.16b, v31.16b\n"
      "st1 { v8.s }[3], [%[output_block_data]]\n"
      "mov v8.16b, v21.16b\n"
      "trn1 v14.8h, v4.8h, v25.8h\n"
      ".word 0x4e8c964a // sdot v10.4s, v18.16b, v12.16b\n"
      ".word 0x4e8d964b // sdot v11.4s, v18.16b, v13.16b\n"
      ".word 0x4e8c9669 // sdot v9.4s, v19.16b, v12.16b\n"
      "trn1 v15.8h, v15.8h, v26.8h\n"
      ".word 0x4e8e9648 // sdot v8.4s, v18.16b, v14.16b\n"
      ".word 0x4e8d966a // sdot v10.4s, v19.16b, v13.16b\n"
      ".word 0x4e8e966b // sdot v11.4s, v19.16b, v14.16b\n"
      ".word 0x4e8d9689 // sdot v9.4s, v20.16b, v13.16b\n"
      "trn1 v3.8h, v3.8h, v27.8h\n"
      ".word 0x4e8f9668 // sdot v8.4s, v19.16b, v15.16b\n"
      ".word 0x4e8e968a // sdot v10.4s, v20.16b, v14.16b\n"
      ".word 0x4e8f968b // sdot v11.4s, v20.16b, v15.16b\n"
      "sqrdmulh v9.4s, v9.4s, v1.4s\n"
      ".word 0x4e839688 // sdot v8.4s, v20.16b, v3.16b\n"
      "sqrdmulh v10.4s, v10.4s, v1.4s\n"
      "sqrdmulh v11.4s, v11.4s, v1.4s\n"
      "sqrshl v9.4s, v9.4s, v2.4s\n"
      "sqrdmulh v8.4s, v8.4s, v1.4s\n"
      "sqrshl v10.4s, v10.4s, v2.4s\n"
      "sqrshl v11.4s, v11.4s, v2.4s\n"
      "sqxtn v9.4h, v9.4s\n"
      "ldr %[output_block_data], [sp, #200]\n" // 8-byte Folded Reload
      "sqrshl v8.4s, v8.4s, v2.4s\n"
      "sqxtn v11.4h, v11.4s\n"
      "sqxtn2 v9.8h, v10.4s\n"
      "sqxtn2 v11.8h, v8.4s\n"
      "sqadd v8.8h, v9.8h, v0.8h\n"
      "sqadd v9.8h, v11.8h, v0.8h\n"
      "sqxtun v8.8b, v8.8h\n"
      "sqxtun2 v8.16b, v9.8h\n"
      "umax v8.16b, v8.16b, v6.16b\n"
      "add %[output_block_data], %[output_block_data], %[scratch_block_data]\n"
      "umin v8.16b, v8.16b, v28.16b\n"
      "str s8, [x9, %[scratch_block_data]]\n"
      "st1 { v8.s }[1], [%[output_block_data]]\n"
      "add %[output_block_data], x30, %[scratch_block_data]\n"
      "st1 { v8.s }[2], [%[output_block_data]]\n"
      "add %[output_block_data], x29, %[scratch_block_data]\n"
      "mov v9.16b, v21.16b\n"
      "mov v10.16b, v21.16b\n"
      "mov v11.16b, v21.16b\n"
      "st1 { v8.s }[3], [%[output_block_data]]\n"
      "mov v8.16b, v21.16b\n"
      ".word 0x4e9f97a9 // sdot v9.4s, v29.16b, v31.16b\n"
      ".word 0x4e8c97aa // sdot v10.4s, v29.16b, v12.16b\n"
      ".word 0x4e8d97ab // sdot v11.4s, v29.16b, v13.16b\n"
      ".word 0x4e8e97a8 // sdot v8.4s, v29.16b, v14.16b\n"
      ".word 0x4e8c97c9 // sdot v9.4s, v30.16b, v12.16b\n"
      ".word 0x4e8d97ca // sdot v10.4s, v30.16b, v13.16b\n"
      ".word 0x4e8e97cb // sdot v11.4s, v30.16b, v14.16b\n"
      ".word 0x4e8f97c8 // sdot v8.4s, v30.16b, v15.16b\n"
      ".word 0x4e8d94a9 // sdot v9.4s, v5.16b, v13.16b\n"
      ".word 0x4e8e94aa // sdot v10.4s, v5.16b, v14.16b\n"
      ".word 0x4e8f94ab // sdot v11.4s, v5.16b, v15.16b\n"
      ".word 0x4e8394a8 // sdot v8.4s, v5.16b, v3.16b\n"
      "sqrdmulh v3.4s, v9.4s, v1.4s\n"
      "sqrdmulh v31.4s, v10.4s, v1.4s\n"
      "sqrdmulh v9.4s, v11.4s, v1.4s\n"
      "sqrshl v3.4s, v3.4s, v2.4s\n"
      "sqrdmulh v8.4s, v8.4s, v1.4s\n"
      "sqrshl v31.4s, v31.4s, v2.4s\n"
      "sqrshl v9.4s, v9.4s, v2.4s\n"
      "sqxtn v3.4h, v3.4s\n"
      "sqrshl v8.4s, v8.4s, v2.4s\n"
      "sqxtn v9.4h, v9.4s\n"
      "sqxtn2 v3.8h, v31.4s\n"
      "sqxtn2 v9.8h, v8.4s\n"
      "sqadd v3.8h, v3.8h, v0.8h\n"
      "sqadd v31.8h, v9.8h, v0.8h\n"
      "sqxtun v3.8b, v3.8h\n"
      "sqxtun2 v3.16b, v31.8h\n"
      "umax v3.16b, v3.16b, v6.16b\n"
      "add %[output_block_data], x28, %[scratch_block_data]\n"
      "umin v3.16b, v3.16b, v28.16b\n"
      "str s3, [x7, %[scratch_block_data]]\n"
      "st1 { v3.s }[1], [%[output_block_data]]\n"
      "add %[output_block_data], x27, %[scratch_block_data]\n"
      "st1 { v3.s }[2], [%[output_block_data]]\n"
      "add %[output_block_data], x26, %[scratch_block_data]\n"
      "mov v31.16b, v21.16b\n"
      "mov v8.16b, v21.16b\n"
      "mov v9.16b, v21.16b\n"
      "mov v10.16b, v21.16b\n"
      "add %w[bias_data], %w[bias_data], #1\n" // =1
      ".word 0x4e98969f // sdot v31.4s, v20.16b, v24.16b\n"
      ".word 0x4e989668 // sdot v8.4s, v19.16b, v24.16b\n"
      ".word 0x4e989649 // sdot v9.4s, v18.16b, v24.16b\n"
      ".word 0x4e99964a // sdot v10.4s, v18.16b, v25.16b\n"
      "st1 { v3.s }[3], [%[output_block_data]]\n"
      // Advance the output offset by four pixels (function_params register
      // holds 4 * x16 at this point).
      "add %[scratch_block_data], %[scratch_block_data], %[function_params]\n"
      "add x13, x13, #32\n" // =32
      "mov v14.16b, v22.16b\n"
      DC_KERNEL_NO_MULT_5 ":\n" // Parent Loop BB225_26 Depth=1
      // Parent Loop BB225_13 Depth=2
      // => This Inner Loop Header: Depth=3
      "cmp %w[bias_data], w14\n"
      "b.lt " DC_KERNEL_NO_MULT_4 "b\n"
      // %bb.6: // in Loop: Header=BB225_13 Depth=2
      "ldr %[bias_data], [sp, #168]\n" // 8-byte Folded Reload
      "ldp d6, d5, [sp, #72]\n" // 16-byte Folded Reload
      "cmp w5, #0\n" // =0
      "add %[bias_data], %[bias_data], #16\n" // =16
      "str %[bias_data], [sp, #168]\n" // 8-byte Folded Spill
      "b.le " DC_KERNEL_NO_MULT_12 "f\n"
      // %bb.7: // in Loop: Header=BB225_13 Depth=2
      "movi v28.16b, #0\n"
      "cmp w5, #3\n" // =3
      "movi v29.16b, #0\n"
      "movi v30.16b, #0\n"
      "movi v11.16b, #0\n"
      "movi v12.16b, #0\n"
      "movi v13.16b, #0\n"
      // Further input columns are only loaded when the residual width is >= 3.
      "b.lt " DC_KERNEL_NO_MULT_9 "f\n"
      // %bb.8: // in Loop: Header=BB225_13 Depth=2
      "ldr q28, [x11, x13]\n"
      "ldr q29, [x25, x13]\n"
      "ldr q30, [x24, x13]\n"
      "ldr q11, [x10, x13]\n"
      "ldr q12, [x23, x13]\n"
      "ldr q13, [x8, x13]\n"
      DC_KERNEL_NO_MULT_9 ":\n" // in Loop: Header=BB225_13 Depth=2
      "ldr x19, [sp, #136]\n" // 8-byte Folded Reload
      "mov x13, xzr\n"
      "mov %w[bias_data], wzr\n"
      "add %[output_block_data], x22, %[scratch_block_data]\n"
      "add x6, x17, %[scratch_block_data]\n"
      "add x7, x12, %[scratch_block_data]\n"
      "add %[scratch_block_data], x19, %[scratch_block_data]\n"
      "b " DC_KERNEL_NO_MULT_11 "f\n"
      DC_KERNEL_NO_MULT_10 ":\n" // in Loop: Header=BB225_11 Depth=3
      ".word 0x4e8e965f // sdot v31.4s, v18.16b, v14.16b\n"
      ".word 0x4e979648 // sdot v8.4s, v18.16b, v23.16b\n"
      ".word 0x4e999669 // sdot v9.4s, v19.16b, v25.16b\n"
      ".word 0x4e97967f // sdot v31.4s, v19.16b, v23.16b\n"
      ".word 0x4e9a966a // sdot v10.4s, v19.16b, v26.16b\n"
      ".word 0x4e999688 // sdot v8.4s, v20.16b, v25.16b\n"
      ".word 0x4e9a9689 // sdot v9.4s, v20.16b, v26.16b\n"
      "sqrdmulh v3.4s, v31.4s, v1.4s\n"
      ".word 0x4e9b968a // sdot v10.4s, v20.16b, v27.16b\n"
      "sqrdmulh v31.4s, v8.4s, v1.4s\n"
      "sqrdmulh v8.4s, v9.4s, v1.4s\n"
      "sqrshl v3.4s, v3.4s, v2.4s\n"
      "sqrdmulh v9.4s, v10.4s, v1.4s\n"
      "sqrshl v31.4s, v31.4s, v2.4s\n"
      "sqrshl v8.4s, v8.4s, v2.4s\n"
      "sqxtn v3.4h, v3.4s\n"
      "sqrshl v9.4s, v9.4s, v2.4s\n"
      "sqxtn v8.4h, v8.4s\n"
      "sqxtn2 v3.8h, v31.4s\n"
      "sqxtn2 v8.8h, v9.4s\n"
      "sqadd v3.8h, v3.8h, v0.8h\n"
      "sqadd v31.8h, v8.8h, v0.8h\n"
      "sqxtun v3.8b, v3.8h\n"
      "sqxtun2 v3.16b, v31.8h\n"
      "ldr q4, [sp, #272]\n" // 16-byte Folded Reload
      "add x19, x7, x13\n"
      "ushr v24.4s, v24.4s, #8\n"
      "ushr v25.4s, v25.4s, #8\n"
      "umax v3.16b, v3.16b, v4.16b\n"
      "ldr q4, [sp, #256]\n" // 16-byte Folded Reload
      "ushr v14.4s, v14.4s, #8\n"
      "ushr v23.4s, v23.4s, #8\n"
      "sli v24.4s, v30.4s, #24\n"
      "umin v3.16b, v3.16b, v4.16b\n"
      "str s3, [%[scratch_block_data], x13]\n"
      "st1 { v3.s }[1], [x19]\n"
      "add x19, x6, x13\n"
      "st1 { v3.s }[2], [x19]\n"
      "add x19, %[output_block_data], x13\n"
      "ushr v26.4s, v26.4s, #8\n"
      "ushr v27.4s, v27.4s, #8\n"
      "sli v25.4s, v11.4s, #24\n"
      "mov v31.16b, v21.16b\n"
      "mov v8.16b, v21.16b\n"
      "mov v9.16b, v21.16b\n"
      "mov v10.16b, v21.16b\n"
      "add %w[bias_data], %w[bias_data], #1\n" // =1
      "sli v14.4s, v28.4s, #24\n"
      "ushr v28.4s, v28.4s, #8\n"
      "ushr v30.4s, v30.4s, #8\n"
      "sli v23.4s, v29.4s, #24\n"
      "ushr v29.4s, v29.4s, #8\n"
      "ushr v11.4s, v11.4s, #8\n"
      "sli v26.4s, v12.4s, #24\n"
      "ushr v12.4s, v12.4s, #8\n"
      "sli v27.4s, v13.4s, #24\n"
      "ushr v13.4s, v13.4s, #8\n"
      "st1 { v3.s }[3], [x19]\n"
      ".word 0x4e98969f // sdot v31.4s, v20.16b, v24.16b\n"
      ".word 0x4e989668 // sdot v8.4s, v19.16b, v24.16b\n"
      ".word 0x4e989649 // sdot v9.4s, v18.16b, v24.16b\n"
      ".word 0x4e99964a // sdot v10.4s, v18.16b, v25.16b\n"
      "add x13, x13, x16\n"
      DC_KERNEL_NO_MULT_11 ":\n" // Parent Loop BB225_26 Depth=1
      // Parent Loop BB225_13 Depth=2
      // => This Inner Loop Header: Depth=3
      "cmp %w[bias_data], w5\n"
      "b.lt " DC_KERNEL_NO_MULT_10 "b\n"
      DC_KERNEL_NO_MULT_12 ":\n" // in Loop: Header=BB225_13 Depth=2
      "ldp x19, %[scratch_block_data], [sp, #152]\n" // 16-byte Folded Reload
      "ldr %[output_block_data], [sp, #144]\n" // 8-byte Folded Reload
      // Switch to the second set of filter registers for the next pass.
      "mov v20.16b, v17.16b\n"
      "mov v19.16b, v16.16b\n"
      "add %[scratch_block_data], %[scratch_block_data], #1\n" // =1
      "add %[output_block_data], %[output_block_data], #4\n" // =4
      "add x19, x19, #16\n" // =16
      "mov v18.16b, v7.16b\n"
      DC_KERNEL_NO_MULT_13 ":\n" // Parent Loop BB225_26 Depth=1
      // => This Loop Header: Depth=2
      // Child Loop BB225_5 Depth 3
      // Child Loop BB225_11 Depth 3
      "cmp %[scratch_block_data], #2\n" // =2
      "b.ne " DC_KERNEL_NO_MULT_3 "b\n"
      "b " DC_KERNEL_NO_MULT_25 "f\n"
      DC_KERNEL_NO_MULT_14 ":\n" // in Loop: Header=BB225_26 Depth=1
      // Row-at-a-time path. The bias_data register becomes the output row
      // pointer after the two bias vectors have been loaded.
      "ldp q21, q22, [%[bias_data]]\n"
      "ldr %[bias_data], [sp, #64]\n" // 8-byte Folded Reload
      "ldr x7, [sp, #128]\n" // 8-byte Folded Reload
      "mov %w[scratch_block_data], wzr\n"
      "b " DC_KERNEL_NO_MULT_24 "f\n"
      DC_KERNEL_NO_MULT_15 ":\n" // in Loop: Header=BB225_24 Depth=2
      "str %w[scratch_block_data], [sp, #240]\n" // 4-byte Folded Spill
      "ldr %[scratch_block_data], [sp, #176]\n" // 8-byte Folded Reload
      "add %[output_block_data], x7, %[filter_workspace]\n"
      "ldp q23, q24, [x7]\n"
      "ldp q25, q26, [%[output_block_data]]\n"
      "add %[scratch_block_data], x7, %[scratch_block_data]\n"
      "str %[output_block_data], [sp, #208]\n" // 8-byte Folded Spill
      "ldp q27, q28, [%[scratch_block_data]]\n"
      "mov w13, wzr\n"
      "mov %[scratch_block_data], %[bias_data]\n"
      "str %[bias_data], [sp, #224]\n" // 8-byte Folded Spill
      "b " DC_KERNEL_NO_MULT_22 "f\n"
      DC_KERNEL_NO_MULT_16 ":\n" // in Loop: Header=BB225_22 Depth=3
      // w6 = residual width on the final micro block, otherwise 4.
      "cmp w13, w14\n"
      "orr %w[bias_data], wzr, #0x4\n"
      "csel w6, w5, %w[bias_data], eq\n"
      "add %[output_block_data], x7, #32\n" // =32
      "movi v29.16b, #0\n"
      "movi v30.16b, #0\n"
      "movi v8.16b, #0\n"
      "movi v31.16b, #0\n"
      "cmp w6, #3\n" // =3
      "movi v9.16b, #0\n"
      "movi v10.16b, #0\n"
      "b.lt " DC_KERNEL_NO_MULT_18 "f\n"
      // %bb.17: // in Loop: Header=BB225_22 Depth=3
      "ldr %[bias_data], [sp, #176]\n" // 8-byte Folded Reload
      "add x19, %[output_block_data], %[filter_workspace]\n"
      "ldp q29, q31, [x7, #32]\n"
      "ldp q30, q9, [x19]\n"
      "add %[bias_data], %[output_block_data], %[bias_data]\n"
      "ldp q8, q10, [%[bias_data]]\n"
      DC_KERNEL_NO_MULT_18 ":\n" // in Loop: Header=BB225_22 Depth=3
      "mov w7, wzr\n"
      "b " DC_KERNEL_NO_MULT_20 "f\n"
      DC_KERNEL_NO_MULT_19 ":\n" // in Loop: Header=BB225_20 Depth=4
      "mov v3.16b, v21.16b\n"
      "mov v11.16b, v22.16b\n"
      ".word 0x4e979643 // sdot v3.4s, v18.16b, v23.16b\n"
      ".word 0x4e9894eb // sdot v11.4s, v7.16b, v24.16b\n"
      ".word 0x4e999663 // sdot v3.4s, v19.16b, v25.16b\n"
      ".word 0x4e9a960b // sdot v11.4s, v16.16b, v26.16b\n"
      ".word 0x4e9b9683 // sdot v3.4s, v20.16b, v27.16b\n"
      ".word 0x4e9c962b // sdot v11.4s, v17.16b, v28.16b\n"
      "sqrdmulh v3.4s, v3.4s, v1.4s\n"
      "sqrdmulh v11.4s, v11.4s, v1.4s\n"
      "sqrshl v3.4s, v3.4s, v2.4s\n"
      "sqrshl v11.4s, v11.4s, v2.4s\n"
      "sqxtn v3.4h, v3.4s\n"
      "sqxtn2 v3.8h, v11.4s\n"
      "sqadd v3.8h, v3.8h, v0.8h\n"
      "sqxtun v3.8b, v3.8h\n"
      "umax v3.8b, v3.8b, v5.8b\n"
      "ushr v23.4s, v23.4s, #8\n"
      "ushr v24.4s, v24.4s, #8\n"
      "ushr v25.4s, v25.4s, #8\n"
      "ushr v26.4s, v26.4s, #8\n"
      "ushr v27.4s, v27.4s, #8\n"
      "ushr v28.4s, v28.4s, #8\n"
      "umin v3.8b, v3.8b, v6.8b\n"
      "sli v23.4s, v29.4s, #24\n"
      "ushr v29.4s, v29.4s, #8\n"
      "sli v24.4s, v31.4s, #24\n"
      "ushr v31.4s, v31.4s, #8\n"
      "sli v25.4s, v30.4s, #24\n"
      "ushr v30.4s, v30.4s, #8\n"
      "sli v26.4s, v9.4s, #24\n"
      "ushr v9.4s, v9.4s, #8\n"
      "sli v27.4s, v8.4s, #24\n"
      "ushr v8.4s, v8.4s, #8\n"
      "sli v28.4s, v10.4s, #24\n"
      "ushr v10.4s, v10.4s, #8\n"
      "str d3, [%[scratch_block_data]]\n"
      "add %[scratch_block_data], %[scratch_block_data], x16\n"
      "add w7, w7, #1\n" // =1
      DC_KERNEL_NO_MULT_20 ":\n" // Parent Loop BB225_26 Depth=1
      // Parent Loop BB225_24 Depth=2
      // Parent Loop BB225_22 Depth=3
      // => This Inner Loop Header: Depth=4
      "cmp w7, w6\n"
      "b.lt " DC_KERNEL_NO_MULT_19 "b\n"
      // %bb.21: // in Loop: Header=BB225_22 Depth=3
      "add w13, w13, #1\n" // =1
      "mov x7, %[output_block_data]\n"
      DC_KERNEL_NO_MULT_22 ":\n" // Parent Loop BB225_26 Depth=1
      // Parent Loop BB225_24 Depth=2
      // => This Loop Header: Depth=3
      // Child Loop BB225_20 Depth 4
      "ldr %w[bias_data], [sp, #292]\n" // 4-byte Folded Reload
      "cmp w13, %w[bias_data]\n"
      "b.lt " DC_KERNEL_NO_MULT_16 "b\n"
      // %bb.23: // in Loop: Header=BB225_24 Depth=2
      "ldr x13, [sp, #120]\n" // 8-byte Folded Reload
      "ldr %[bias_data], [sp, #224]\n" // 8-byte Folded Reload
      "ldr %w[scratch_block_data], [sp, #240]\n" // 4-byte Folded Reload
      "ldr x7, [sp, #208]\n" // 8-byte Folded Reload
      "add %[bias_data], %[bias_data], x13\n"
      "add %w[scratch_block_data], %w[scratch_block_data], #1\n" // =1
      DC_KERNEL_NO_MULT_24 ":\n" // Parent Loop BB225_26 Depth=1
      // => This Loop Header: Depth=2
      // Child Loop BB225_22 Depth 3
      // Child Loop BB225_20 Depth 4
      "ldr w13, [sp, #288]\n" // 4-byte Folded Reload
      "cmp %w[scratch_block_data], w13\n"
      "b.lt " DC_KERNEL_NO_MULT_15 "b\n"
      DC_KERNEL_NO_MULT_25 ":\n" // in Loop: Header=BB225_26 Depth=1
      // Advance input, bias, output and filter state to the next depth block.
      "ldr x13, [sp, #128]\n" // 8-byte Folded Reload
      "ldr %[scratch_block_data], [sp, #8]\n" // 8-byte Folded Reload
      "ldp x19, %[bias_data], [sp, #48]\n" // 16-byte Folded Reload
      "ldr w7, [sp, #36]\n" // 4-byte Folded Reload
      "ldr x6, [sp, #24]\n" // 8-byte Folded Reload
      "add x13, x13, %[scratch_block_data]\n"
      "str x13, [sp, #128]\n" // 8-byte Folded Spill
      "ldr x13, [sp, #64]\n" // 8-byte Folded Reload
      "add %[bias_data], %[bias_data], #32\n" // =32
      "add w7, w7, #1\n" // =1
      "add x19, x19, %[scratch_block_data]\n"
      "add x13, x13, #8\n" // =8
      "str x13, [sp, #64]\n" // 8-byte Folded Spill
      "ldr x13, [sp, #40]\n" // 8-byte Folded Reload
      "add x13, x13, #8\n" // =8
      DC_KERNEL_NO_MULT_26 ":\n" // =>This Loop Header: Depth=1
      // Child Loop BB225_24 Depth 2
      // Child Loop BB225_22 Depth 3
      // Child Loop BB225_20 Depth 4
      // Child Loop BB225_13 Depth 2
      // Child Loop BB225_5 Depth 3
      // Child Loop BB225_11 Depth 3
      "ldr %w[scratch_block_data], [sp, #20]\n" // 4-byte Folded Reload
      "cmp w7, %w[scratch_block_data]\n"
      "b.lt " DC_KERNEL_NO_MULT_1 "b\n"
      // %bb.27:
      // Compiled intrinsics total stack 448, now 304 for spillage only.
      "add sp, sp, #304\n" // =448
      :
      // Outputs. Every operand register is overwritten inside the assembly,
      // including the one holding function_params, so all are read-write.
      [ scratch_block_data ] "+r"(scratch_block_data),
      [ filter_workspace ] "+r"(filter_workspace),
      [ bias_data ] "+r"(bias_data),
      [ output_block_data ] "+r"(output_block_data),
      [ function_params ] "+r"(function_params)
      :
      // Inputs: none (all operands are read-write, see above).
      :
      // Clobbers.
      "cc", "memory",
      // We use these NEON registers.
      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
      "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31",
      // We use these general-purpose registers.
      "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
      "x16", "x17", "x19", "x21", "x22", "x23", "x24", "x25", "x26", "x27",
      "x28", "x29", "x30");
#endif // __linux__
} // NOLINT(readability/fn_size) Manually unrolled.