in tensorflow/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h [8293:8889]
// Depthwise-convolution macro-block kernel implemented as a single block of
// AArch64 inline assembly (dot-product instructions are emitted as raw .word
// encodings; the mnemonic is kept in the assembler comment on each line).
//
// Arguments:
//   scratch_block_data  Source bytes. Rows are addressed with a byte stride
//                       read from function_params (the 32-bit field that
//                       follows DP_OFFSET_OUTPUT_HEIGHT_STRIDE).
//   filter_workspace    Filter data; 96 bytes (six q registers) are consumed
//                       per outer (depth) iteration.
//   bias_data           Bias values; the pointer advances 32 bytes per outer
//                       iteration.
//   output_block_data   Destination bytes; the base advances 8 bytes per
//                       outer iteration, output_depth bytes per output pixel
//                       and output_height_stride bytes per output row.
//   function_params     Parameter block read through the DP_OFFSET_* byte
//                       offsets.
//
// Every accumulator goes through the same output pipeline: sqrdmulh by
// output_multiplier, sqrshl by output_shift, saturating narrow to 16 bits,
// sqadd of output_offset, saturating narrow to unsigned 8 bits, then
// umax/umin against quantized_activation_min/max.
//
// Two code paths exist per outer iteration:
//   * outbound_block_height == 4: two passes (labels 3..11), each writing
//     four output rows, 4 bytes per pixel per row.
//   * otherwise: a row-by-row loop (labels 13..20) writing 8 bytes per pixel.
//
// The asm body only exists when __linux__ is defined; otherwise the function
// is empty.
//
// NOTE(review): the asm refers to x1, x2 and x3/w3 by name in several places
// (e.g. "add ..., x1", "ldr w3, ...", "add ..., x3, #4") interchangeably with
// the %[filter_workspace], %[bias_data] and %[output_block_data] operands.
// This is only correct when the compiler assigns the operands to x0..x4 in
// the order documented below; the "r" constraints themselves do not enforce
// that. Confirm the allocation whenever the toolchain or inlining changes.
static inline void KernelMacroBlockNeon(
    const int8* scratch_block_data, const int8* filter_workspace,
    const int32* bias_data, uint8* output_block_data,
    const DepthwiseConvDotProdParams* function_params) {
  // Note that argument registers may be reused after parameter loading.
  // x0 %[scratch_block_data]
  // x1 %[filter_workspace]
  // x2 %[bias_data]
  // x3 %[output_block_data]
  // x4 %[function_params]

  // Numeric local labels used by the asm below. Branches append "f"/"b" to
  // select the next/previous definition of the label.
#define DC_KERNEL_MULT_1 "1"
#define DC_KERNEL_MULT_2 "2"
#define DC_KERNEL_MULT_3 "3"
#define DC_KERNEL_MULT_4 "4"
#define DC_KERNEL_MULT_5 "5"
#define DC_KERNEL_MULT_6 "6"
#define DC_KERNEL_MULT_7 "7"
#define DC_KERNEL_MULT_8 "8"
#define DC_KERNEL_MULT_9 "9"
#define DC_KERNEL_MULT_10 "10"
#define DC_KERNEL_MULT_11 "11"
#define DC_KERNEL_MULT_12 "12"
#define DC_KERNEL_MULT_13 "13"
#define DC_KERNEL_MULT_14 "14"
#define DC_KERNEL_MULT_15 "15"
#define DC_KERNEL_MULT_16 "16"
#define DC_KERNEL_MULT_17 "17"
#define DC_KERNEL_MULT_18 "18"
#define DC_KERNEL_MULT_19 "19"
#define DC_KERNEL_MULT_20 "20"
#define DC_KERNEL_MULT_21 "21"
#define DC_KERNEL_MULT_22 "22"
#ifdef __linux__
  // Stack spill area (176 bytes below the incoming sp, released on exit):
  //   [sp, #8]    depth_micro_repeats (outer loop bound)
  //   [sp, #12]   outer loop counter (w7)
  //   [sp, #16]   filter pointer, +96 per outer iteration
  //   [sp, #24]   output base for the current outer iteration, +8 each
  //   [sp, #32]   running output byte offset (starts at 0), +8 each
  //   [sp, #40]   bias pointer, +32 per outer iteration
  //   [sp, #48]   4 * scratch row stride
  //   [sp, #56]   output + 3 * output_depth
  //   [sp, #64]   scratch + 5 * scratch row stride
  //   [sp, #72]   3 * scratch row stride
  //   [sp, #80]   output_height_stride
  //   [sp, #88]   outbound_block_height
  //   [sp, #96]   original output_block_data
  //   [sp, #104]  saved output offset (x19) across the residual loop
  //   [sp, #112]  saved pass index (x12) of the four-row path
  //   [sp, #120]  bias pointer (four-row path) / output row pointer (general)
  //   [sp, #128] .. [sp, #168]  precomputed output row/pixel pointers
  asm volatile(
      // Compiled code used block of 160 for spill out of total stack of 288.
      // However, an 8-byte spill was sneaked in to #168.
      // Spillage increased to 176 and so the original offset of #168 is OK.
      "sub sp, sp, #176\n" // =176 (compiler output used a 288-byte frame)
      // ---- Parameter loading and address precomputation. ----
      "stp xzr, %[bias_data], [sp, #32]\n" // 16-byte Folded Spill
      "ldr w8, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
      "str %[filter_workspace], [sp, #16]\n" // 8-byte Folded Spill
      // x19 = output_height_stride, x11 = following field (used as the
      // scratch row stride).
      "ldpsw x19, x11, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
      // w20 = output_width_overall_micro_repeats, w26 = following field.
      "ldp w20, w26, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
      "str w8, [sp, #8]\n" // 4-byte Folded Spill
      "ldrsw x8, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
      "ldrsw x15, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n"
      "add x12, %[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT) "\n" // =36
      "add x14, x11, x11, lsl #2\n"
      "stp x8, %[output_block_data], [sp, #88]\n" // 16-byte Folded Spill
      "ldrb w8, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "]\n"
      "ldrb w9, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "]\n"
      "ldr w17, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
      "add %[filter_workspace], x19, x15, lsl #1\n"
      // v3/v5 = activation min, v4/v6 = activation max (16- and 8-byte forms).
      "dup v3.16b, w8\n"
      "dup v4.16b, w9\n"
      "dup v5.8b, w8\n"
      "dup v6.8b, w9\n"
      "add x8, x19, x19, lsl #1\n"
      "add x9, x15, x15, lsl #1\n"
      "add %[bias_data], x15, x19, lsl #1\n"
      "add x10, %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n" // =32
      "add x13, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n" // =28
      // v2 = output_shift broadcast.
      "ld1r { v2.4s }, [x12]\n"
      "add x6, x19, x15\n"
      "add x12, %[scratch_block_data], x14\n"
      "add x14, x9, x8\n"
      // From here on the function_params register is reused as scratch.
      "add %[function_params], x9, x19, lsl #1\n"
      "add x5, x9, x19\n"
      "add x9, %[output_block_data], x9\n"
      "add %[filter_workspace], %[output_block_data], x1\n"
      "add %[bias_data], %[output_block_data], x2\n"
      // v0 = output_offset broadcast (16-bit lanes), v1 = output_multiplier.
      "ld1r { v0.8h }, [x13]\n"
      "ld1r { v1.4s }, [x10]\n"
      "str x9, [sp, #56]\n" // 8-byte Folded Spill
      "add x9, x8, x15, lsl #1\n"
      "str %[filter_workspace], [sp, #168]\n" // 8-byte Folded Spill
      "add %[filter_workspace], x8, x15\n"
      "str %[bias_data], [sp, #152]\n" // 8-byte Folded Spill
      "add %[bias_data], %[output_block_data], x6\n"
      "add x21, %[output_block_data], x8\n"
      "add x8, %[output_block_data], x14\n"
      "add x25, x11, x11, lsl #1\n"
      // The cmp/ccmp/csel sequence below leaves
      //   w9 = (w17 < 4 && w26 < w20) ? w26 : w20
      // i.e. the number of width iterations handled by the full-width loop.
      "cmp w17, #4\n" // =4
      "lsl x16, x15, #1\n"
      "stp x8, %[bias_data], [sp, #136]\n" // 16-byte Folded Spill
      "add x8, %[output_block_data], %[function_params]\n"
      "add x28, %[output_block_data], x9\n"
      "lsl x9, x11, #2\n"
      "add x10, %[scratch_block_data], x11\n"
      "add x23, %[scratch_block_data], x11, lsl #1\n"
      "add x13, %[scratch_block_data], x11, lsl #2\n"
      "ccmp w26, w20, #0, lt\n"
      "add x16, x16, x19, lsl #1\n"
      "str x8, [sp, #128]\n" // 8-byte Folded Spill
      "add x8, %[scratch_block_data], x25\n"
      "str x9, [sp, #48]\n" // 8-byte Folded Spill
      "mov w9, w26\n"
      "mov w7, wzr\n"
      "add x22, x13, #4\n" // =4
      "add x23, x23, #4\n" // =4
      "add x24, x10, #4\n" // =4
      "add x27, %[output_block_data], x5\n"
      "add x29, %[output_block_data], x16\n"
      "add x30, %[output_block_data], %[filter_workspace]\n"
      "stp x25, x19, [sp, #72]\n" // 16-byte Folded Spill
      "add x14, x8, #4\n" // =4
      "lsl x8, x11, #1\n"
      "lsl x10, x15, #2\n"
      "add %[bias_data], %[output_block_data], x15, lsl #1\n"
      "add %[filter_workspace], %[output_block_data], x15\n"
      "add %[function_params], %[output_block_data], x19, lsl #1\n"
      "add x5, %[output_block_data], x19\n"
      "mov w25, w20\n"
      "csel w9, w26, w20, lt\n"
      "add x16, x12, #4\n" // =4
      "str x12, [sp, #64]\n" // 8-byte Folded Spill
      "str %[output_block_data], [sp, #24]\n" // 8-byte Folded Spill
      "b " DC_KERNEL_MULT_22 "f\n"
      // ---- Outer (depth) loop body: load 96 bytes of filter data. ----
      DC_KERNEL_MULT_1 ":\n" // in Loop: Header=BB205_22 Depth=1
      "ldr x12, [sp, #16]\n" // 8-byte Folded Reload
      "str w7, [sp, #12]\n" // 4-byte Folded Spill
      "ldr x13, [sp, #88]\n" // 8-byte Folded Reload
      "ldp q18, q7, [x12]\n"
      "ldp q19, q16, [x12, #32]\n"
      "ldp q20, q17, [x12, #64]\n"
      "add x12, x12, #96\n" // =96
      // Select the four-row path only when outbound_block_height == 4.
      "cmp w13, #4\n" // =4
      "str x12, [sp, #16]\n" // 8-byte Folded Spill
      "mov x12, xzr\n"
      "b.ne " DC_KERNEL_MULT_12 "f\n"
      // %bb.2: // in Loop: Header=BB205_22 Depth=1
      "ldp x19, x13, [sp, #32]\n" // 16-byte Folded Reload
      "str x13, [sp, #120]\n" // 8-byte Folded Spill
      "b " DC_KERNEL_MULT_11 "f\n"
      // ---- Four-row path: per-pass setup (x12 = pass index, 0 or 1). ----
      DC_KERNEL_MULT_3 ":\n" // in Loop: Header=BB205_11 Depth=2
      "str x12, [sp, #112]\n" // 8-byte Folded Spill
      "ldr w12, [%[scratch_block_data]]\n"
      "add %[output_block_data], %[scratch_block_data], x11\n"
      "ldr x7, [sp, #72]\n" // 8-byte Folded Reload
      "ldr w6, [%[scratch_block_data], x8]\n"
      "fmov s21, w12\n"
      "mov v21.s[1], w12\n"
      "ld1 { v21.s }[2], [%[output_block_data]]\n"
      "ldr %[output_block_data], [sp, #120]\n" // 8-byte Folded Reload
      "ldr w7, [%[scratch_block_data], x7]\n"
      "fmov s23, w6\n"
      "mov v23.s[1], w6\n"
      // v22 = 16 bytes of bias; every accumulator is initialised from it.
      "ldr q22, [%[output_block_data]]\n"
      "ldr %[output_block_data], [sp, #48]\n" // 8-byte Folded Reload
      "mov v23.s[2], w7\n"
      "dup v8.4s, w7\n"
      "dup v31.4s, w6\n"
      "ldr w3, [%[scratch_block_data], %[output_block_data]]\n"
      "mov v23.s[3], w6\n"
      "ldp x7, x6, [sp, #56]\n" // 16-byte Folded Reload
      "mov v28.16b, v22.16b\n"
      "fmov s24, w3\n"
      "mov v24.s[1], w3\n"
      "ld1 { v24.s }[2], [x6]\n"
      "ldr x6, [sp, #96]\n" // 8-byte Folded Reload
      "mov v29.16b, v22.16b\n"
      "mov v30.16b, v22.16b\n"
      ".word 0x4e9f969c // sdot v28.4s, v20.16b, v31.16b\n"
      ".word 0x4e9f967d // sdot v29.4s, v19.16b, v31.16b\n"
      ".word 0x4e9f965e // sdot v30.4s, v18.16b, v31.16b\n"
      "mov v31.16b, v22.16b\n"
      "mov x13, xzr\n"
      // v25..v27 = filter registers with each 32-bit lane shifted left by 8.
      "shl v25.4s, v18.4s, #8\n"
      "shl v26.4s, v19.4s, #8\n"
      "shl v27.4s, v20.4s, #8\n"
      "mov v21.s[3], w12\n"
      "mov v24.s[3], w3\n"
      ".word 0x4e88965f // sdot v31.4s, v18.16b, v8.16b\n"
      "mov x12, x19\n"
      "b " DC_KERNEL_MULT_5 "f\n"
      // ---- Four-row path: full-width loop body. Each iteration finishes
      // and stores four result groups (v28, v28, v29, v8), loading the next
      // 4 bytes of each scratch row in between. ----
      DC_KERNEL_MULT_4 ":\n" // in Loop: Header=BB205_5 Depth=3
      ".word 0x4f95e25c // sdot v28.4s, v18.16b, v21.4b[0]\n"
      ".word 0x4f95ea5d // sdot v29.4s, v18.16b, v21.4b[2]\n"
      ".word 0x4f97ea7e // sdot v30.4s, v19.16b, v23.4b[2]\n"
      ".word 0x4f95ea7c // sdot v28.4s, v19.16b, v21.4b[2]\n"
      ".word 0x4f98e27f // sdot v31.4s, v19.16b, v24.4b[0]\n"
      ".word 0x4f97ea9d // sdot v29.4s, v20.16b, v23.4b[2]\n"
      ".word 0x4f98e29e // sdot v30.4s, v20.16b, v24.4b[0]\n"
      "sqrdmulh v28.4s, v28.4s, v1.4s\n"
      ".word 0x4f98ea9f // sdot v31.4s, v20.16b, v24.4b[2]\n"
      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
      "sqrdmulh v30.4s, v30.4s, v1.4s\n"
      "sqrshl v28.4s, v28.4s, v2.4s\n"
      "sqrdmulh v31.4s, v31.4s, v1.4s\n"
      "sqrshl v29.4s, v29.4s, v2.4s\n"
      "sqrshl v30.4s, v30.4s, v2.4s\n"
      "sqxtn v28.4h, v28.4s\n"
      "sqrshl v31.4s, v31.4s, v2.4s\n"
      "sqxtn v30.4h, v30.4s\n"
      "sqxtn2 v28.8h, v29.4s\n"
      "sqxtn2 v30.8h, v31.4s\n"
      "sqadd v28.8h, v28.8h, v0.8h\n"
      "sqadd v29.8h, v30.8h, v0.8h\n"
      "sqxtun v28.8b, v28.8h\n"
      "sqxtun2 v28.16b, v29.8h\n"
      "umax v28.16b, v28.16b, v3.16b\n"
      "add %[output_block_data], x5, x12\n"
      "umin v28.16b, v28.16b, v4.16b\n"
      // One 32-bit lane per output row (row stride = output_height_stride).
      "str s28, [x6, x12]\n"
      "st1 { v28.s }[1], [%[output_block_data]]\n"
      "add %[output_block_data], %[function_params], x12\n"
      "st1 { v28.s }[2], [%[output_block_data]]\n"
      "add %[output_block_data], x21, x12\n"
      "st1 { v28.s }[3], [%[output_block_data]]\n"
      // Load the next 4 bytes of each of the six scratch rows.
      "add %[output_block_data], %[scratch_block_data], x13, lsl #2\n"
      "add %[output_block_data], x3, #4\n" // =4
      "ld1 { v21.s }[1], [%[output_block_data]]\n"
      "add %[output_block_data], x23, x13, lsl #2\n"
      "ld1 { v23.s }[1], [%[output_block_data]]\n"
      "add %[output_block_data], x22, x13, lsl #2\n"
      "ld1 { v24.s }[1], [%[output_block_data]]\n"
      "add %[output_block_data], x24, x13, lsl #2\n"
      "ld1 { v21.s }[3], [%[output_block_data]]\n"
      "add %[output_block_data], x14, x13, lsl #2\n"
      "ld1 { v23.s }[3], [%[output_block_data]]\n"
      "add %[output_block_data], x16, x13, lsl #2\n"
      "mov v28.16b, v22.16b\n"
      "ld1 { v24.s }[3], [%[output_block_data]]\n"
      "mov v29.16b, v22.16b\n"
      "mov v30.16b, v22.16b\n"
      ".word 0x4f95e33c // sdot v28.4s, v25.16b, v21.4b[0]\n"
      "mov v31.16b, v22.16b\n"
      ".word 0x4f95eb3d // sdot v29.4s, v25.16b, v21.4b[2]\n"
      ".word 0x4f97e33e // sdot v30.4s, v25.16b, v23.4b[0]\n"
      ".word 0x4f95eb5c // sdot v28.4s, v26.16b, v21.4b[2]\n"
      ".word 0x4f97eb3f // sdot v31.4s, v25.16b, v23.4b[2]\n"
      ".word 0x4f97e35d // sdot v29.4s, v26.16b, v23.4b[0]\n"
      ".word 0x4f97eb5e // sdot v30.4s, v26.16b, v23.4b[2]\n"
      ".word 0x4f97e37c // sdot v28.4s, v27.16b, v23.4b[0]\n"
      ".word 0x4f98e35f // sdot v31.4s, v26.16b, v24.4b[0]\n"
      ".word 0x4f97eb7d // sdot v29.4s, v27.16b, v23.4b[2]\n"
      ".word 0x4f98e37e // sdot v30.4s, v27.16b, v24.4b[0]\n"
      "sqrdmulh v28.4s, v28.4s, v1.4s\n"
      ".word 0x4f98eb7f // sdot v31.4s, v27.16b, v24.4b[2]\n"
      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
      "sqrdmulh v30.4s, v30.4s, v1.4s\n"
      "sqrshl v28.4s, v28.4s, v2.4s\n"
      "sqrdmulh v31.4s, v31.4s, v1.4s\n"
      "sqrshl v29.4s, v29.4s, v2.4s\n"
      "sqrshl v30.4s, v30.4s, v2.4s\n"
      "sqxtn v28.4h, v28.4s\n"
      "ldr %[output_block_data], [sp, #144]\n" // 8-byte Folded Reload
      "sqrshl v31.4s, v31.4s, v2.4s\n"
      "sqxtn v30.4h, v30.4s\n"
      "sqxtn2 v28.8h, v29.4s\n"
      "sqxtn2 v30.8h, v31.4s\n"
      "sqadd v28.8h, v28.8h, v0.8h\n"
      "sqadd v29.8h, v30.8h, v0.8h\n"
      "sqxtun v28.8b, v28.8h\n"
      "sqxtun2 v28.16b, v29.8h\n"
      "umax v28.16b, v28.16b, v3.16b\n"
      "add %[output_block_data], x3, x12\n"
      "umin v28.16b, v28.16b, v4.16b\n"
      // Second group: same four rows, one output_depth further along.
      "str s28, [%[filter_workspace], x12]\n"
      "st1 { v28.s }[1], [%[output_block_data]]\n"
      "ldr %[output_block_data], [sp, #152]\n" // 8-byte Folded Reload
      "mov v29.16b, v22.16b\n"
      "mov v30.16b, v22.16b\n"
      "mov v31.16b, v22.16b\n"
      "add %[output_block_data], x3, x12\n"
      "st1 { v28.s }[2], [%[output_block_data]]\n"
      "add %[output_block_data], x30, x12\n"
      "st1 { v28.s }[3], [%[output_block_data]]\n"
      "ushr v28.2d, v21.2d, #16\n"
      "ushr v9.2d, v23.2d, #16\n"
      ".word 0x4f9ce25d // sdot v29.4s, v18.16b, v28.4b[0]\n"
      "mov v8.16b, v22.16b\n"
      ".word 0x4f9cea5e // sdot v30.4s, v18.16b, v28.4b[2]\n"
      ".word 0x4f89e25f // sdot v31.4s, v18.16b, v9.4b[0]\n"
      ".word 0x4f9cea7d // sdot v29.4s, v19.16b, v28.4b[2]\n"
      "ushr v10.2d, v24.2d, #16\n"
      ".word 0x4f89ea48 // sdot v8.4s, v18.16b, v9.4b[2]\n"
      ".word 0x4f89e27e // sdot v30.4s, v19.16b, v9.4b[0]\n"
      ".word 0x4f89ea7f // sdot v31.4s, v19.16b, v9.4b[2]\n"
      ".word 0x4f89e29d // sdot v29.4s, v20.16b, v9.4b[0]\n"
      ".word 0x4f8ae268 // sdot v8.4s, v19.16b, v10.4b[0]\n"
      ".word 0x4f89ea9e // sdot v30.4s, v20.16b, v9.4b[2]\n"
      ".word 0x4f8ae29f // sdot v31.4s, v20.16b, v10.4b[0]\n"
      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
      ".word 0x4f8aea88 // sdot v8.4s, v20.16b, v10.4b[2]\n"
      "sqrdmulh v30.4s, v30.4s, v1.4s\n"
      "sqrdmulh v31.4s, v31.4s, v1.4s\n"
      "sqrshl v29.4s, v29.4s, v2.4s\n"
      "sqrdmulh v8.4s, v8.4s, v1.4s\n"
      "sqrshl v30.4s, v30.4s, v2.4s\n"
      "sqrshl v31.4s, v31.4s, v2.4s\n"
      "sqxtn v29.4h, v29.4s\n"
      "ldr %[output_block_data], [sp, #168]\n" // 8-byte Folded Reload
      "sqrshl v8.4s, v8.4s, v2.4s\n"
      "sqxtn v31.4h, v31.4s\n"
      "sqxtn2 v29.8h, v30.4s\n"
      "sqxtn2 v31.8h, v8.4s\n"
      "sqadd v29.8h, v29.8h, v0.8h\n"
      "sqadd v30.8h, v31.8h, v0.8h\n"
      "sqxtun v29.8b, v29.8h\n"
      "sqxtun2 v29.16b, v30.8h\n"
      "umax v29.16b, v29.16b, v3.16b\n"
      "add %[output_block_data], x3, x12\n"
      "umin v29.16b, v29.16b, v4.16b\n"
      // Third group: two output_depth steps along.
      "str s29, [%[bias_data], x12]\n"
      "st1 { v29.s }[1], [%[output_block_data]]\n"
      "add %[output_block_data], x29, x12\n"
      "mov v30.16b, v22.16b\n"
      "st1 { v29.s }[2], [%[output_block_data]]\n"
      "add %[output_block_data], x28, x12\n"
      "mov v31.16b, v22.16b\n"
      "mov v8.16b, v22.16b\n"
      ".word 0x4f9ce33e // sdot v30.4s, v25.16b, v28.4b[0]\n"
      "st1 { v29.s }[3], [%[output_block_data]]\n"
      "mov v29.16b, v22.16b\n"
      ".word 0x4f9ceb3f // sdot v31.4s, v25.16b, v28.4b[2]\n"
      ".word 0x4f89e328 // sdot v8.4s, v25.16b, v9.4b[0]\n"
      ".word 0x4f9ceb5e // sdot v30.4s, v26.16b, v28.4b[2]\n"
      ".word 0x4f89eb3d // sdot v29.4s, v25.16b, v9.4b[2]\n"
      ".word 0x4f89e35f // sdot v31.4s, v26.16b, v9.4b[0]\n"
      ".word 0x4f89eb48 // sdot v8.4s, v26.16b, v9.4b[2]\n"
      ".word 0x4f89e37e // sdot v30.4s, v27.16b, v9.4b[0]\n"
      ".word 0x4f8ae35d // sdot v29.4s, v26.16b, v10.4b[0]\n"
      ".word 0x4f89eb7f // sdot v31.4s, v27.16b, v9.4b[2]\n"
      ".word 0x4f8ae368 // sdot v8.4s, v27.16b, v10.4b[0]\n"
      "sqrdmulh v28.4s, v30.4s, v1.4s\n"
      ".word 0x4f8aeb7d // sdot v29.4s, v27.16b, v10.4b[2]\n"
      "sqrdmulh v30.4s, v31.4s, v1.4s\n"
      "sqrdmulh v31.4s, v8.4s, v1.4s\n"
      "sqrshl v28.4s, v28.4s, v2.4s\n"
      "sqrdmulh v29.4s, v29.4s, v1.4s\n"
      "sqrshl v30.4s, v30.4s, v2.4s\n"
      "sqrshl v31.4s, v31.4s, v2.4s\n"
      "sqxtn v28.4h, v28.4s\n"
      "sqrshl v29.4s, v29.4s, v2.4s\n"
      "sqxtn v31.4h, v31.4s\n"
      "sqxtn2 v28.8h, v30.4s\n"
      "sqxtn2 v31.8h, v29.4s\n"
      "sqadd v28.8h, v28.8h, v0.8h\n"
      "sqadd v29.8h, v31.8h, v0.8h\n"
      "sqxtun v28.8b, v28.8h\n"
      "sqxtun2 v28.16b, v29.8h\n"
      "umax v28.16b, v28.16b, v3.16b\n"
      "add %[output_block_data], x27, x12\n"
      "umin v8.16b, v28.16b, v4.16b\n"
      // Fourth group: three output_depth steps along.
      "str s8, [x7, x12]\n"
      "st1 { v8.s }[1], [%[output_block_data]]\n"
      "ldr %[output_block_data], [sp, #128]\n" // 8-byte Folded Reload
      "mov v28.16b, v22.16b\n"
      "mov v29.16b, v22.16b\n"
      "mov v30.16b, v22.16b\n"
      "add %[output_block_data], x3, x12\n"
      "st1 { v8.s }[2], [%[output_block_data]]\n"
      "ldr %[output_block_data], [sp, #136]\n" // 8-byte Folded Reload
      "mov v31.16b, v22.16b\n"
      "ushr v23.2d, v23.2d, #32\n"
      "add x13, x13, #1\n" // =1
      "add %[output_block_data], x3, x12\n"
      "ushr v21.2d, v21.2d, #32\n"
      "ushr v24.2d, v24.2d, #32\n"
      // Start accumulating the first group of the next iteration.
      ".word 0x4f97e29c // sdot v28.4s, v20.16b, v23.4b[0]\n"
      ".word 0x4f97e27d // sdot v29.4s, v19.16b, v23.4b[0]\n"
      ".word 0x4f97e25e // sdot v30.4s, v18.16b, v23.4b[0]\n"
      ".word 0x4f97ea5f // sdot v31.4s, v18.16b, v23.4b[2]\n"
      "st1 { v8.s }[3], [%[output_block_data]]\n"
      // Advance the output offset by 4 * output_depth.
      "add x12, x12, x10\n"
      DC_KERNEL_MULT_5 ":\n" // Parent Loop BB205_22 Depth=1
      // Parent Loop BB205_11 Depth=2
      // => This Inner Loop Header: Depth=3
      "cmp w13, w9\n"
      "b.lt " DC_KERNEL_MULT_4 "b\n"
      // %bb.6: // in Loop: Header=BB205_11 Depth=2
      // Advance the bias pointer by 16 bytes for the next pass, then run the
      // residual loop only if a width iteration is left (w13 < w25).
      "ldr %[output_block_data], [sp, #120]\n" // 8-byte Folded Reload
      "cmp w13, w25\n"
      "str x19, [sp, #104]\n" // 8-byte Folded Spill
      "add %[output_block_data], x3, #16\n" // =16
      "str %[output_block_data], [sp, #120]\n" // 8-byte Folded Spill
      "b.ge " DC_KERNEL_MULT_10 "f\n"
      // %bb.7: // in Loop: Header=BB205_11 Depth=2
      "add x7, %[scratch_block_data], x13, lsl #2\n"
      "add x19, x23, x13, lsl #2\n"
      "ld1 { v23.s }[1], [x19]\n"
      "add x19, x22, x13, lsl #2\n"
      "add x7, x7, #4\n" // =4
      "ld1 { v24.s }[1], [x19]\n"
      "ld1 { v21.s }[1], [x7]\n"
      "add x19, x24, x13, lsl #2\n"
      "add x7, x14, x13, lsl #2\n"
      "add x13, x16, x13, lsl #2\n"
      "ldr x20, [sp, #96]\n" // 8-byte Folded Reload
      "ld1 { v23.s }[3], [x7]\n"
      "ld1 { v24.s }[3], [x13]\n"
      "ld1 { v21.s }[3], [x19]\n"
      "mov %[output_block_data], xzr\n"
      "mov w6, wzr\n"
      "add x13, x21, x12\n"
      "add x7, %[function_params], x12\n"
      "add x19, x5, x12\n"
      "add x12, x20, x12\n"
      "b " DC_KERNEL_MULT_9 "f\n"
      // ---- Four-row path: residual-width loop, one pixel per iteration
      // (w6 counts up to output_residual_width in w17). ----
      DC_KERNEL_MULT_8 ":\n" // in Loop: Header=BB205_9 Depth=3
      ".word 0x4f95e25c // sdot v28.4s, v18.16b, v21.4b[0]\n"
      ".word 0x4f95ea5d // sdot v29.4s, v18.16b, v21.4b[2]\n"
      ".word 0x4f97ea7e // sdot v30.4s, v19.16b, v23.4b[2]\n"
      ".word 0x4f95ea7c // sdot v28.4s, v19.16b, v21.4b[2]\n"
      ".word 0x4f98e27f // sdot v31.4s, v19.16b, v24.4b[0]\n"
      ".word 0x4f97ea9d // sdot v29.4s, v20.16b, v23.4b[2]\n"
      ".word 0x4f98e29e // sdot v30.4s, v20.16b, v24.4b[0]\n"
      "sqrdmulh v25.4s, v28.4s, v1.4s\n"
      ".word 0x4f98ea9f // sdot v31.4s, v20.16b, v24.4b[2]\n"
      "sqrdmulh v26.4s, v29.4s, v1.4s\n"
      "sqrdmulh v27.4s, v30.4s, v1.4s\n"
      "sqrshl v25.4s, v25.4s, v2.4s\n"
      "sqrdmulh v28.4s, v31.4s, v1.4s\n"
      "sqrshl v26.4s, v26.4s, v2.4s\n"
      "sqrshl v27.4s, v27.4s, v2.4s\n"
      "sqxtn v25.4h, v25.4s\n"
      "sqrshl v28.4s, v28.4s, v2.4s\n"
      "sqxtn v27.4h, v27.4s\n"
      "sqxtn2 v25.8h, v26.4s\n"
      "sqxtn2 v27.8h, v28.4s\n"
      "sqadd v25.8h, v25.8h, v0.8h\n"
      "sqadd v26.8h, v27.8h, v0.8h\n"
      "sqxtun v25.8b, v25.8h\n"
      "sqxtun2 v25.16b, v26.8h\n"
      "umax v25.16b, v25.16b, v3.16b\n"
      "add x20, x19, %[output_block_data]\n"
      "umin v25.16b, v25.16b, v4.16b\n"
      "str s25, [x12, %[output_block_data]]\n"
      "st1 { v25.s }[1], [x20]\n"
      "add x20, x7, %[output_block_data]\n"
      "st1 { v25.s }[2], [x20]\n"
      "add x20, x13, %[output_block_data]\n"
      // Shift the source data along by one byte for the next pixel.
      "ushr v23.2d, v23.2d, #8\n"
      "mov v28.16b, v22.16b\n"
      "mov v29.16b, v22.16b\n"
      "mov v30.16b, v22.16b\n"
      "mov v31.16b, v22.16b\n"
      "add w6, w6, #1\n" // =1
      "ushr v21.2d, v21.2d, #8\n"
      "ushr v24.2d, v24.2d, #8\n"
      "st1 { v25.s }[3], [x20]\n"
      ".word 0x4f97e29c // sdot v28.4s, v20.16b, v23.4b[0]\n"
      ".word 0x4f97e27d // sdot v29.4s, v19.16b, v23.4b[0]\n"
      ".word 0x4f97e25e // sdot v30.4s, v18.16b, v23.4b[0]\n"
      ".word 0x4f97ea5f // sdot v31.4s, v18.16b, v23.4b[2]\n"
      "add %[output_block_data], x3, x15\n"
      DC_KERNEL_MULT_9 ":\n" // Parent Loop BB205_22 Depth=1
      // Parent Loop BB205_11 Depth=2
      // => This Inner Loop Header: Depth=3
      "cmp w6, w17\n"
      "b.lt " DC_KERNEL_MULT_8 "b\n"
      // ---- Four-row path: end of pass. Switch to the second half of the
      // filter registers and move the output offset on by 4 bytes. ----
      DC_KERNEL_MULT_10 ":\n" // in Loop: Header=BB205_11 Depth=2
      "ldp x19, x12, [sp, #104]\n" // 16-byte Folded Reload
      "mov v20.16b, v17.16b\n"
      "mov v19.16b, v16.16b\n"
      "mov v18.16b, v7.16b\n"
      "add x12, x12, #1\n" // =1
      "add x19, x19, #4\n" // =4
      DC_KERNEL_MULT_11 ":\n" // Parent Loop BB205_22 Depth=1
      // => This Loop Header: Depth=2
      // Child Loop BB205_5 Depth 3
      // Child Loop BB205_9 Depth 3
      "cmp x12, #2\n" // =2
      "b.ne " DC_KERNEL_MULT_3 "b\n"
      "b " DC_KERNEL_MULT_21 "f\n"
      // ---- General path (outbound_block_height != 4): v21/v22 hold 32
      // bytes of bias, [sp, #120] tracks the current output row. ----
      DC_KERNEL_MULT_12 ":\n" // in Loop: Header=BB205_22 Depth=1
      "ldr x13, [sp, #40]\n" // 8-byte Folded Reload
      "ldp q21, q22, [x13]\n"
      "ldr x13, [sp, #24]\n" // 8-byte Folded Reload
      "str x13, [sp, #120]\n" // 8-byte Folded Spill
      "b " DC_KERNEL_MULT_20 "f\n"
      // Per-row setup: x6 = scratch + row * scratch row stride.
      DC_KERNEL_MULT_13 ":\n" // in Loop: Header=BB205_20 Depth=2
      "madd x6, x12, x11, %[scratch_block_data]\n"
      "ldr w13, [x6]\n"
      "add x7, x6, x11\n"
      "mov w3, wzr\n"
      "fmov s23, w13\n"
      "mov v23.s[1], w13\n"
      "ld1 { v23.s }[2], [x7]\n"
      "add x7, x6, x8\n"
      "ld1r { v24.4s }, [x7]\n"
      "ldr x7, [sp, #120]\n" // 8-byte Folded Reload
      "mov v23.s[3], w13\n"
      "b " DC_KERNEL_MULT_18 "f\n"
      // Per-width-iteration setup: load the next 4 bytes of three scratch
      // rows; w13 = pixel count for this iteration (w17 when w3 == w26,
      // otherwise 4).
      DC_KERNEL_MULT_14 ":\n" // in Loop: Header=BB205_18 Depth=3
      "add x6, x6, #4\n" // =4
      "mov x13, x6\n"
      "ld1 { v23.s }[1], [x13], x8\n"
      "add x20, x6, x11\n"
      "cmp w3, w26\n"
      "mov w19, wzr\n"
      "ld1 { v23.s }[3], [x20]\n"
      "ld1 { v24.s }[1], [x13]\n"
      "orr w13, wzr, #0x4\n"
      "csel w13, w17, w13, eq\n"
      "b " DC_KERNEL_MULT_16 "f\n"
      // Innermost loop: one pixel, 8 output bytes per iteration.
      DC_KERNEL_MULT_15 ":\n" // in Loop: Header=BB205_16 Depth=4
      "mov v25.16b, v21.16b\n"
      "mov v26.16b, v22.16b\n"
      ".word 0x4f97e259 // sdot v25.4s, v18.16b, v23.4b[0]\n"
      ".word 0x4f97e0fa // sdot v26.4s, v7.16b, v23.4b[0]\n"
      ".word 0x4f97ea79 // sdot v25.4s, v19.16b, v23.4b[2]\n"
      ".word 0x4f97ea1a // sdot v26.4s, v16.16b, v23.4b[2]\n"
      ".word 0x4f98e299 // sdot v25.4s, v20.16b, v24.4b[0]\n"
      ".word 0x4f98e23a // sdot v26.4s, v17.16b, v24.4b[0]\n"
      "sqrdmulh v25.4s, v25.4s, v1.4s\n"
      "sqrdmulh v26.4s, v26.4s, v1.4s\n"
      "sqrshl v25.4s, v25.4s, v2.4s\n"
      "sqrshl v26.4s, v26.4s, v2.4s\n"
      "sqxtn v25.4h, v25.4s\n"
      "sqxtn2 v25.8h, v26.4s\n"
      "sqadd v25.8h, v25.8h, v0.8h\n"
      "sqxtun v25.8b, v25.8h\n"
      "umax v25.8b, v25.8b, v5.8b\n"
      "umin v25.8b, v25.8b, v6.8b\n"
      "ushr v23.2d, v23.2d, #8\n"
      "ushr v24.2d, v24.2d, #8\n"
      "str d25, [x7]\n"
      "add x7, x7, x15\n"
      "add w19, w19, #1\n" // =1
      DC_KERNEL_MULT_16 ":\n" // Parent Loop BB205_22 Depth=1
      // Parent Loop BB205_20 Depth=2
      // Parent Loop BB205_18 Depth=3
      // => This Inner Loop Header: Depth=4
      "cmp w19, w13\n"
      "b.lt " DC_KERNEL_MULT_15 "b\n"
      // %bb.17: // in Loop: Header=BB205_18 Depth=3
      "add w3, w3, #1\n" // =1
      DC_KERNEL_MULT_18 ":\n" // Parent Loop BB205_22 Depth=1
      // Parent Loop BB205_20 Depth=2
      // => This Loop Header: Depth=3
      // Child Loop BB205_16 Depth 4
      "cmp w3, w25\n"
      "b.lt " DC_KERNEL_MULT_14 "b\n"
      // %bb.19: // in Loop: Header=BB205_20 Depth=2
      // Next row: output row pointer += output_height_stride.
      "ldr x13, [sp, #80]\n" // 8-byte Folded Reload
      "ldr %[output_block_data], [sp, #120]\n" // 8-byte Folded Reload
      "add x12, x12, #1\n" // =1
      "add %[output_block_data], x3, x13\n"
      "str %[output_block_data], [sp, #120]\n" // 8-byte Folded Spill
      DC_KERNEL_MULT_20 ":\n" // Parent Loop BB205_22 Depth=1
      // => This Loop Header: Depth=2
      // Child Loop BB205_18 Depth 3
      // Child Loop BB205_16 Depth 4
      "ldr x13, [sp, #88]\n" // 8-byte Folded Reload
      "cmp x12, x13\n"
      "b.lt " DC_KERNEL_MULT_13 "b\n"
      // ---- End of outer iteration: bias += 32 bytes, output base += 8,
      // output offset += 8, counter += 1. ----
      DC_KERNEL_MULT_21 ":\n" // in Loop: Header=BB205_22 Depth=1
      "ldr x12, [sp, #40]\n" // 8-byte Folded Reload
      "ldr w7, [sp, #12]\n" // 4-byte Folded Reload
      "add x12, x12, #32\n" // =32
      "str x12, [sp, #40]\n" // 8-byte Folded Spill
      "ldr x12, [sp, #24]\n" // 8-byte Folded Reload
      "add w7, w7, #1\n" // =1
      "add x12, x12, #8\n" // =8
      "str x12, [sp, #24]\n" // 8-byte Folded Spill
      "ldr x12, [sp, #32]\n" // 8-byte Folded Reload
      "add x12, x12, #8\n" // =8
      "str x12, [sp, #32]\n" // 8-byte Folded Spill
      DC_KERNEL_MULT_22 ":\n" // =>This Loop Header: Depth=1
      // Child Loop BB205_20 Depth 2
      // Child Loop BB205_18 Depth 3
      // Child Loop BB205_16 Depth 4
      // Child Loop BB205_11 Depth 2
      // Child Loop BB205_5 Depth 3
      // Child Loop BB205_9 Depth 3
      "ldr w12, [sp, #8]\n" // 4-byte Folded Reload
      "cmp w7, w12\n"
      "b.lt " DC_KERNEL_MULT_1 "b\n"
      // %bb.23:
      // Compiled intrinsics total stack 266, now 176 for spillage only.
      "add sp, sp, #176\n" // =176, releases the spill area
      :
      // Outputs.
      [ scratch_block_data ] "+r"(scratch_block_data),
      [ filter_workspace ] "+r"(filter_workspace),
      [ bias_data ] "+r"(bias_data),
      [ output_block_data ] "+r"(output_block_data),
      // Read-write rather than input-only: the asm overwrites this register
      // once the parameters have been loaded, and input-only operands must
      // not be modified.
      [ function_params ] "+r"(function_params)
      :
      // Inputs: none, every operand register is overwritten by the asm.
      :
      // Clobbers.
      "cc", "memory",
      // We use these NEON registers.
      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
      "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31",
      // We use these general-purpose registers.
      "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
      "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",
      "x27", "x28", "x29", "x30");
#endif // __linux__
} // NOLINT(readability/fn_size) Manually unrolled.