static inline void KernelMacroBlockNeon()

in tensorflow/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h [7169:7833]


  // Depthwise-convolution macro-block kernel producing uint8 output, written
  // as hand-scheduled AArch64 NEON assembly. Dot products use SDOT, emitted as
  // raw `.word` encodings with the mnemonic kept in the assembly comment.
  // (Judging by the file name this is the 3x3-filter, no-depth-multiplier
  // variant; three filter vectors are applied per output in the code below.)
  //
  // Parameters:
  //   scratch_block_data: int8 input workspace, read as 16-byte vectors.
  //   filter_workspace:   packed filter data; six 16-byte vectors (96 bytes)
  //                       are consumed per iteration of the outer loop.
  //   bias_data:          int32 bias values; the pointer advances by 32 bytes
  //                       per iteration of the outer loop.
  //   output_block_data:  uint8 destination.
  //   function_params:    read through the DP_OFFSET_* byte offsets (and
  //                       offset 0) at the start of the assembly.
  //
  // Structure: the outer loop (label 26) runs while its counter is below the
  // value loaded from DP_OFFSET_DEPTH_MICRO_REPEATS. Each iteration picks one
  // of two paths depending on the value at DP_OFFSET_OUTBOUND_BLOCK_HEIGHT:
  //   * equal to 4: labels 3..13, which compute four output rows at a time;
  //   * otherwise:  labels 14..24, which compute one output row at a time.
  // Results are scaled with sqrdmulh/sqrshl, narrowed with saturation, offset,
  // and clamped with umax/umin against the quantized activation min/max.
  //
  // NOTE(review): besides the named operands, the assembly refers to w0/x0,
  // w2/x2 and x3 literally, so it relies on the operands being allocated to
  // x0..x4 in declaration order (see the mapping below). No constraint
  // enforces this - confirm that it holds for every supported toolchain.
  // NOTE(review): the whole body is guarded by __linux__; when that macro is
  // not defined this function does nothing.
  static inline void KernelMacroBlockNeon(
      const int8* scratch_block_data, const int8* filter_workspace,
      const int32* bias_data, uint8* output_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    // Note that argument registers may be reused after parameter loading.
    // x0 %[scratch_block_data]
    // x1 %[filter_workspace]
    // x2 %[bias_data]
    // x3 %[output_block_data]
    // x4 %[function_params]
    //
    // Numeric local labels used by the assembly; each is referenced as
    // "<n>f" (forward) or "<n>b" (backward).
#define DC_KERNEL_NO_MULT_1 "1"
#define DC_KERNEL_NO_MULT_2 "2"
#define DC_KERNEL_NO_MULT_3 "3"
#define DC_KERNEL_NO_MULT_4 "4"
#define DC_KERNEL_NO_MULT_5 "5"
#define DC_KERNEL_NO_MULT_6 "6"
#define DC_KERNEL_NO_MULT_7 "7"
#define DC_KERNEL_NO_MULT_8 "8"
#define DC_KERNEL_NO_MULT_9 "9"
#define DC_KERNEL_NO_MULT_10 "10"
#define DC_KERNEL_NO_MULT_11 "11"
#define DC_KERNEL_NO_MULT_12 "12"
#define DC_KERNEL_NO_MULT_13 "13"
#define DC_KERNEL_NO_MULT_14 "14"
#define DC_KERNEL_NO_MULT_15 "15"
#define DC_KERNEL_NO_MULT_16 "16"
#define DC_KERNEL_NO_MULT_17 "17"
#define DC_KERNEL_NO_MULT_18 "18"
#define DC_KERNEL_NO_MULT_19 "19"
#define DC_KERNEL_NO_MULT_20 "20"
#define DC_KERNEL_NO_MULT_21 "21"
#define DC_KERNEL_NO_MULT_22 "22"
#define DC_KERNEL_NO_MULT_23 "23"
#define DC_KERNEL_NO_MULT_24 "24"
#define DC_KERNEL_NO_MULT_25 "25"
#define DC_KERNEL_NO_MULT_26 "26"

#ifdef __linux__
    asm volatile(
        // Compiled code used block of 288 for spill out of total stack of 448.
        // However, two 4-byte spills were sneaked in to #360 and #364.
        // Spillage increased to 304 and these are mapped to #288 and #292.
        "sub    sp, sp, #304\n"  // =448
        // Load loop bounds, strides and quantization parameters, spilling the
        // ones that are needed again later.
        "ldp    w9, w14, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
        "ldpsw  x12, x21, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
        "ldrsw  x8, [%[function_params], #" STR(DP_OFFSET_INPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
        "ldrsw  x16, [%[function_params]]\n"
        "str    w9, [sp, #292]\n"  // 4-byte Folded Spill
        "ldr    w9, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
        "ldrb   w10, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "]\n"
        "lsl    x8, x8, #5\n"
        "str    x8, [sp, #8]\n"  // 8-byte Folded Spill
        "str    w9, [sp, #20]\n"  // 4-byte Folded Spill
        "ldr    w9, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
        "add    x8, x12, x12, lsl #1\n"
        "ldr    w5, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
        "add    x11, %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n"  // =32
        "str    w9, [sp, #288]\n"  // 4-byte Folded Spill
        "ldrb   w9, [%[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "]\n"
        "add    x15, %[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT) "\n"  // =36
        "add    x13, %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n"  // =28
        // v0 = output offset, v1 = output multiplier, v2 = output shift,
        // each broadcast to every lane.
        "ld1r   { v0.8h }, [x13]\n"
        "dup    v3.16b, w9\n"
        "dup    v5.8b, w9\n"
        "add    x9, x16, x16, lsl #1\n"
        "add    x7, x9, x8\n"
        "add    x28, x9, x12, lsl #1\n"
        // From here on the function_params register is reused as scratch.
        "add    %[function_params], x9, x12\n"
        "add    x9, %[output_block_data], x9\n"
        "add    x13, x12, x16, lsl #1\n"
        "str    x9, [sp, #112]\n"  // 8-byte Folded Spill
        "add    x9, x8, x16, lsl #1\n"
        "str    q3, [sp, #272]\n"  // 16-byte Folded Spill
        "dup    v3.16b, w10\n"
        "dup    v6.8b, w10\n"
        "lsl    x10, x16, #1\n"
        "add    x13, %[output_block_data], x13\n"
        "add    x29, %[output_block_data], x9\n"
        "add    x9, x21, x21, lsl #1\n"
        "ld1r   { v1.4s }, [x11]\n"
        "ld1r   { v2.4s }, [x15]\n"
        "add    x15, x16, x12, lsl #1\n"
        "add    x10, x10, x12, lsl #1\n"
        "str    x13, [sp, #200]\n"  // 8-byte Folded Spill
        "add    x13, x8, x16\n"
        "add    x22, %[output_block_data], x8\n"
        "add    x8, %[scratch_block_data], x21\n"
        "str    x9, [sp, #96]\n"  // 8-byte Folded Spill
        "add    x9, %[scratch_block_data], x9\n"
        "add    x17, x12, x16\n"
        "add    x15, %[output_block_data], x15\n"
        "add    x25, x8, #32\n"  // =32
        "add    x30, %[output_block_data], x10\n"
        "add    x8, x21, x21, lsl #2\n"
        "add    x10, x9, #32\n"  // =32
        "lsl    x9, x21, #1\n"
        "mov    x6, %[filter_workspace]\n"
        "mov    %[filter_workspace], xzr\n"
        "mov    w27, wzr\n"
        "add    x11, %[scratch_block_data], x21, lsl #1\n"
        "add    x23, %[scratch_block_data], x21, lsl #2\n"
        "str    x15, [sp, #192]\n"  // 8-byte Folded Spill
        "add    x15, %[output_block_data], x17\n"
        "str    x8, [sp, #104]\n"  // 8-byte Folded Spill
        "add    x8, %[scratch_block_data], x8\n"
        "str    x9, [sp, #176]\n"  // 8-byte Folded Spill
        "lsl    x9, x21, #2\n"
        "mov    x19, xzr\n"
        "str    x15, [sp, #184]\n"  // 8-byte Folded Spill
        "add    x23, x23, #32\n"  // =32
        "add    x24, x11, #32\n"  // =32
        "add    x26, %[output_block_data], x7\n"
        "mov    w7, wzr\n"
        "add    x27, %[output_block_data], x28\n"
        "add    x28, %[output_block_data], %[function_params]\n"
        "add    x15, %[output_block_data], x13\n"
        "mov    x13, xzr\n"
        "add    x8, x8, #32\n"  // =32
        "stp    x12, %[scratch_block_data], [sp, #120]\n"  // 16-byte Folded Spill
        "add    x11, %[scratch_block_data], #32\n"  // =32
        "mov    %[filter_workspace], x21\n"
        "str    x9, [sp, #88]\n"  // 8-byte Folded Spill
        "lsl    %[function_params], x16, #2\n"
        "add    x9, %[output_block_data], x16, lsl #1\n"
        "add    x21, %[output_block_data], x16\n"
        "add    x17, %[output_block_data], x12, lsl #1\n"
        "add    x12, %[output_block_data], x12\n"
        "str    q3, [sp, #256]\n"  // 16-byte Folded Spill
        "str    %[output_block_data], [sp, #64]\n"  // 8-byte Folded Spill
        "str    %[output_block_data], [sp, #136]\n"  // 8-byte Folded Spill
        "stp    d6, d5, [sp, #72]\n"  // 16-byte Folded Spill
        // Enter the outer loop at its condition check.
        "b      " DC_KERNEL_NO_MULT_26 "f\n"
        DC_KERNEL_NO_MULT_1 ":\n"  // in Loop: Header=BB225_26 Depth=1
        "str    w7, [sp, #36]\n"  // 4-byte Folded Spill
        "ldr    w0, [sp, #288]\n"  // 4-byte Folded Reload
        // Load six filter vectors and step the filter pointer past them.
        "ldp    q18, q7, [x6]\n"
        "ldp    q19, q16, [x6, #32]\n"
        "ldp    q20, q17, [x6, #64]\n"
        "cmp    w0, #4\n"  // =4
        "add    x6, x6, #96\n"  // =96
        "stp    x19, %[bias_data], [sp, #48]\n"  // 16-byte Folded Spill
        "str    x13, [sp, #40]\n"  // 8-byte Folded Spill
        "str    x6, [sp, #24]\n"  // 8-byte Folded Spill
        // Any block height other than 4 takes the row-at-a-time path.
        "b.ne   " DC_KERNEL_NO_MULT_14 "f\n"
        // %bb.2:        // in Loop: Header=BB225_26 Depth=1
        "mov    %[scratch_block_data], xzr\n"
        "mov    %[output_block_data], x13\n"
        "str    %[bias_data], [sp, #168]\n"  // 8-byte Folded Spill
        "b      " DC_KERNEL_NO_MULT_13 "f\n"
        DC_KERNEL_NO_MULT_3 ":\n"  // in Loop: Header=BB225_13 Depth=2
        // Four-row path set-up: load six input rows and the bias vector, and
        // spill copies of the three active filter vectors shifted left by 8
        // bits (sp+240, sp+224, sp+208).
        "ldr    x13, [sp, #128]\n"  // 8-byte Folded Reload
        "str    %[scratch_block_data], [sp, #160]\n"  // 8-byte Folded Spill
        "ldr    x6, [sp, #136]\n"  // 8-byte Folded Reload
        "shl    v3.4s, v18.4s, #8\n"
        "add    x13, x13, %[scratch_block_data], lsl #4\n"
        "ldr    %[scratch_block_data], [sp, #168]\n"  // 8-byte Folded Reload
        "ldr    q14, [x13]\n"
        "ldr    q23, [x13, %[filter_workspace]]\n"
        "str    q3, [sp, #240]\n"  // 16-byte Folded Spill
        "ldr    q21, [%[scratch_block_data]]\n"
        "ldr    %[scratch_block_data], [sp, #176]\n"  // 8-byte Folded Reload
        "shl    v3.4s, v19.4s, #8\n"
        "mov    w2, wzr\n"
        "mov    v31.16b, v21.16b\n"
        "ldr    q24, [x13, %[scratch_block_data]]\n"
        "ldr    %[scratch_block_data], [sp, #96]\n"  // 8-byte Folded Reload
        "mov    v8.16b, v21.16b\n"
        "mov    v9.16b, v21.16b\n"
        "mov    v10.16b, v21.16b\n"
        "ldr    q25, [x13, %[scratch_block_data]]\n"
        "ldr    %[scratch_block_data], [sp, #88]\n"  // 8-byte Folded Reload
        "str    q3, [sp, #224]\n"  // 16-byte Folded Spill
        "shl    v3.4s, v20.4s, #8\n"
        ".word 0x4e98969f  // sdot   v31.4s, v20.16b, v24.16b\n"
        "ldr    q26, [x13, %[scratch_block_data]]\n"
        "ldp    %[scratch_block_data], x7, [sp, #104]\n"  // 16-byte Folded Reload
        ".word 0x4e989668  // sdot   v8.4s, v19.16b, v24.16b\n"
        ".word 0x4e989649  // sdot   v9.4s, v18.16b, v24.16b\n"
        ".word 0x4e99964a  // sdot   v10.4s, v18.16b, v25.16b\n"
        "ldr    q27, [x13, %[scratch_block_data]]\n"
        "mov    x13, x19\n"
        "mov    %[scratch_block_data], %[output_block_data]\n"
        "str    q3, [sp, #208]\n"  // 16-byte Folded Spill
        "stp    %[output_block_data], x19, [sp, #144]\n"  // 16-byte Folded Spill
        "b      " DC_KERNEL_NO_MULT_5 "f\n"
        DC_KERNEL_NO_MULT_4 ":\n"  // in Loop: Header=BB225_5 Depth=3
        // Each iteration produces four groups of outputs, each group holding
        // one 4-byte result for each of four rows.
        ".word 0x4e8e965f  // sdot   v31.4s, v18.16b, v14.16b\n"
        ".word 0x4e979648  // sdot   v8.4s, v18.16b, v23.16b\n"
        ".word 0x4e999669  // sdot   v9.4s, v19.16b, v25.16b\n"
        ".word 0x4e97967f  // sdot   v31.4s, v19.16b, v23.16b\n"
        ".word 0x4e9a966a  // sdot   v10.4s, v19.16b, v26.16b\n"
        ".word 0x4e999688  // sdot   v8.4s, v20.16b, v25.16b\n"
        ".word 0x4e9a9689  // sdot   v9.4s, v20.16b, v26.16b\n"
        "sqrdmulh        v31.4s, v31.4s, v1.4s\n"
        ".word 0x4e9b968a  // sdot   v10.4s, v20.16b, v27.16b\n"
        "sqrdmulh        v8.4s, v8.4s, v1.4s\n"
        "sqrdmulh        v9.4s, v9.4s, v1.4s\n"
        "sqrshl v31.4s, v31.4s, v2.4s\n"
        "sqrdmulh        v10.4s, v10.4s, v1.4s\n"
        "sqrshl v8.4s, v8.4s, v2.4s\n"
        "sqrshl v9.4s, v9.4s, v2.4s\n"
        "sqxtn  v31.4h, v31.4s\n"
        "sqrshl v10.4s, v10.4s, v2.4s\n"
        "sqxtn  v9.4h, v9.4s\n"
        "sqxtn2 v31.8h, v8.4s\n"
        "sqxtn2 v9.8h, v10.4s\n"
        "sqadd  v31.8h, v31.8h, v0.8h\n"
        "sqadd  v8.8h, v9.8h, v0.8h\n"
        "sqxtun v31.8b, v31.8h\n"
        "sqxtun2        v31.16b, v8.8h\n"
        // q28 = activation max (sp+256), q6 = activation min (sp+272).
        "ldp    q28, q6, [sp, #256]\n"  // 32-byte Folded Reload
        "add    %[output_block_data], x12, %[scratch_block_data]\n"
        "ldr    q5, [sp, #208]\n"  // 16-byte Folded Reload
        "mov    v8.16b, v21.16b\n"
        "umax   v31.16b, v31.16b, v6.16b\n"
        "umin   v31.16b, v31.16b, v28.16b\n"
        "str    s31, [x6, %[scratch_block_data]]\n"
        "st1    { v31.s }[1], [%[output_block_data]]\n"
        "add    %[output_block_data], x17, %[scratch_block_data]\n"
        "st1    { v31.s }[2], [%[output_block_data]]\n"
        "add    %[output_block_data], x22, %[scratch_block_data]\n"
        "st1    { v31.s }[3], [%[output_block_data]]\n"
        // Second group: same input vectors, pre-shifted filter copies.
        "ldp    q30, q29, [sp, #224]\n"  // 32-byte Folded Reload
        "mov    v9.16b, v21.16b\n"
        "mov    v10.16b, v21.16b\n"
        "mov    v11.16b, v21.16b\n"
        ".word 0x4e8e97a8  // sdot   v8.4s, v29.16b, v14.16b\n"
        ".word 0x4e9797a9  // sdot   v9.4s, v29.16b, v23.16b\n"
        ".word 0x4e9897aa  // sdot   v10.4s, v29.16b, v24.16b\n"
        ".word 0x4e9797c8  // sdot   v8.4s, v30.16b, v23.16b\n"
        ".word 0x4e9997ab  // sdot   v11.4s, v29.16b, v25.16b\n"
        ".word 0x4e9897c9  // sdot   v9.4s, v30.16b, v24.16b\n"
        ".word 0x4e9997ca  // sdot   v10.4s, v30.16b, v25.16b\n"
        ".word 0x4e9894a8  // sdot   v8.4s, v5.16b, v24.16b\n"
        ".word 0x4e9a97cb  // sdot   v11.4s, v30.16b, v26.16b\n"
        ".word 0x4e9994a9  // sdot   v9.4s, v5.16b, v25.16b\n"
        ".word 0x4e9a94aa  // sdot   v10.4s, v5.16b, v26.16b\n"
        "sqrdmulh        v22.4s, v8.4s, v1.4s\n"
        "rev32  v12.8h, v23.8h\n"
        "rev32  v13.8h, v24.8h\n"
        ".word 0x4e9b94ab  // sdot   v11.4s, v5.16b, v27.16b\n"
        "sqrdmulh        v23.4s, v9.4s, v1.4s\n"
        "sqrdmulh        v24.4s, v10.4s, v1.4s\n"
        "sqrshl v22.4s, v22.4s, v2.4s\n"
        "rev32  v4.8h, v25.8h\n"
        "sqrdmulh        v25.4s, v11.4s, v1.4s\n"
        "sqrshl v8.4s, v23.4s, v2.4s\n"
        "sqrshl v23.4s, v24.4s, v2.4s\n"
        "sqxtn  v10.4h, v22.4s\n"
        "ldr    %[output_block_data], [sp, #184]\n"  // 8-byte Folded Reload
        "rev32  v15.8h, v26.8h\n"
        "rev32  v3.8h, v27.8h\n"
        "sqrshl v9.4s, v25.4s, v2.4s\n"
        "sqxtn  v11.4h, v23.4s\n"
        // Load the next 16 bytes of each of the six input rows.
        "ldr    q22, [x11, x13]\n"
        "ldr    q23, [x25, x13]\n"
        "ldr    q24, [x24, x13]\n"
        "ldr    q25, [x10, x13]\n"
        "ldr    q26, [x23, x13]\n"
        "ldr    q27, [x8, x13]\n"
        "sqxtn2 v10.8h, v8.4s\n"
        "sqxtn2 v11.8h, v9.4s\n"
        "sqadd  v8.8h, v10.8h, v0.8h\n"
        "sqadd  v9.8h, v11.8h, v0.8h\n"
        "sqxtun v8.8b, v8.8h\n"
        "sqxtun2        v8.16b, v9.8h\n"
        "umax   v8.16b, v8.16b, v6.16b\n"
        "add    %[output_block_data], x3, %[scratch_block_data]\n"
        "umin   v8.16b, v8.16b, v28.16b\n"
        "str    s8, [x21, %[scratch_block_data]]\n"
        "st1    { v8.s }[1], [%[output_block_data]]\n"
        "ldr    %[output_block_data], [sp, #192]\n"  // 8-byte Folded Reload
        "rev32  v31.8h, v14.8h\n"
        "mov    v9.16b, v21.16b\n"
        // Third group: combine the half-word-swapped old vectors with the
        // newly loaded ones (rev32 + trn1).
        "trn1   v31.8h, v31.8h, v22.8h\n"
        "add    %[output_block_data], x3, %[scratch_block_data]\n"
        "st1    { v8.s }[2], [%[output_block_data]]\n"
        "add    %[output_block_data], x15, %[scratch_block_data]\n"
        "mov    v10.16b, v21.16b\n"
        "mov    v11.16b, v21.16b\n"
        "trn1   v12.8h, v12.8h, v23.8h\n"
        "trn1   v13.8h, v13.8h, v24.8h\n"
        ".word 0x4e9f9649  // sdot   v9.4s, v18.16b, v31.16b\n"
        "st1    { v8.s }[3], [%[output_block_data]]\n"
        "mov    v8.16b, v21.16b\n"
        "trn1   v14.8h, v4.8h, v25.8h\n"
        ".word 0x4e8c964a  // sdot   v10.4s, v18.16b, v12.16b\n"
        ".word 0x4e8d964b  // sdot   v11.4s, v18.16b, v13.16b\n"
        ".word 0x4e8c9669  // sdot   v9.4s, v19.16b, v12.16b\n"
        "trn1   v15.8h, v15.8h, v26.8h\n"
        ".word 0x4e8e9648  // sdot   v8.4s, v18.16b, v14.16b\n"
        ".word 0x4e8d966a  // sdot   v10.4s, v19.16b, v13.16b\n"
        ".word 0x4e8e966b  // sdot   v11.4s, v19.16b, v14.16b\n"
        ".word 0x4e8d9689  // sdot   v9.4s, v20.16b, v13.16b\n"
        "trn1   v3.8h, v3.8h, v27.8h\n"
        ".word 0x4e8f9668  // sdot   v8.4s, v19.16b, v15.16b\n"
        ".word 0x4e8e968a  // sdot   v10.4s, v20.16b, v14.16b\n"
        ".word 0x4e8f968b  // sdot   v11.4s, v20.16b, v15.16b\n"
        "sqrdmulh        v9.4s, v9.4s, v1.4s\n"
        ".word 0x4e839688  // sdot   v8.4s, v20.16b, v3.16b\n"
        "sqrdmulh        v10.4s, v10.4s, v1.4s\n"
        "sqrdmulh        v11.4s, v11.4s, v1.4s\n"
        "sqrshl v9.4s, v9.4s, v2.4s\n"
        "sqrdmulh        v8.4s, v8.4s, v1.4s\n"
        "sqrshl v10.4s, v10.4s, v2.4s\n"
        "sqrshl v11.4s, v11.4s, v2.4s\n"
        "sqxtn  v9.4h, v9.4s\n"
        "ldr    %[output_block_data], [sp, #200]\n"  // 8-byte Folded Reload
        "sqrshl v8.4s, v8.4s, v2.4s\n"
        "sqxtn  v11.4h, v11.4s\n"
        "sqxtn2 v9.8h, v10.4s\n"
        "sqxtn2 v11.8h, v8.4s\n"
        "sqadd  v8.8h, v9.8h, v0.8h\n"
        "sqadd  v9.8h, v11.8h, v0.8h\n"
        "sqxtun v8.8b, v8.8h\n"
        "sqxtun2        v8.16b, v9.8h\n"
        "umax   v8.16b, v8.16b, v6.16b\n"
        "add    %[output_block_data], x3, %[scratch_block_data]\n"
        "umin   v8.16b, v8.16b, v28.16b\n"
        "str    s8, [x9, %[scratch_block_data]]\n"
        "st1    { v8.s }[1], [%[output_block_data]]\n"
        "add    %[output_block_data], x30, %[scratch_block_data]\n"
        "st1    { v8.s }[2], [%[output_block_data]]\n"
        "add    %[output_block_data], x29, %[scratch_block_data]\n"
        "mov    v9.16b, v21.16b\n"
        "mov    v10.16b, v21.16b\n"
        "mov    v11.16b, v21.16b\n"
        "st1    { v8.s }[3], [%[output_block_data]]\n"
        "mov    v8.16b, v21.16b\n"
        // Fourth group: combined vectors with the pre-shifted filter copies.
        ".word 0x4e9f97a9  // sdot   v9.4s, v29.16b, v31.16b\n"
        ".word 0x4e8c97aa  // sdot   v10.4s, v29.16b, v12.16b\n"
        ".word 0x4e8d97ab  // sdot   v11.4s, v29.16b, v13.16b\n"
        ".word 0x4e8e97a8  // sdot   v8.4s, v29.16b, v14.16b\n"
        ".word 0x4e8c97c9  // sdot   v9.4s, v30.16b, v12.16b\n"
        ".word 0x4e8d97ca  // sdot   v10.4s, v30.16b, v13.16b\n"
        ".word 0x4e8e97cb  // sdot   v11.4s, v30.16b, v14.16b\n"
        ".word 0x4e8f97c8  // sdot   v8.4s, v30.16b, v15.16b\n"
        ".word 0x4e8d94a9  // sdot   v9.4s, v5.16b, v13.16b\n"
        ".word 0x4e8e94aa  // sdot   v10.4s, v5.16b, v14.16b\n"
        ".word 0x4e8f94ab  // sdot   v11.4s, v5.16b, v15.16b\n"
        ".word 0x4e8394a8  // sdot   v8.4s, v5.16b, v3.16b\n"
        "sqrdmulh        v3.4s, v9.4s, v1.4s\n"
        "sqrdmulh        v31.4s, v10.4s, v1.4s\n"
        "sqrdmulh        v9.4s, v11.4s, v1.4s\n"
        "sqrshl v3.4s, v3.4s, v2.4s\n"
        "sqrdmulh        v8.4s, v8.4s, v1.4s\n"
        "sqrshl v31.4s, v31.4s, v2.4s\n"
        "sqrshl v9.4s, v9.4s, v2.4s\n"
        "sqxtn  v3.4h, v3.4s\n"
        "sqrshl v8.4s, v8.4s, v2.4s\n"
        "sqxtn  v9.4h, v9.4s\n"
        "sqxtn2 v3.8h, v31.4s\n"
        "sqxtn2 v9.8h, v8.4s\n"
        "sqadd  v3.8h, v3.8h, v0.8h\n"
        "sqadd  v31.8h, v9.8h, v0.8h\n"
        "sqxtun v3.8b, v3.8h\n"
        "sqxtun2        v3.16b, v31.8h\n"
        "umax   v3.16b, v3.16b, v6.16b\n"
        "add    %[output_block_data], x28, %[scratch_block_data]\n"
        "umin   v3.16b, v3.16b, v28.16b\n"
        "str    s3, [x7, %[scratch_block_data]]\n"
        "st1    { v3.s }[1], [%[output_block_data]]\n"
        "add    %[output_block_data], x27, %[scratch_block_data]\n"
        "st1    { v3.s }[2], [%[output_block_data]]\n"
        "add    %[output_block_data], x26, %[scratch_block_data]\n"
        // Re-seed the accumulators with the bias and start the next
        // iteration's dot products.
        "mov    v31.16b, v21.16b\n"
        "mov    v8.16b, v21.16b\n"
        "mov    v9.16b, v21.16b\n"
        "mov    v10.16b, v21.16b\n"
        "add    w2, w2, #1\n"  // =1
        ".word 0x4e98969f  // sdot   v31.4s, v20.16b, v24.16b\n"
        ".word 0x4e989668  // sdot   v8.4s, v19.16b, v24.16b\n"
        ".word 0x4e989649  // sdot   v9.4s, v18.16b, v24.16b\n"
        ".word 0x4e99964a  // sdot   v10.4s, v18.16b, v25.16b\n"
        "st1    { v3.s }[3], [%[output_block_data]]\n"
        "add    %[scratch_block_data], x0, %[function_params]\n"
        "add    x13, x13, #32\n"  // =32
        "mov    v14.16b, v22.16b\n"
        DC_KERNEL_NO_MULT_5 ":\n"  // Parent Loop BB225_26 Depth=1
        // Parent Loop BB225_13 Depth=2
        // =>  This Inner Loop Header: Depth=3
        "cmp    w2, w14\n"
        "b.lt   " DC_KERNEL_NO_MULT_4 "b\n"
        // %bb.6:        // in Loop: Header=BB225_13 Depth=2
        // Advance the spilled bias pointer by 16 bytes and restore the 8-byte
        // clamp vectors (d6 = max, d5 = min).
        "ldr    %[bias_data], [sp, #168]\n"  // 8-byte Folded Reload
        "ldp    d6, d5, [sp, #72]\n"  // 16-byte Folded Reload
        "cmp    w5, #0\n"  // =0
        "add    %[bias_data], x2, #16\n"  // =16
        "str    %[bias_data], [sp, #168]\n"  // 8-byte Folded Spill
        "b.le   " DC_KERNEL_NO_MULT_12 "f\n"
        // %bb.7:        // in Loop: Header=BB225_13 Depth=2
        // Residual handling: the "next" input vectors stay zero unless the
        // residual width (w5) is at least 3.
        "movi   v28.16b, #0\n"
        "cmp    w5, #3\n"  // =3
        "movi   v29.16b, #0\n"
        "movi   v30.16b, #0\n"
        "movi   v11.16b, #0\n"
        "movi   v12.16b, #0\n"
        "movi   v13.16b, #0\n"
        "b.lt   " DC_KERNEL_NO_MULT_9 "f\n"
        // %bb.8:        // in Loop: Header=BB225_13 Depth=2
        "ldr    q28, [x11, x13]\n"
        "ldr    q29, [x25, x13]\n"
        "ldr    q30, [x24, x13]\n"
        "ldr    q11, [x10, x13]\n"
        "ldr    q12, [x23, x13]\n"
        "ldr    q13, [x8, x13]\n"
        DC_KERNEL_NO_MULT_9 ":\n"  // in Loop: Header=BB225_13 Depth=2
        "ldr    x19, [sp, #136]\n"  // 8-byte Folded Reload
        "mov    x13, xzr\n"
        "mov    w2, wzr\n"
        "add    %[output_block_data], x22, %[scratch_block_data]\n"
        "add    x6, x17, %[scratch_block_data]\n"
        "add    x7, x12, %[scratch_block_data]\n"
        "add    %[scratch_block_data], x19, x0\n"
        "b      " DC_KERNEL_NO_MULT_11 "f\n"
        DC_KERNEL_NO_MULT_10 ":\n"  // in Loop: Header=BB225_11 Depth=3
        // One group of outputs per iteration; afterwards every input vector
        // is shifted right by one byte per 32-bit lane, with the top byte
        // refilled from the matching "next" vector (ushr + sli).
        ".word 0x4e8e965f  // sdot   v31.4s, v18.16b, v14.16b\n"
        ".word 0x4e979648  // sdot   v8.4s, v18.16b, v23.16b\n"
        ".word 0x4e999669  // sdot   v9.4s, v19.16b, v25.16b\n"
        ".word 0x4e97967f  // sdot   v31.4s, v19.16b, v23.16b\n"
        ".word 0x4e9a966a  // sdot   v10.4s, v19.16b, v26.16b\n"
        ".word 0x4e999688  // sdot   v8.4s, v20.16b, v25.16b\n"
        ".word 0x4e9a9689  // sdot   v9.4s, v20.16b, v26.16b\n"
        "sqrdmulh        v3.4s, v31.4s, v1.4s\n"
        ".word 0x4e9b968a  // sdot   v10.4s, v20.16b, v27.16b\n"
        "sqrdmulh        v31.4s, v8.4s, v1.4s\n"
        "sqrdmulh        v8.4s, v9.4s, v1.4s\n"
        "sqrshl v3.4s, v3.4s, v2.4s\n"
        "sqrdmulh        v9.4s, v10.4s, v1.4s\n"
        "sqrshl v31.4s, v31.4s, v2.4s\n"
        "sqrshl v8.4s, v8.4s, v2.4s\n"
        "sqxtn  v3.4h, v3.4s\n"
        "sqrshl v9.4s, v9.4s, v2.4s\n"
        "sqxtn  v8.4h, v8.4s\n"
        "sqxtn2 v3.8h, v31.4s\n"
        "sqxtn2 v8.8h, v9.4s\n"
        "sqadd  v3.8h, v3.8h, v0.8h\n"
        "sqadd  v31.8h, v8.8h, v0.8h\n"
        "sqxtun v3.8b, v3.8h\n"
        "sqxtun2        v3.16b, v31.8h\n"
        "ldr    q4, [sp, #272]\n"  // 16-byte Folded Reload
        "add    x19, x7, x13\n"
        "ushr   v24.4s, v24.4s, #8\n"
        "ushr   v25.4s, v25.4s, #8\n"
        "umax   v3.16b, v3.16b, v4.16b\n"
        "ldr    q4, [sp, #256]\n"  // 16-byte Folded Reload
        "ushr   v14.4s, v14.4s, #8\n"
        "ushr   v23.4s, v23.4s, #8\n"
        "sli    v24.4s, v30.4s, #24\n"
        "umin   v3.16b, v3.16b, v4.16b\n"
        "str    s3, [%[scratch_block_data], x13]\n"
        "st1    { v3.s }[1], [x19]\n"
        "add    x19, x6, x13\n"
        "st1    { v3.s }[2], [x19]\n"
        "add    x19, %[output_block_data], x13\n"
        "ushr   v26.4s, v26.4s, #8\n"
        "ushr   v27.4s, v27.4s, #8\n"
        "sli    v25.4s, v11.4s, #24\n"
        "mov    v31.16b, v21.16b\n"
        "mov    v8.16b, v21.16b\n"
        "mov    v9.16b, v21.16b\n"
        "mov    v10.16b, v21.16b\n"
        "add    w2, w2, #1\n"  // =1
        "sli    v14.4s, v28.4s, #24\n"
        "ushr   v28.4s, v28.4s, #8\n"
        "ushr   v30.4s, v30.4s, #8\n"
        "sli    v23.4s, v29.4s, #24\n"
        "ushr   v29.4s, v29.4s, #8\n"
        "ushr   v11.4s, v11.4s, #8\n"
        "sli    v26.4s, v12.4s, #24\n"
        "ushr   v12.4s, v12.4s, #8\n"
        "sli    v27.4s, v13.4s, #24\n"
        "ushr   v13.4s, v13.4s, #8\n"
        "st1    { v3.s }[3], [x19]\n"
        ".word 0x4e98969f  // sdot   v31.4s, v20.16b, v24.16b\n"
        ".word 0x4e989668  // sdot   v8.4s, v19.16b, v24.16b\n"
        ".word 0x4e989649  // sdot   v9.4s, v18.16b, v24.16b\n"
        ".word 0x4e99964a  // sdot   v10.4s, v18.16b, v25.16b\n"
        "add    x13, x13, x16\n"
        DC_KERNEL_NO_MULT_11 ":\n"  // Parent Loop BB225_26 Depth=1
        // Parent Loop BB225_13 Depth=2
        // =>  This Inner Loop Header: Depth=3
        "cmp    w2, w5\n"
        "b.lt   " DC_KERNEL_NO_MULT_10 "b\n"
        DC_KERNEL_NO_MULT_12 ":\n"  // in Loop: Header=BB225_13 Depth=2
        // Switch to the second set of filter vectors (v7/v16/v17) and step
        // the pass counter, output offset (+4) and input offset (+16).
        "ldp    x19, %[scratch_block_data], [sp, #152]\n"  // 16-byte Folded Reload
        "ldr    %[output_block_data], [sp, #144]\n"  // 8-byte Folded Reload
        "mov    v20.16b, v17.16b\n"
        "mov    v19.16b, v16.16b\n"
        "add    %[scratch_block_data], x0, #1\n"  // =1
        "add    %[output_block_data], x3, #4\n"  // =4
        "add    x19, x19, #16\n"  // =16
        "mov    v18.16b, v7.16b\n"
        DC_KERNEL_NO_MULT_13 ":\n"  // Parent Loop BB225_26 Depth=1
        // =>  This Loop Header: Depth=2
        // Child Loop BB225_5 Depth 3
        // Child Loop BB225_11 Depth 3
        "cmp    %[scratch_block_data], #2\n"  // =2
        "b.ne   " DC_KERNEL_NO_MULT_3 "b\n"
        "b      " DC_KERNEL_NO_MULT_25 "f\n"
        DC_KERNEL_NO_MULT_14 ":\n"  // in Loop: Header=BB225_26 Depth=1
        // Row-at-a-time path: q21/q22 hold eight bias values; x2 becomes the
        // output row pointer and x7 the input row pointer.
        "ldp    q21, q22, [%[bias_data]]\n"
        "ldr    %[bias_data], [sp, #64]\n"  // 8-byte Folded Reload
        "ldr    x7, [sp, #128]\n"  // 8-byte Folded Reload
        "mov    w0, wzr\n"
        "b      " DC_KERNEL_NO_MULT_24 "f\n"
        DC_KERNEL_NO_MULT_15 ":\n"  // in Loop: Header=BB225_24 Depth=2
        "str    w0, [sp, #240]\n"  // 4-byte Folded Spill
        "ldr    %[scratch_block_data], [sp, #176]\n"  // 8-byte Folded Reload
        "add    %[output_block_data], x7, %[filter_workspace]\n"
        "ldp    q23, q24, [x7]\n"
        "ldp    q25, q26, [%[output_block_data]]\n"
        "add    %[scratch_block_data], x7, x0\n"
        "str    %[output_block_data], [sp, #208]\n"  // 8-byte Folded Spill
        "ldp    q27, q28, [%[scratch_block_data]]\n"
        "mov    w13, wzr\n"
        "mov    %[scratch_block_data], %[bias_data]\n"
        "str    %[bias_data], [sp, #224]\n"  // 8-byte Folded Spill
        "b      " DC_KERNEL_NO_MULT_22 "f\n"
        DC_KERNEL_NO_MULT_16 ":\n"  // in Loop: Header=BB225_22 Depth=3
        // w6 = number of outputs for this step: w5 when w13 == w14, else 4.
        "cmp    w13, w14\n"
        "orr    w2, wzr, #0x4\n"
        "csel   w6, w5, w2, eq\n"
        "add    %[output_block_data], x7, #32\n"  // =32
        "movi   v29.16b, #0\n"
        "movi   v30.16b, #0\n"
        "movi   v8.16b, #0\n"
        "movi   v31.16b, #0\n"
        "cmp    w6, #3\n"  // =3
        "movi   v9.16b, #0\n"
        "movi   v10.16b, #0\n"
        "b.lt   " DC_KERNEL_NO_MULT_18 "f\n"
        // %bb.17:        // in Loop: Header=BB225_22 Depth=3
        "ldr    %[bias_data], [sp, #176]\n"  // 8-byte Folded Reload
        "add    x19, %[output_block_data], %[filter_workspace]\n"
        "ldp    q29, q31, [x7, #32]\n"
        "ldp    q30, q9, [x19]\n"
        "add    %[bias_data], %[output_block_data], x2\n"
        "ldp    q8, q10, [%[bias_data]]\n"
        DC_KERNEL_NO_MULT_18 ":\n"  // in Loop: Header=BB225_22 Depth=3
        "mov    w7, wzr\n"
        "b      " DC_KERNEL_NO_MULT_20 "f\n"
        DC_KERNEL_NO_MULT_19 ":\n"  // in Loop: Header=BB225_20 Depth=4
        // Produce 8 output bytes (two accumulators, one per filter set).
        "mov    v3.16b, v21.16b\n"
        "mov    v11.16b, v22.16b\n"
        ".word 0x4e979643  // sdot   v3.4s, v18.16b, v23.16b\n"
        ".word 0x4e9894eb  // sdot   v11.4s, v7.16b, v24.16b\n"
        ".word 0x4e999663  // sdot   v3.4s, v19.16b, v25.16b\n"
        ".word 0x4e9a960b  // sdot   v11.4s, v16.16b, v26.16b\n"
        ".word 0x4e9b9683  // sdot   v3.4s, v20.16b, v27.16b\n"
        ".word 0x4e9c962b  // sdot   v11.4s, v17.16b, v28.16b\n"
        "sqrdmulh        v3.4s, v3.4s, v1.4s\n"
        "sqrdmulh        v11.4s, v11.4s, v1.4s\n"
        "sqrshl v3.4s, v3.4s, v2.4s\n"
        "sqrshl v11.4s, v11.4s, v2.4s\n"
        "sqxtn  v3.4h, v3.4s\n"
        "sqxtn2 v3.8h, v11.4s\n"
        "sqadd  v3.8h, v3.8h, v0.8h\n"
        "sqxtun v3.8b, v3.8h\n"
        "umax   v3.8b, v3.8b, v5.8b\n"
        "ushr   v23.4s, v23.4s, #8\n"
        "ushr   v24.4s, v24.4s, #8\n"
        "ushr   v25.4s, v25.4s, #8\n"
        "ushr   v26.4s, v26.4s, #8\n"
        "ushr   v27.4s, v27.4s, #8\n"
        "ushr   v28.4s, v28.4s, #8\n"
        "umin   v3.8b, v3.8b, v6.8b\n"
        "sli    v23.4s, v29.4s, #24\n"
        "ushr   v29.4s, v29.4s, #8\n"
        "sli    v24.4s, v31.4s, #24\n"
        "ushr   v31.4s, v31.4s, #8\n"
        "sli    v25.4s, v30.4s, #24\n"
        "ushr   v30.4s, v30.4s, #8\n"
        "sli    v26.4s, v9.4s, #24\n"
        "ushr   v9.4s, v9.4s, #8\n"
        "sli    v27.4s, v8.4s, #24\n"
        "ushr   v8.4s, v8.4s, #8\n"
        "sli    v28.4s, v10.4s, #24\n"
        "ushr   v10.4s, v10.4s, #8\n"
        "str    d3, [%[scratch_block_data]]\n"
        "add    %[scratch_block_data], x0, x16\n"
        "add    w7, w7, #1\n"  // =1
        DC_KERNEL_NO_MULT_20 ":\n"  // Parent Loop BB225_26 Depth=1
        // Parent Loop BB225_24 Depth=2
        // Parent Loop BB225_22 Depth=3
        // =>  This Inner Loop Header: Depth=4
        "cmp    w7, w6\n"
        "b.lt   " DC_KERNEL_NO_MULT_19 "b\n"
        // %bb.21:        // in Loop: Header=BB225_22 Depth=3
        "add    w13, w13, #1\n"  // =1
        "mov    x7, %[output_block_data]\n"
        DC_KERNEL_NO_MULT_22 ":\n"  // Parent Loop BB225_26 Depth=1
        // Parent Loop BB225_24 Depth=2
        // =>  This Loop Header: Depth=3
        // Child Loop BB225_20 Depth 4
        "ldr    w2, [sp, #292]\n"  // 4-byte Folded Reload
        "cmp    w13, w2\n"
        "b.lt   " DC_KERNEL_NO_MULT_16 "b\n"
        // %bb.23:        // in Loop: Header=BB225_24 Depth=2
        // Step to the next row: the output pointer advances by the value
        // spilled at sp+120, the input pointer is the one saved at sp+208.
        "ldr    x13, [sp, #120]\n"  // 8-byte Folded Reload
        "ldr    %[bias_data], [sp, #224]\n"  // 8-byte Folded Reload
        "ldr    w0, [sp, #240]\n"  // 4-byte Folded Reload
        "ldr    x7, [sp, #208]\n"  // 8-byte Folded Reload
        "add    %[bias_data], x2, x13\n"
        "add    w0, w0, #1\n"  // =1
        DC_KERNEL_NO_MULT_24 ":\n"  // Parent Loop BB225_26 Depth=1
        // =>  This Loop Header: Depth=2
        // Child Loop BB225_22 Depth 3
        // Child Loop BB225_20 Depth 4
        "ldr    w13, [sp, #288]\n"  // 4-byte Folded Reload
        "cmp    w0, w13\n"
        "b.lt   " DC_KERNEL_NO_MULT_15 "b\n"
        DC_KERNEL_NO_MULT_25 ":\n"  // in Loop: Header=BB225_26 Depth=1
        // End of one outer iteration: restore spilled state and advance the
        // input pointer/offset, bias pointer (+32), output pointer (+8) and
        // output offset (+8).
        "ldr    x13, [sp, #128]\n"  // 8-byte Folded Reload
        "ldr    %[scratch_block_data], [sp, #8]\n"  // 8-byte Folded Reload
        "ldp    x19, %[bias_data], [sp, #48]\n"  // 16-byte Folded Reload
        "ldr    w7, [sp, #36]\n"  // 4-byte Folded Reload
        "ldr    x6, [sp, #24]\n"  // 8-byte Folded Reload
        "add    x13, x13, %[scratch_block_data]\n"
        "str    x13, [sp, #128]\n"  // 8-byte Folded Spill
        "ldr    x13, [sp, #64]\n"  // 8-byte Folded Reload
        "add    %[bias_data], x2, #32\n"  // =32
        "add    w7, w7, #1\n"  // =1
        "add    x19, x19, %[scratch_block_data]\n"
        "add    x13, x13, #8\n"  // =8
        "str    x13, [sp, #64]\n"  // 8-byte Folded Spill
        "ldr    x13, [sp, #40]\n"  // 8-byte Folded Reload
        "add    x13, x13, #8\n"  // =8
        DC_KERNEL_NO_MULT_26 ":\n"  // =>This Loop Header: Depth=1
        // Child Loop BB225_24 Depth 2
        // Child Loop BB225_22 Depth 3
        // Child Loop BB225_20 Depth 4
        // Child Loop BB225_13 Depth 2
        // Child Loop BB225_5 Depth 3
        // Child Loop BB225_11 Depth 3
        "ldr    w0, [sp, #20]\n"  // 4-byte Folded Reload
        "cmp    w7, w0\n"
        "b.lt   " DC_KERNEL_NO_MULT_1 "b\n"
        // %bb.27:
        // Compiled intrinsics total stack 448, now 304 for spillage only.
        "add    sp, sp, #304\n"  // =448
        :
        // Outputs.
        [ scratch_block_data ] "+r"(scratch_block_data),
        [ filter_workspace ] "+r"(filter_workspace),
        [ bias_data ] "+r"(bias_data),
        [ output_block_data ] "+r"(output_block_data),
        // Declared read-write because the assembly overwrites this register
        // once the parameters have been loaded; input-only operands must not
        // be modified.
        [ function_params ] "+r"(function_params)
        :
        // Inputs (none: every operand register is written by the assembly).
        :
        // Clobbers.
        "cc", "memory",
        // We use these NEON registers.
        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
        "v31",
        // We use these general-purpose registers.
        "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
        "x16", "x17", "x19", "x21", "x22", "x23", "x24", "x25", "x26", "x27",
        "x28", "x29", "x30");
#endif  // __linux__
  }  // NOLINT(readability/fn_size) Manually unrolled.