static inline void KernelMacroBlockNeon()

in tensorflow/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h [8927:9280]


  // Hand-written AArch64 NEON kernel (inline assembly) that computes one macro
  // block of uint8 depthwise-convolution output using dot-product (sdot)
  // instructions.
  //
  // Arguments:
  //   scratch_block_data: int8 input data. Rows are addressed at multiples of
  //       the row stride held in x10 (the int32 field that follows
  //       DP_OFFSET_OUTPUT_HEIGHT_STRIDE in the params struct; presumably the
  //       workspace/input height stride -- confirm against the struct layout).
  //   filter_workspace:   int8 filter data; 96 bytes (six q-registers) are
  //       consumed per depth iteration.
  //   bias_data:          int32 bias values; 32 bytes (two q-registers) are
  //       consumed per depth iteration.
  //   output_block_data:  uint8 destination. Each depth iteration starts
  //       8 bytes further into the output and steps by the output depth (x13)
  //       per output position and by the output height stride (x9) per row.
  //   function_params:    parameter block, read at the DP_OFFSET_* byte
  //       offsets during the prologue only.
  //
  // Structure of the assembly:
  //   * Outer loop (labels 1/13): x8 counts depth iterations up to the value
  //     loaded from DP_OFFSET_DEPTH_MICRO_REPEATS (x12).
  //   * When the value at DP_OFFSET_OUTBOUND_BLOCK_HEIGHT (w14) equals 2:
  //       - loop 3/4 runs x6 times and emits two rows by two output positions
  //         (8 channels each) per iteration;
  //       - loop 5/6 then runs until x21 reaches x11 and emits two rows by one
  //         output position per iteration.
  //   * Otherwise loop 8/11 emits a single row, one or two positions per
  //     iteration.
  //   * Every accumulator is requantized the same way: sqrdmulh by the output
  //     multiplier (v1), sqrshl by the output shift (v2), saturating narrow
  //     to 16 bits, sqadd of the output offset (v0), sqxtun to uint8, then
  //     umax/umin against the activation min (v3) and max (v4).
  //
  // The sdot instructions are emitted as raw ".word" encodings; the intended
  // mnemonic is given in the comment inside each string. (Presumably this
  // lets assemblers without dot-product support build the file.)
  //
  // All five pointer arguments are declared as read-write asm operands
  // because the assembly modifies the registers that hold them.
  static inline void KernelMacroBlockNeon(
      const int8* scratch_block_data, const int8* filter_workspace,
      const int32* bias_data, uint8* output_block_data,
      const DepthwiseConvDotProdParams* function_params) {
    // Note that argument registers may be reused after parameter loading.
    // x0 %[scratch_block_data]
    // x1 %[filter_workspace]
    // x2 %[bias_data]
    // x3 %[output_block_data]
    // x4 %[function_params]
    //
    // Numeric local labels used by the assembly below. They are #undef'ed
    // again right after the asm statement.
#define DC_KERNEL_MULT_STRIDE_1 "1"
#define DC_KERNEL_MULT_STRIDE_2 "2"
#define DC_KERNEL_MULT_STRIDE_3 "3"
#define DC_KERNEL_MULT_STRIDE_4 "4"
#define DC_KERNEL_MULT_STRIDE_5 "5"
#define DC_KERNEL_MULT_STRIDE_6 "6"
#define DC_KERNEL_MULT_STRIDE_7 "7"
#define DC_KERNEL_MULT_STRIDE_8 "8"
#define DC_KERNEL_MULT_STRIDE_9 "9"
#define DC_KERNEL_MULT_STRIDE_10 "10"
#define DC_KERNEL_MULT_STRIDE_11 "11"
#define DC_KERNEL_MULT_STRIDE_12 "12"
#define DC_KERNEL_MULT_STRIDE_13 "13"

    asm volatile(
        // Prologue: load scalar parameters and broadcast the quantization
        // constants into v0-v4.
        "ldr    w15, [%[function_params], #" STR(DP_OFFSET_OUTPUT_RESIDUAL_WIDTH) "]\n"
        "ldp    w11, w6, [%[function_params], #" STR(DP_OFFSET_OUTPUT_WIDTH_OVERALL_MICRO_REPEATS) "]\n"
        "ldpsw  x9, x10, [%[function_params], #" STR(DP_OFFSET_OUTPUT_HEIGHT_STRIDE) "]\n"
        "ldrsw  x12, [%[function_params], #" STR(DP_OFFSET_DEPTH_MICRO_REPEATS) "]\n"
        "ldrsw  x13, [%[function_params], #" STR(DP_OFFSET_OUTPUT_DEPTH) "]\n"
        "ldr    w14, [%[function_params], #" STR(DP_OFFSET_OUTBOUND_BLOCK_HEIGHT) "]\n"
        "add    x17, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MIN) "\n"  // =40
        "add    x5, %[function_params], #" STR(DP_OFFSET_QUANTIZED_ACTIVATION_MAX) "\n"  // =44
        "add    x7, %[function_params], #" STR(DP_OFFSET_OUTPUT_MULTIPLIER) "\n"  // =32
        "add    x19, %[function_params], #" STR(DP_OFFSET_OUTPUT_SHIFT) "\n"  // =36
        // From here on the %[function_params] register no longer holds the
        // struct pointer: it first addresses the output offset field ...
        "add    %[function_params], %[function_params], #" STR(DP_OFFSET_OUTPUT_OFFSET) "\n"  // =28
        "sxtw   x11, w11\n"
        "ld1r   { v0.8h }, [%[function_params]]\n"
        "ld1r   { v1.4s }, [x7]\n"
        "ld1r   { v2.4s }, [x19]\n"
        "ld1r   { v3.8b }, [x17]\n"
        "ld1r   { v4.8b }, [x5]\n"
        // x6 = (w15 < 2 && w6 < w11) ? w6 : w11, the trip count of loop 3/4.
        // x5 keeps 4 * (original w6) for the residual test in loop 8.
        "cmp    w15, #2\n"  // =2
        "ccmp   w6, w11, #0, lt\n"
        "lsl    x5, x6, #2\n"
        "csel   w6, w6, w11, lt\n"
        "mov    x8, xzr\n"
        "add    x16, %[scratch_block_data], #4\n"  // =4
        "lsl    x17, x10, #1\n"
        // ... and is then reused as a scratch register holding 3 * x10.
        "add    %[function_params], x10, x10, lsl #1\n"
        "sxtw   x6, w6\n"
        "add    x7, x9, x13\n"
        "b      " DC_KERNEL_MULT_STRIDE_13 "f\n"
        // Outer loop body: one depth iteration. Loads filters (v5-v7,
        // v16-v18), biases (v19, v20) and the first input words of each row.
        DC_KERNEL_MULT_STRIDE_1 ":\n"  // in Loop: Header=BB206_13 Depth=1
        "ldr    w20, [%[scratch_block_data]]\n"
        "add    x21, %[scratch_block_data], x10\n"
        "ldp    q5, q6, [%[filter_workspace]]\n"
        "ldp    q7, q16, [%[filter_workspace], #32]\n"
        "fmov   s21, w20\n"
        "mov    v21.s[1], w20\n"
        "ld1    { v21.s }[2], [x21]\n"
        "ldp    q17, q18, [%[filter_workspace], #64]\n"
        "ldp    q19, q20, [%[bias_data]], #32\n"
        "ldr    s22, [%[scratch_block_data], x17]\n"
        "ubfiz  x19, x8, #3, #29\n"
        "add    %[filter_workspace], %[filter_workspace], #96\n"  // =96
        "add    x19, %[output_block_data], x19\n"
        "cmp    w14, #2\n"  // =2
        "mov    v21.s[3], w20\n"
        "mov    x20, xzr\n"
        "b.ne   " DC_KERNEL_MULT_STRIDE_7 "f\n"
        // Two-output-row path: additionally load rows at 3 * x10 and 4 * x10.
        // %bb.2:        // in Loop: Header=BB206_13 Depth=1
        "dup    v22.4s, v22.s[0]\n"
        "add    x21, %[scratch_block_data], %[function_params]\n"
        "add    x22, %[scratch_block_data], x10, lsl #2\n"
        "ld1    { v22.s }[2], [x21]\n"
        "ld1r   { v23.4s }, [x22]\n"
        "mov    x21, xzr\n"
        "b      " DC_KERNEL_MULT_STRIDE_4 "f\n"
        // Loop 3: two rows, two output positions per iteration. The second
        // position uses the input registers shifted right by 16 bits (v29).
        DC_KERNEL_MULT_STRIDE_3 ":\n"  // in Loop: Header=BB206_4 Depth=2
        "and    x22, x20, #0xfffffffc\n"
        "add    x23, x16, x22\n"
        "lsl    x24, x10, #2\n"
        "mov    x22, x23\n"
        "ld1    { v21.s }[1], [x22], x24\n"
        "add    x24, x23, x17\n"
        "ld1    { v22.s }[1], [x24]\n"
        "add    x24, x23, x10\n"
        "ld1    { v21.s }[3], [x24]\n"
        "add    x23, x23, %[function_params]\n"
        "ld1    { v22.s }[3], [x23]\n"
        "mov    v25.16b, v19.16b\n"
        "mov    v27.16b, v20.16b\n"
        "ld1    { v23.s }[1], [x22]\n"
        "ushr   v29.2d, v21.2d, #16\n"
        ".word 0x4f9de0b9  // sdot   v25.4s, v5.16b, v29.4b[0]\n"
        ".word 0x4f9de0db  // sdot   v27.4s, v6.16b, v29.4b[0]\n"
        "mov    v26.16b, v19.16b\n"
        "mov    v28.16b, v20.16b\n"
        ".word 0x4f9de8f9  // sdot   v25.4s, v7.16b, v29.4b[2]\n"
        ".word 0x4f9dea1b  // sdot   v27.4s, v16.16b, v29.4b[2]\n"
        "ushr   v29.2d, v22.2d, #16\n"
        ".word 0x4f9de0ba  // sdot   v26.4s, v5.16b, v29.4b[0]\n"
        ".word 0x4f9de0dc  // sdot   v28.4s, v6.16b, v29.4b[0]\n"
        "mov    v24.16b, v19.16b\n"
        ".word 0x4f9de8fa  // sdot   v26.4s, v7.16b, v29.4b[2]\n"
        ".word 0x4f9dea1c  // sdot   v28.4s, v16.16b, v29.4b[2]\n"
        ".word 0x4f9de239  // sdot   v25.4s, v17.16b, v29.4b[0]\n"
        ".word 0x4f9de25b  // sdot   v27.4s, v18.16b, v29.4b[0]\n"
        "ushr   v29.2d, v23.2d, #16\n"
        ".word 0x4f9de23a  // sdot   v26.4s, v17.16b, v29.4b[0]\n"
        ".word 0x4f9de25c  // sdot   v28.4s, v18.16b, v29.4b[0]\n"
        "mov    v29.16b, v19.16b\n"
        ".word 0x4f95e0b8  // sdot   v24.4s, v5.16b, v21.4b[0]\n"
        ".word 0x4f96e0bd  // sdot   v29.4s, v5.16b, v22.4b[0]\n"
        ".word 0x4f95e8f8  // sdot   v24.4s, v7.16b, v21.4b[2]\n"
        ".word 0x4f96e8fd  // sdot   v29.4s, v7.16b, v22.4b[2]\n"
        ".word 0x4f96e238  // sdot   v24.4s, v17.16b, v22.4b[0]\n"
        ".word 0x4f97e23d  // sdot   v29.4s, v17.16b, v23.4b[0]\n"
        "sqrdmulh        v24.4s, v24.4s, v1.4s\n"
        "sqrdmulh        v29.4s, v29.4s, v1.4s\n"
        "sqrshl v24.4s, v24.4s, v2.4s\n"
        "sqrshl v29.4s, v29.4s, v2.4s\n"
        "sqxtn  v24.4h, v24.4s\n"
        "sqxtn2 v24.8h, v29.4s\n"
        "sqadd  v24.8h, v24.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "umax   v24.8b, v24.8b, v3.8b\n"
        "add    x22, x19, x9\n"
        "mov    v29.16b, v20.16b\n"
        "umin   v24.8b, v24.8b, v4.8b\n"
        "str    s24, [x19]\n"
        "st1    { v24.s }[1], [x22]\n"
        "mov    v24.16b, v20.16b\n"
        ".word 0x4f95e0dd  // sdot   v29.4s, v6.16b, v21.4b[0]\n"
        ".word 0x4f96e0d8  // sdot   v24.4s, v6.16b, v22.4b[0]\n"
        ".word 0x4f95ea1d  // sdot   v29.4s, v16.16b, v21.4b[2]\n"
        ".word 0x4f96ea18  // sdot   v24.4s, v16.16b, v22.4b[2]\n"
        ".word 0x4f96e25d  // sdot   v29.4s, v18.16b, v22.4b[0]\n"
        ".word 0x4f97e258  // sdot   v24.4s, v18.16b, v23.4b[0]\n"
        "sqrdmulh        v29.4s, v29.4s, v1.4s\n"
        "sqrdmulh        v24.4s, v24.4s, v1.4s\n"
        "sqrshl v29.4s, v29.4s, v2.4s\n"
        "sqrshl v24.4s, v24.4s, v2.4s\n"
        "sqxtn  v29.4h, v29.4s\n"
        "sqxtn2 v29.8h, v24.4s\n"
        "sqadd  v24.8h, v29.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "sqrdmulh        v25.4s, v25.4s, v1.4s\n"
        "umax   v24.8b, v24.8b, v3.8b\n"
        "sqrdmulh        v26.4s, v26.4s, v1.4s\n"
        "sqrshl v25.4s, v25.4s, v2.4s\n"
        "add    x22, x22, #4\n"  // =4
        "umin   v24.8b, v24.8b, v4.8b\n"
        "sqrshl v26.4s, v26.4s, v2.4s\n"
        "sqxtn  v25.4h, v25.4s\n"
        "str    s24, [x19, #4]\n"
        "st1    { v24.s }[1], [x22]\n"
        "sqxtn2 v25.8h, v26.4s\n"
        "sqadd  v24.8h, v25.8h, v0.8h\n"
        "sqrdmulh        v27.4s, v27.4s, v1.4s\n"
        "sqxtun v24.8b, v24.8h\n"
        "sqrdmulh        v28.4s, v28.4s, v1.4s\n"
        "sqrshl v27.4s, v27.4s, v2.4s\n"
        "umax   v24.8b, v24.8b, v3.8b\n"
        "add    x23, x19, x13\n"
        "add    x24, x19, x7\n"
        "sqrshl v28.4s, v28.4s, v2.4s\n"
        "sqxtn  v27.4h, v27.4s\n"
        "umin   v24.8b, v24.8b, v4.8b\n"
        "str    s24, [x23]\n"
        "st1    { v24.s }[1], [x24]\n"
        "sqxtn2 v27.8h, v28.4s\n"
        "sqadd  v24.8h, v27.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "umax   v24.8b, v24.8b, v3.8b\n"
        "add    x25, x24, #4\n"  // =4
        "umin   v24.8b, v24.8b, v4.8b\n"
        "add    x21, x21, #1\n"  // =1
        // Two positions were consumed, so advance the inputs by 32 bits.
        "ushr   v21.2d, v21.2d, #32\n"
        "ushr   v22.2d, v22.2d, #32\n"
        "ushr   v23.2d, v23.2d, #32\n"
        "add    x19, x23, x13\n"
        "str    s24, [x23, #4]\n"
        "st1    { v24.s }[1], [x25]\n"
        "add    x20, x20, #4\n"  // =4
        DC_KERNEL_MULT_STRIDE_4 ":\n"  // Parent Loop BB206_13 Depth=1
        // =>  This Inner Loop Header: Depth=2
        "cmp    x21, x6\n"
        "b.lt   " DC_KERNEL_MULT_STRIDE_3 "b\n"
        "b      " DC_KERNEL_MULT_STRIDE_6 "f\n"
        // Loop 5: two rows, one output position per iteration; runs for the
        // iterations between x6 and x11.
        DC_KERNEL_MULT_STRIDE_5 ":\n"  // in Loop: Header=BB206_6 Depth=2
        "and    x22, x20, #0xfffffffc\n"
        "add    x22, x16, x22\n"
        "lsl    x23, x10, #2\n"
        "mov    x25, x22\n"
        "add    x24, x22, x17\n"
        "ld1    { v21.s }[1], [x25], x23\n"
        "ld1    { v22.s }[1], [x24]\n"
        "add    x23, x22, x10\n"
        "add    x22, x22, %[function_params]\n"
        "ld1    { v21.s }[3], [x23]\n"
        "ld1    { v22.s }[3], [x22]\n"
        "mov    v24.16b, v19.16b\n"
        "ld1    { v23.s }[1], [x25]\n"
        "mov    v25.16b, v19.16b\n"
        ".word 0x4f95e0b8  // sdot   v24.4s, v5.16b, v21.4b[0]\n"
        ".word 0x4f96e0b9  // sdot   v25.4s, v5.16b, v22.4b[0]\n"
        ".word 0x4f95e8f8  // sdot   v24.4s, v7.16b, v21.4b[2]\n"
        ".word 0x4f96e8f9  // sdot   v25.4s, v7.16b, v22.4b[2]\n"
        ".word 0x4f96e238  // sdot   v24.4s, v17.16b, v22.4b[0]\n"
        ".word 0x4f97e239  // sdot   v25.4s, v17.16b, v23.4b[0]\n"
        "sqrdmulh        v24.4s, v24.4s, v1.4s\n"
        "sqrdmulh        v25.4s, v25.4s, v1.4s\n"
        "sqrshl v24.4s, v24.4s, v2.4s\n"
        "sqrshl v25.4s, v25.4s, v2.4s\n"
        "sqxtn  v24.4h, v24.4s\n"
        "sqxtn2 v24.8h, v25.4s\n"
        "sqadd  v24.8h, v24.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "umax   v24.8b, v24.8b, v3.8b\n"
        "add    x22, x19, x9\n"
        "mov    v25.16b, v20.16b\n"
        "umin   v24.8b, v24.8b, v4.8b\n"
        "str    s24, [x19]\n"
        "st1    { v24.s }[1], [x22]\n"
        "mov    v24.16b, v20.16b\n"
        ".word 0x4f95e0d9  // sdot   v25.4s, v6.16b, v21.4b[0]\n"
        ".word 0x4f96e0d8  // sdot   v24.4s, v6.16b, v22.4b[0]\n"
        ".word 0x4f95ea19  // sdot   v25.4s, v16.16b, v21.4b[2]\n"
        ".word 0x4f96ea18  // sdot   v24.4s, v16.16b, v22.4b[2]\n"
        ".word 0x4f96e259  // sdot   v25.4s, v18.16b, v22.4b[0]\n"
        ".word 0x4f97e258  // sdot   v24.4s, v18.16b, v23.4b[0]\n"
        "sqrdmulh        v25.4s, v25.4s, v1.4s\n"
        "sqrdmulh        v24.4s, v24.4s, v1.4s\n"
        "sqrshl v25.4s, v25.4s, v2.4s\n"
        "sqrshl v24.4s, v24.4s, v2.4s\n"
        "sqxtn  v25.4h, v25.4s\n"
        "sqxtn2 v25.8h, v24.4s\n"
        "sqadd  v24.8h, v25.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "umax   v24.8b, v24.8b, v3.8b\n"
        "add    x22, x22, #4\n"  // =4
        "umin   v24.8b, v24.8b, v4.8b\n"
        "add    x21, x21, #1\n"  // =1
        // One position was consumed, so advance the inputs by 16 bits.
        "ushr   v21.2d, v21.2d, #16\n"
        "ushr   v22.2d, v22.2d, #16\n"
        "ushr   v23.2d, v23.2d, #16\n"
        "str    s24, [x19, #4]\n"
        "st1    { v24.s }[1], [x22]\n"
        "add    x19, x19, x13\n"
        "add    x20, x20, #4\n"  // =4
        DC_KERNEL_MULT_STRIDE_6 ":\n"  // Parent Loop BB206_13 Depth=1
        // =>  This Inner Loop Header: Depth=2
        "cmp    x21, x11\n"
        "b.lt   " DC_KERNEL_MULT_STRIDE_5 "b\n"
        "b      " DC_KERNEL_MULT_STRIDE_12 "f\n"
        // Single-output-row path (taken when w14 != 2).
        DC_KERNEL_MULT_STRIDE_7 ":\n"  // in Loop: Header=BB206_13 Depth=1
        "mov    x21, xzr\n"
        "dup    v22.4s, v22.s[0]\n"
        "b      " DC_KERNEL_MULT_STRIDE_11 "f\n"
        // Loop 8: one row, one or two output positions per iteration.
        DC_KERNEL_MULT_STRIDE_8 ":\n"  // in Loop: Header=BB206_11 Depth=2
        "and    x22, x20, #0xfffffffc\n"
        "add    x22, x16, x22\n"
        "mov    x23, x22\n"
        "ld1    { v21.s }[1], [x23], x17\n"
        "add    x22, x22, x10\n"
        "mov    v23.16b, v19.16b\n"
        "mov    v24.16b, v20.16b\n"
        "ld1    { v22.s }[1], [x23]\n"
        "ld1    { v21.s }[3], [x22]\n"
        // Flags for the b.eq below: Z is set only when w15 != 2 and
        // x5 == x20, in which case the second output position is skipped.
        // No instruction between here and the branch modifies NZCV.
        "cmp    w15, #2\n"  // =2
        "ccmp   x5, x20, #0, ne\n"
        ".word 0x4f96e237  // sdot   v23.4s, v17.16b, v22.4b[0]\n"
        ".word 0x4f96e258  // sdot   v24.4s, v18.16b, v22.4b[0]\n"
        ".word 0x4f95e0b7  // sdot   v23.4s, v5.16b, v21.4b[0]\n"
        ".word 0x4f95e0d8  // sdot   v24.4s, v6.16b, v21.4b[0]\n"
        ".word 0x4f95e8f7  // sdot   v23.4s, v7.16b, v21.4b[2]\n"
        ".word 0x4f95ea18  // sdot   v24.4s, v16.16b, v21.4b[2]\n"
        "sqrdmulh        v23.4s, v23.4s, v1.4s\n"
        "sqrdmulh        v24.4s, v24.4s, v1.4s\n"
        "sqrshl v23.4s, v23.4s, v2.4s\n"
        "sqrshl v24.4s, v24.4s, v2.4s\n"
        "sqxtn  v25.4h, v23.4s\n"
        "sqxtn2 v25.8h, v24.4s\n"
        "sqadd  v24.8h, v25.8h, v0.8h\n"
        "sqxtun v24.8b, v24.8h\n"
        "umax   v24.8b, v24.8b, v3.8b\n"
        "umin   v24.8b, v24.8b, v4.8b\n"
        "ushr   v23.2d, v21.2d, #16\n"
        "str    d24, [x19]\n"
        "ushr   v24.2d, v22.2d, #16\n"
        "add    x19, x19, x13\n"
        "b.eq   " DC_KERNEL_MULT_STRIDE_10 "f\n"
        // Second output position of this iteration.
        // %bb.9:        // in Loop: Header=BB206_11 Depth=2
        "mov    v25.16b, v19.16b\n"
        "mov    v26.16b, v20.16b\n"
        ".word 0x4f98e239  // sdot   v25.4s, v17.16b, v24.4b[0]\n"
        ".word 0x4f98e25a  // sdot   v26.4s, v18.16b, v24.4b[0]\n"
        ".word 0x4f97e0b9  // sdot   v25.4s, v5.16b, v23.4b[0]\n"
        ".word 0x4f97e0da  // sdot   v26.4s, v6.16b, v23.4b[0]\n"
        ".word 0x4f97e8f9  // sdot   v25.4s, v7.16b, v23.4b[2]\n"
        ".word 0x4f97ea1a  // sdot   v26.4s, v16.16b, v23.4b[2]\n"
        "ushr   v23.2d, v21.2d, #32\n"
        "sqrdmulh        v21.4s, v25.4s, v1.4s\n"
        "ushr   v24.2d, v22.2d, #32\n"
        "sqrdmulh        v22.4s, v26.4s, v1.4s\n"
        "sqrshl v21.4s, v21.4s, v2.4s\n"
        "sqrshl v22.4s, v22.4s, v2.4s\n"
        "sqxtn  v21.4h, v21.4s\n"
        "sqxtn2 v21.8h, v22.4s\n"
        "sqadd  v21.8h, v21.8h, v0.8h\n"
        "sqxtun v21.8b, v21.8h\n"
        "umax   v21.8b, v21.8b, v3.8b\n"
        "umin   v21.8b, v21.8b, v4.8b\n"
        "str    d21, [x19]\n"
        "add    x19, x19, x13\n"
        // Commit the shifted inputs (v23/v24) back into v21/v22.
        DC_KERNEL_MULT_STRIDE_10 ":\n"  // in Loop: Header=BB206_11 Depth=2
        "add    x21, x21, #1\n"  // =1
        "add    x20, x20, #4\n"  // =4
        "mov    v22.16b, v24.16b\n"
        "mov    v21.16b, v23.16b\n"
        DC_KERNEL_MULT_STRIDE_11 ":\n"  // Parent Loop BB206_13 Depth=1
        // =>  This Inner Loop Header: Depth=2
        "cmp    x21, x11\n"
        "b.lt   " DC_KERNEL_MULT_STRIDE_8 "b\n"
        DC_KERNEL_MULT_STRIDE_12 ":\n"  // in Loop: Header=BB206_13 Depth=1
        "add    x8, x8, #1\n"  // =1
        DC_KERNEL_MULT_STRIDE_13 ":\n"  // =>This Loop Header: Depth=1
        // Child Loop BB206_11 Depth 2
        // Child Loop BB206_4 Depth 2
        // Child Loop BB206_6 Depth 2
        "cmp    x8, x12\n"
        "b.lt   " DC_KERNEL_MULT_STRIDE_1 "b\n"
        :
        // Outputs (read-write: the assembly modifies each of these
        // registers).
        [ scratch_block_data ] "+r"(scratch_block_data),
        [ filter_workspace ] "+r"(filter_workspace),
        [ bias_data ] "+r"(bias_data),
        [ output_block_data ] "+r"(output_block_data),
        // The %[function_params] register is overwritten during the
        // prologue, so it must not be declared as an input-only operand.
        [ function_params ] "+r"(function_params)
        :
        // Inputs: none. Every operand is read-write and listed above.
        :
        // Clobbers.
        "cc", "memory",
        // We use these NEON registers.
        "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
        "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20",
        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
        // We use these general-purpose registers.
        "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
        "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25");

    // Keep the label macros local to this function so they do not leak into
    // code that includes this header.
#undef DC_KERNEL_MULT_STRIDE_1
#undef DC_KERNEL_MULT_STRIDE_2
#undef DC_KERNEL_MULT_STRIDE_3
#undef DC_KERNEL_MULT_STRIDE_4
#undef DC_KERNEL_MULT_STRIDE_5
#undef DC_KERNEL_MULT_STRIDE_6
#undef DC_KERNEL_MULT_STRIDE_7
#undef DC_KERNEL_MULT_STRIDE_8
#undef DC_KERNEL_MULT_STRIDE_9
#undef DC_KERNEL_MULT_STRIDE_10
#undef DC_KERNEL_MULT_STRIDE_11
#undef DC_KERNEL_MULT_STRIDE_12
#undef DC_KERNEL_MULT_STRIDE_13
  }