in tensorflow/tensorflow/lite/experimental/ruy/pack_arm.cc [1008:1462]
// Packs 8-bit data read from four source pointers into `packed_ptr` using a
// single AArch64 NEON inline-assembly block. The dot-product instructions are
// emitted as raw `.word` encodings; their intended mnemonics (sdot) are given
// in the assembler comments inside the string literals.
//
// For every 16 bytes read from each of the four sources, the bytes are XORed
// with `input_xor` and transposed in 4-byte units, so that each stored 16-byte
// vector holds 4 consecutive bytes from source 0, then source 1, source 2 and
// source 3. The four resulting vectors are stored at byte offsets 0, 32, 64
// and 96 from `packed_ptr`, which then advances by 128. The 16 bytes following
// each stored vector are not written by this function.
// NOTE(review): presumably those gaps belong to four other columns packed by
// a separate call -- confirm against the caller.
//
// Parameters:
//   src_ptr0..src_ptr3  Source pointers, one per packed source.
//   src_inc0..src_inc3  Byte amount added to the matching source pointer after
//                       each full 16-byte load. The byte-wise tail loads
//                       advance the pointers by 1 instead.
//   src_rows            Number of bytes consumed from each source.
//   src_zero_point      Byte value used to pre-fill the vectors when
//                       `src_rows` is not a multiple of 16, so lanes that are
//                       not loaded hold this value.
//   packed_ptr          Destination buffer (see layout above).
//   start_col, end_col  Not referenced anywhere in this function body.
//   sums_ptr            If non-null, receives four 32-bit sums, one per
//                       source, of the XORed bytes (accumulated via sdot
//                       against a vector of ones). In the tail, a 4-byte group
//                       that is only partly loaded contributes its padding
//                       bytes to the sum as well; groups with no loaded byte
//                       are skipped.
//   input_xor           Byte value XORed into every source byte.
void Pack8bitNeonDotprodOutOfOrder(const void* src_ptr0, const void* src_ptr1,
const void* src_ptr2, const void* src_ptr3,
int src_inc0, int src_inc1, int src_inc2,
int src_inc3, int src_rows,
int src_zero_point, std::int8_t* packed_ptr,
int start_col, int end_col,
std::int32_t* sums_ptr, int input_xor) {
// Scoped profiling label object; lives until the end of this function.
gemmlowp::ScopedProfilingLabel label(
"Pack (kNeonDotprod, optimized for out-of-order cores)");
// Register roles inside the asm block:
//   w1        count of rows already loaded in 16-byte steps
//   w2        current loop bound, later the leftover row count (rows & 15)
//   v0-v15    raw source data (v0-v3 only outside the streaming section)
//   v16-v19   intermediate results of the 32-bit transpose step
//   v20-v23   transposed vectors that are stored to packed_ptr
//   v26       input_xor broadcast to all 16 byte lanes
//   v27       the byte value 1 broadcast to all 16 byte lanes
//   v28-v31   32-bit sum accumulators (v28<-v20, v29<-v22, v30<-v21, v31<-v23)
asm volatile(
// clang-format off
// Set up the constant vectors, zero the row counter and the accumulators.
"dup v26.16b, %w[input_xor]\n"
"mov w1, #1\n"
"dup v27.16b, w1\n"
"mov w1, #0\n"
"dup v28.4s, wzr\n"
"dup v29.4s, wzr\n"
"dup v30.4s, wzr\n"
"dup v31.4s, wzr\n"
#if RUY_OPT_ENABLED(RUY_OPT_MAX_STREAMING)
// Streaming section: handles rows in chunks of 64 (four 16-byte loads per
// source). w2 = src_rows rounded down to a multiple of 64; skip the whole
// section (label 9) when that is zero.
"and w2, %w[rows], #-64\n"
"cmp w1, w2\n"
"beq 9f\n"
// Preload the first 64 rows of each source into v0-v15.
"ld1 {v0.16b}, [%[src_ptr0]], %[src_inc0]\n"
"ld1 {v1.16b}, [%[src_ptr1]], %[src_inc1]\n"
"ld1 {v2.16b}, [%[src_ptr2]], %[src_inc2]\n"
"ld1 {v3.16b}, [%[src_ptr3]], %[src_inc3]\n"
"ld1 {v4.16b}, [%[src_ptr0]], %[src_inc0]\n"
"ld1 {v5.16b}, [%[src_ptr1]], %[src_inc1]\n"
"ld1 {v6.16b}, [%[src_ptr2]], %[src_inc2]\n"
"ld1 {v7.16b}, [%[src_ptr3]], %[src_inc3]\n"
"ld1 {v8.16b}, [%[src_ptr0]], %[src_inc0]\n"
"ld1 {v9.16b}, [%[src_ptr1]], %[src_inc1]\n"
"ld1 {v10.16b}, [%[src_ptr2]], %[src_inc2]\n"
"ld1 {v11.16b}, [%[src_ptr3]], %[src_inc3]\n"
"ld1 {v12.16b}, [%[src_ptr0]], %[src_inc0]\n"
"ld1 {v13.16b}, [%[src_ptr1]], %[src_inc1]\n"
"ld1 {v14.16b}, [%[src_ptr2]], %[src_inc2]\n"
"ld1 {v15.16b}, [%[src_ptr3]], %[src_inc3]\n"
"add w1, w1, #64\n"
// If those were the only 64-row chunk, go straight to the no-reload copy.
"cmp w1, w2\n"
"beq 8f\n"
// Label 7: streaming loop. Each of the four groups below processes one set of
// four source vectors (XOR, transpose, accumulate, store) and reloads that
// same set with the next data, so one iteration consumes and refills 64 rows.
"7:\n"
// Group 1 of 4: v0-v3.
"eor v0.16b, v0.16b, v26.16b\n"
"eor v1.16b, v1.16b, v26.16b\n"
"eor v2.16b, v2.16b, v26.16b\n"
"eor v3.16b, v3.16b, v26.16b\n"
// First transpose step, on 32-bit elements.
"trn1 v16.4s, v0.4s, v1.4s\n"
"trn2 v17.4s, v0.4s, v1.4s\n"
"trn1 v18.4s, v2.4s, v3.4s\n"
"trn2 v19.4s, v2.4s, v3.4s\n"
// The inputs have been consumed; reload them for the next iteration.
"ld1 {v0.16b}, [%[src_ptr0]], %[src_inc0]\n"
"ld1 {v1.16b}, [%[src_ptr1]], %[src_inc1]\n"
"ld1 {v2.16b}, [%[src_ptr2]], %[src_inc2]\n"
"ld1 {v3.16b}, [%[src_ptr3]], %[src_inc3]\n"
"add w1, w1, #16\n"
// Second transpose step, on 64-bit elements: v20/v21/v22/v23 now hold source
// bytes 0-3 / 4-7 / 8-11 / 12-15 of all four sources.
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
// Accumulate per-source byte sums (dot product with the all-ones vector).
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
// Store with a 32-byte stride, then advance the destination by 128 bytes.
"str q20, [%[packed_ptr], #0]\n"
"str q21, [%[packed_ptr], #32]\n"
"str q22, [%[packed_ptr], #64]\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
// Group 2 of 4: v4-v7, same sequence as group 1.
"eor v4.16b, v4.16b, v26.16b\n"
"eor v5.16b, v5.16b, v26.16b\n"
"eor v6.16b, v6.16b, v26.16b\n"
"eor v7.16b, v7.16b, v26.16b\n"
"trn1 v16.4s, v4.4s, v5.4s\n"
"trn2 v17.4s, v4.4s, v5.4s\n"
"trn1 v18.4s, v6.4s, v7.4s\n"
"trn2 v19.4s, v6.4s, v7.4s\n"
"ld1 {v4.16b}, [%[src_ptr0]], %[src_inc0]\n"
"ld1 {v5.16b}, [%[src_ptr1]], %[src_inc1]\n"
"ld1 {v6.16b}, [%[src_ptr2]], %[src_inc2]\n"
"ld1 {v7.16b}, [%[src_ptr3]], %[src_inc3]\n"
"add w1, w1, #16\n"
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
"str q20, [%[packed_ptr], #0]\n"
"str q21, [%[packed_ptr], #32]\n"
"str q22, [%[packed_ptr], #64]\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
// Group 3 of 4: v8-v11, same sequence as group 1.
"eor v8.16b, v8.16b, v26.16b\n"
"eor v9.16b, v9.16b, v26.16b\n"
"eor v10.16b, v10.16b, v26.16b\n"
"eor v11.16b, v11.16b, v26.16b\n"
"trn1 v16.4s, v8.4s, v9.4s\n"
"trn2 v17.4s, v8.4s, v9.4s\n"
"trn1 v18.4s, v10.4s, v11.4s\n"
"trn2 v19.4s, v10.4s, v11.4s\n"
"ld1 {v8.16b}, [%[src_ptr0]], %[src_inc0]\n"
"ld1 {v9.16b}, [%[src_ptr1]], %[src_inc1]\n"
"ld1 {v10.16b}, [%[src_ptr2]], %[src_inc2]\n"
"ld1 {v11.16b}, [%[src_ptr3]], %[src_inc3]\n"
"add w1, w1, #16\n"
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
"str q20, [%[packed_ptr], #0]\n"
"str q21, [%[packed_ptr], #32]\n"
"str q22, [%[packed_ptr], #64]\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
// Group 4 of 4: v12-v15, same sequence as group 1.
"eor v12.16b, v12.16b, v26.16b\n"
"eor v13.16b, v13.16b, v26.16b\n"
"eor v14.16b, v14.16b, v26.16b\n"
"eor v15.16b, v15.16b, v26.16b\n"
"trn1 v16.4s, v12.4s, v13.4s\n"
"trn2 v17.4s, v12.4s, v13.4s\n"
"trn1 v18.4s, v14.4s, v15.4s\n"
"trn2 v19.4s, v14.4s, v15.4s\n"
"ld1 {v12.16b}, [%[src_ptr0]], %[src_inc0]\n"
"ld1 {v13.16b}, [%[src_ptr1]], %[src_inc1]\n"
"ld1 {v14.16b}, [%[src_ptr2]], %[src_inc2]\n"
"ld1 {v15.16b}, [%[src_ptr3]], %[src_inc3]\n"
"add w1, w1, #16\n"
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
"str q20, [%[packed_ptr], #0]\n"
"str q21, [%[packed_ptr], #32]\n"
"str q22, [%[packed_ptr], #64]\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
// Keep looping until every 64-row chunk has been loaded.
"cmp w1, w2\n"
"bne 7b\n"
// Label 8: process the last 64 rows already held in v0-v15. Same four groups
// as the loop body, but without any reloads or counter updates.
"8:\n"
"eor v0.16b, v0.16b, v26.16b\n"
"eor v1.16b, v1.16b, v26.16b\n"
"eor v2.16b, v2.16b, v26.16b\n"
"eor v3.16b, v3.16b, v26.16b\n"
"trn1 v16.4s, v0.4s, v1.4s\n"
"trn2 v17.4s, v0.4s, v1.4s\n"
"trn1 v18.4s, v2.4s, v3.4s\n"
"trn2 v19.4s, v2.4s, v3.4s\n"
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
"str q20, [%[packed_ptr], #0]\n"
"str q21, [%[packed_ptr], #32]\n"
"str q22, [%[packed_ptr], #64]\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
"eor v4.16b, v4.16b, v26.16b\n"
"eor v5.16b, v5.16b, v26.16b\n"
"eor v6.16b, v6.16b, v26.16b\n"
"eor v7.16b, v7.16b, v26.16b\n"
"trn1 v16.4s, v4.4s, v5.4s\n"
"trn2 v17.4s, v4.4s, v5.4s\n"
"trn1 v18.4s, v6.4s, v7.4s\n"
"trn2 v19.4s, v6.4s, v7.4s\n"
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
"str q20, [%[packed_ptr], #0]\n"
"str q21, [%[packed_ptr], #32]\n"
"str q22, [%[packed_ptr], #64]\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
"eor v8.16b, v8.16b, v26.16b\n"
"eor v9.16b, v9.16b, v26.16b\n"
"eor v10.16b, v10.16b, v26.16b\n"
"eor v11.16b, v11.16b, v26.16b\n"
"trn1 v16.4s, v8.4s, v9.4s\n"
"trn2 v17.4s, v8.4s, v9.4s\n"
"trn1 v18.4s, v10.4s, v11.4s\n"
"trn2 v19.4s, v10.4s, v11.4s\n"
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
"str q20, [%[packed_ptr], #0]\n"
"str q21, [%[packed_ptr], #32]\n"
"str q22, [%[packed_ptr], #64]\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
"eor v12.16b, v12.16b, v26.16b\n"
"eor v13.16b, v13.16b, v26.16b\n"
"eor v14.16b, v14.16b, v26.16b\n"
"eor v15.16b, v15.16b, v26.16b\n"
"trn1 v16.4s, v12.4s, v13.4s\n"
"trn2 v17.4s, v12.4s, v13.4s\n"
"trn1 v18.4s, v14.4s, v15.4s\n"
"trn2 v19.4s, v14.4s, v15.4s\n"
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
"str q20, [%[packed_ptr], #0]\n"
"str q21, [%[packed_ptr], #32]\n"
"str q22, [%[packed_ptr], #64]\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
"9:\n"
#endif // #if RUY_OPT_ENABLED(RUY_OPT_MAX_STREAMING)
// 16-row section: w2 = src_rows rounded down to a multiple of 16. w1 carries
// over whatever the streaming section (if compiled in) already consumed.
// Skip to label 3 when no full 16-row chunk remains.
"and w2, %w[rows], #-16\n"
"cmp w1, w2\n"
"beq 3f\n"
// Preload one 16-row chunk from each source.
"ld1 {v0.16b}, [%[src_ptr0]], %[src_inc0]\n"
"ld1 {v1.16b}, [%[src_ptr1]], %[src_inc1]\n"
"ld1 {v2.16b}, [%[src_ptr2]], %[src_inc2]\n"
"ld1 {v3.16b}, [%[src_ptr3]], %[src_inc3]\n"
"add w1, w1, #16\n"
// If that was the last full chunk, go to the no-reload copy at label 2.
"cmp w1, w2\n"
"beq 2f\n"
// Label 1: loop processing the chunk held in v0-v3 while loading the next.
"1:\n"
"eor v0.16b, v0.16b, v26.16b\n"
"eor v1.16b, v1.16b, v26.16b\n"
"eor v2.16b, v2.16b, v26.16b\n"
"eor v3.16b, v3.16b, v26.16b\n"
"trn1 v16.4s, v0.4s, v1.4s\n"
"trn2 v17.4s, v0.4s, v1.4s\n"
"trn1 v18.4s, v2.4s, v3.4s\n"
"trn2 v19.4s, v2.4s, v3.4s\n"
"ld1 {v0.16b}, [%[src_ptr0]], %[src_inc0]\n"
"ld1 {v1.16b}, [%[src_ptr1]], %[src_inc1]\n"
"ld1 {v2.16b}, [%[src_ptr2]], %[src_inc2]\n"
"ld1 {v3.16b}, [%[src_ptr3]], %[src_inc3]\n"
"add w1, w1, #16\n"
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
"str q20, [%[packed_ptr], #0]\n"
"str q21, [%[packed_ptr], #32]\n"
"str q22, [%[packed_ptr], #64]\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
"cmp w1, w2\n"
"bne 1b\n"
// Label 2: process the last full 16-row chunk already held in v0-v3.
"2:\n"
"eor v0.16b, v0.16b, v26.16b\n"
"eor v1.16b, v1.16b, v26.16b\n"
"eor v2.16b, v2.16b, v26.16b\n"
"eor v3.16b, v3.16b, v26.16b\n"
"trn1 v16.4s, v0.4s, v1.4s\n"
"trn2 v17.4s, v0.4s, v1.4s\n"
"trn1 v18.4s, v2.4s, v3.4s\n"
"trn2 v19.4s, v2.4s, v3.4s\n"
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
"str q20, [%[packed_ptr], #0]\n"
"str q21, [%[packed_ptr], #32]\n"
"str q22, [%[packed_ptr], #64]\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
// Label 3: leftover rows. w2 = src_rows & 15; nothing to do when it is zero.
"3:\n"
"ands w2, %w[rows], #15\n"
"beq 4f\n"
// Pre-fill every lane with src_zero_point so lanes that are not loaded below
// keep that value.
"dup v0.16b, %w[src_zero_point]\n"
"dup v1.16b, %w[src_zero_point]\n"
"dup v2.16b, %w[src_zero_point]\n"
"dup v3.16b, %w[src_zero_point]\n"
// RUY_LOAD_ONE_ROW(R): if exactly R leftover rows exist, stop loading and jump
// to label 5; otherwise load byte lane R of v0-v3 from the four sources,
// advancing each source pointer by one byte. Since w2 is in [1, 15] here, one
// of the comparisons always matches before the lane-15 loads are reached.
#define RUY_LOAD_ONE_ROW(R) \
"cmp w2, #" #R "\n" \
"beq 5f\n" \
"ld1 { v0.b }[" #R "], [%[src_ptr0]], #1\n" \
"ld1 { v1.b }[" #R "], [%[src_ptr1]], #1\n" \
"ld1 { v2.b }[" #R "], [%[src_ptr2]], #1\n" \
"ld1 { v3.b }[" #R "], [%[src_ptr3]], #1\n"
RUY_LOAD_ONE_ROW(0)
RUY_LOAD_ONE_ROW(1)
RUY_LOAD_ONE_ROW(2)
RUY_LOAD_ONE_ROW(3)
RUY_LOAD_ONE_ROW(4)
RUY_LOAD_ONE_ROW(5)
RUY_LOAD_ONE_ROW(6)
RUY_LOAD_ONE_ROW(7)
RUY_LOAD_ONE_ROW(8)
RUY_LOAD_ONE_ROW(9)
RUY_LOAD_ONE_ROW(10)
RUY_LOAD_ONE_ROW(11)
RUY_LOAD_ONE_ROW(12)
RUY_LOAD_ONE_ROW(13)
RUY_LOAD_ONE_ROW(14)
RUY_LOAD_ONE_ROW(15)
#undef RUY_LOAD_ONE_ROW
// Label 5: XOR and transpose the partially filled vectors as before.
"5:\n"
"eor v0.16b, v0.16b, v26.16b\n"
"eor v1.16b, v1.16b, v26.16b\n"
"eor v2.16b, v2.16b, v26.16b\n"
"eor v3.16b, v3.16b, v26.16b\n"
"trn1 v16.4s, v0.4s, v1.4s\n"
"trn2 v17.4s, v0.4s, v1.4s\n"
"trn1 v18.4s, v2.4s, v3.4s\n"
"trn2 v19.4s, v2.4s, v3.4s\n"
"trn1 v20.2d, v16.2d, v18.2d\n"
"trn2 v22.2d, v16.2d, v18.2d\n"
"trn1 v21.2d, v17.2d, v19.2d\n"
"trn2 v23.2d, v17.2d, v19.2d\n"
// Accumulate and store only the 4-row groups that contain at least one loaded
// row: rows 0-3 always, rows 4-7 if w2 > 4, rows 8-11 if w2 > 8, rows 12-15
// if w2 > 12. packed_ptr is advanced only when all four groups are stored.
".word 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
"str q20, [%[packed_ptr], #0]\n"
"cmp w2, #4\n"
"ble 4f\n"
".word 0x4e9b96be // sdot v30.4s, v21.16b, v27.16b\n"
"str q21, [%[packed_ptr], #32]\n"
"cmp w2, #8\n"
"ble 4f\n"
".word 0x4e9b96dd // sdot v29.4s, v22.16b, v27.16b\n"
"str q22, [%[packed_ptr], #64]\n"
"cmp w2, #12\n"
"ble 4f\n"
".word 0x4e9b96ff // sdot v31.4s, v23.16b, v27.16b\n"
"str q23, [%[packed_ptr], #96]\n"
"add %[packed_ptr], %[packed_ptr], #128\n"
// Label 4: reduce the four accumulators into v28 (one 32-bit sum per source).
"4:\n"
"add v28.4s, v28.4s, v29.4s\n"
"add v30.4s, v30.4s, v31.4s\n"
"add v28.4s, v28.4s, v30.4s\n"
// Write the sums only when the caller supplied a non-null sums_ptr.
"cmp %[sums_ptr], #0\n"
"beq 6f\n"
"st1 {v28.4s}, [%[sums_ptr]], #16\n"
"6:\n"
// clang-format on
// Read-write operands: every pointer is post-incremented inside the asm.
: [ src_ptr0 ] "+r"(src_ptr0), [ src_ptr1 ] "+r"(src_ptr1),
[ src_ptr2 ] "+r"(src_ptr2), [ src_ptr3 ] "+r"(src_ptr3),
[ packed_ptr ] "+r"(packed_ptr), [ sums_ptr ] "+r"(sums_ptr)
// Read-only operands. The increments are widened to 64 bits because they are
// used as register post-index offsets on 64-bit addresses.
: [ src_inc0 ] "r"(static_cast<std::int64_t>(src_inc0)),
[ src_inc1 ] "r"(static_cast<std::int64_t>(src_inc1)),
[ src_inc2 ] "r"(static_cast<std::int64_t>(src_inc2)),
[ src_inc3 ] "r"(static_cast<std::int64_t>(src_inc3)),
[ rows ] "r"(src_rows),
[ src_zero_point ] "r"(static_cast<int>(src_zero_point)),
[ input_xor ] "r"(input_xor)
// Clobbers: condition flags, memory, the scratch registers x1/x2 (used as
// w1/w2 above) and all of v0-v31.
: "cc", "memory", "x1", "x2", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28", "v29", "v30", "v31");
}