void NOINLINE gemmkernel_6x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp)

in src/FbgemmFP16UKernelsAvx512.cc [819:1037]


void NOINLINE gemmkernel_6x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
  asm volatile(
#if !defined(__clang__) || __clang_major__ >= 14
      "mov r14, %[gp]\t\n"
#else
      "mov %[gp], %%r14\t\n"
      ".intel_syntax noprefix\t\n"
#endif

      // Copy parameters
      // k
      "mov r8, [r14 + 0]\t\n"
      "dec r8\t\n"
      // A
      "mov r9, [r14 + 8]\t\n"
      // B
      "mov r10, [r14 + 16]\t\n"
      // beta
      "lea r15, [r14 + 24]\t\n"
      // C
      "mov r12, [r14 + 32]\t\n"
      // ldc
      "mov r13, [r14 + 40]\t\n"
      // b_block_cols
      "mov rdi, [r14 + 48]\t\n"
      // b_block_size
      "mov rsi, [r14 + 56]\t\n"

      // Make copies of A and C
      "mov rax, r9\t\n"
      "mov rcx, r12\t\n"

      "xor ebx, ebx\t\n"
      "loop_outter%=:\t\n"
      "mov r14, r8\t\n"
      "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
      "vcvtph2ps zmm13,YMMWORD PTR [r10 + 0]\t\n"
      "vcvtph2ps zmm14,YMMWORD PTR [r10 + 32]\t\n"
      "vxorps xmm0, xmm0, xmm0\t\n"
      "vcomiss xmm31, xmm0\t\n"
      "jz zero_regs%=\t\n"

      // Setup values with beta multiplication
      "vmulps zmm0, zmm31, [r12 + 0]\t\n"
      "vmulps zmm1, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm2, zmm31, [r12 + 0]\t\n"
      "vmulps zmm3, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm4, zmm31, [r12 + 0]\t\n"
      "vmulps zmm5, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm6, zmm31, [r12 + 0]\t\n"
      "vmulps zmm7, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm8, zmm31, [r12 + 0]\t\n"
      "vmulps zmm9, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm10, zmm31, [r12 + 0]\t\n"
      "vmulps zmm11, zmm31, [r12 + 64]\t\n"
      "test r14,r14\t\n"
      "jz skip_preload%=\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "skip_preload%=:\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm13,zmm12\t\n"
      "vfmadd231ps zmm1,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm13,zmm12\t\n"
      "vfmadd231ps zmm3,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm13,zmm12\t\n"
      "vfmadd231ps zmm5,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm13,zmm12\t\n"
      "vfmadd231ps zmm7,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm13,zmm12\t\n"
      "vfmadd231ps zmm9,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm13,zmm12\t\n"
      "vfmadd231ps zmm11,zmm14,zmm12\t\n"
      "mov r12, rcx\t\n"
      "test r14,r14\t\n"
      "jnz next_inner%=\t\n"
      "add r10,64\t\n"
      "jmp dump_C%=\t\n"

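      // beta == 0 path: the first k step initializes the accumulators with vmulps,
      // so C is never read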
      "zero_regs%=:\t\n"

      "test r14,r14\t\n"
      "jz skip_preload_b_zero%=\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "skip_preload_b_zero%=:\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n"
      "vmulps zmm0,zmm13,zmm12\t\n"
      "vmulps zmm1,zmm14,zmm12\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n"
      "vmulps zmm2,zmm13,zmm12\t\n"
      "vmulps zmm3,zmm14,zmm12\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n"
      "vmulps zmm4,zmm13,zmm12\t\n"
      "vmulps zmm5,zmm14,zmm12\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n"
      "vmulps zmm6,zmm13,zmm12\t\n"
      "vmulps zmm7,zmm14,zmm12\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n"
      "vmulps zmm8,zmm13,zmm12\t\n"
      "vmulps zmm9,zmm14,zmm12\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n"
      "vmulps zmm10,zmm13,zmm12\t\n"
      "vmulps zmm11,zmm14,zmm12\t\n"
      "mov r12, rcx\t\n"
      "test r14,r14\t\n"
      "jnz next_inner%=\t\n"
      "add r10,64\t\n"
      "jmp dump_C%=\t\n"

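      // Steady-state k loop: zmm31 already holds this step's first B block,
      // preloaded by the previous step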
      "loop_inner%=:\t\n"

      "vmovaps zmm13,zmm31\t\n"
      "vcvtph2ps zmm14,YMMWORD PTR [r10 + 32]\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm13,zmm12\t\n"
      "vfmadd231ps zmm1,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm13,zmm12\t\n"
      "vfmadd231ps zmm3,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm13,zmm12\t\n"
      "vfmadd231ps zmm5,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm13,zmm12\t\n"
      "vfmadd231ps zmm7,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm13,zmm12\t\n"
      "vfmadd231ps zmm9,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm13,zmm12\t\n"
      "vfmadd231ps zmm11,zmm14,zmm12\t\n"

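      // Advance A by 6 floats (24 bytes) and B by 32 fp16 values (64 bytes) per k step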
      "next_inner%=:\t\n"
      "add r9,24\t\n"
      "add r10,64\t\n"
      "dec r14\t\n"
      "jnz loop_inner%=\t\n"

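      // Peeled final k step: uses the preloaded block in zmm31, no further preload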
      "vmovaps zmm13,zmm31\t\n"
      "vcvtph2ps zmm14,YMMWORD PTR [r10 + 32]\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm13,zmm12\t\n"
      "vfmadd231ps zmm1,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm13,zmm12\t\n"
      "vfmadd231ps zmm3,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm13,zmm12\t\n"
      "vfmadd231ps zmm5,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm13,zmm12\t\n"
      "vfmadd231ps zmm7,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm13,zmm12\t\n"
      "vfmadd231ps zmm9,zmm14,zmm12\t\n"
      "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm13,zmm12\t\n"
      "vfmadd231ps zmm11,zmm14,zmm12\t\n"
      "add r9,24\t\n"
      "add r10,64\t\n"
      // Dump C
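      // Store the 6x32 fp32 result tile; rows are ldc (r13) bytes apart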
      "dump_C%=:\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm11\t\n"

      // next outer iteration
      "add rcx, 128\t\n"
      "mov r12, rcx\t\n"
      "mov r9, rax\t\n"
      "inc rbx\t\n"
      "cmp rbx, rdi\t\n"
      "jl loop_outter%=\t\n"
      :
      : [gp] "rm"(gp)
      : "r8",
        "r9",
        "r10",
        "r11",
        "r13",
        "r14",
        "rax",
        "rcx",
        "rsi",
        "rdi",
        "rbx",
        "r12",
        "r15",
        "memory");
}
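
For readers who want to check the register choreography against plain C++, the scalar sketch below models the computation the kernel performs: one 6-row by 32-column fp32 tile of C = beta*C + A*B per outer iteration. It is an illustration inferred from the assembly, not code from FBGEMM: the name gemmkernel_6x2_reference is made up, B is shown as float even though the kernel reads fp16 and widens it with vcvtph2ps, ldc is expressed in floats here whereas the kernel's ldc field is a byte stride, and the contiguous B panel layout is inferred from r10 never being reset between outer iterations.

#include <cstdint>

// Scalar model of gemmkernel_6x2_Avx512_fp16_fA0fB0fC0 (illustrative only).
// A is packed with 6 row values per k step (the kernel reads [r9+0..20]);
// B is packed as b_block_cols contiguous panels of k x 32 values.
void gemmkernel_6x2_reference(
    uint64_t k,              // depth (gp->k)
    const float* A,          // packed A, 6 floats per k step
    const float* B,          // packed B (the real kernel reads fp16 here)
    float beta,              // gp->beta, broadcast into zmm31
    float* C,                // output tile rows, stride ldc
    uint64_t ldc,            // row stride of C, in floats (the kernel uses bytes)
    uint64_t b_block_cols) { // number of 32-column B panels (outer loop count)
  for (uint64_t bc = 0; bc < b_block_cols; ++bc) {
    for (int i = 0; i < 6; ++i) {      // 6 rows -> register pairs zmm0..zmm11
      for (int j = 0; j < 32; ++j) {   // 2 x 16 columns per row
        // beta == 0 takes the zero_regs path, so C is never read in that case
        float acc = (beta == 0.0f) ? 0.0f : beta * C[i * ldc + bc * 32 + j];
        for (uint64_t kk = 0; kk < k; ++kk) {
          acc += A[kk * 6 + i] * B[(bc * k + kk) * 32 + j];
        }
        C[i * ldc + bc * 32 + j] = acc;
      }
    }
  }
}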