in src/FbgemmFP16UKernelsAvx512.cc [2418:2750]
// AVX-512 micro-kernel: for every column block, updates a C tile of 12 rows
// by 32 fp32 columns (two zmm registers per row) as
//     C[row, :] = beta * C[row, :] + sum over k of A_k[row] * B_k[:]
// where B is stored as fp16 and widened to fp32 on load (vcvtph2ps).
//
// Fields read from *gp (byte offsets; names taken from the comments below):
//   +0  k              number of k-steps (reduction length)
//   +8  A              fp32; each k-step reads 12 consecutive floats (48 bytes)
//   +16 B              fp16; each k-step reads 2 x 32 bytes (2 x 16 halves)
//   +24 beta           fp32 scalar, read through its address
//   +32 C              fp32 output tile
//   +40 ldc            added directly to the C pointer, i.e. used as a byte
//                      stride between consecutive C rows
//   +48 b_block_cols   number of outer (column block) iterations
//   +56 b_block_size   loaded into rsi but not referenced by this kernel
//
// Register roles:
//   r8  = k - 1                      r14 = remaining k-steps (inner counter)
//   r9  = A cursor (reset per block) rax = saved A base
//   r10 = B cursor (never reset; B is consumed sequentially across blocks)
//   r12 = C row cursor               rcx = C base of the current block
//   r13 = ldc                        r15 = &beta
//   rbx = outer counter              rdi = b_block_cols
//   zmm0..zmm23 = accumulators (row i uses zmm(2i), zmm(2i+1))
//   zmm24 = broadcast A element, zmm25/zmm26 = current B row (two halves)
//   zmm31 = beta at block start, afterwards the preloaded first half of the
//           next B row
//
// Structure: the first k-step is peeled (it either scales C by beta or, when
// beta compares equal to zero, initializes the accumulators without reading
// C), the middle k-steps run in loop_inner, and the last k-step is peeled so
// that no preload beyond the final B row is issued.
//
// Preconditions visible from the code:
//   - k must be >= 1: with k == 0, "dec r8" wraps and the inner counter
//     starts at -1.
//   - The outer loop is do-while shaped with a signed compare, so one column
//     block is always processed, even if b_block_cols <= 0.
void NOINLINE gemmkernel_12x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
  asm volatile(
#if !defined(__clang__) || __clang_major__ >= 14
      "mov r14, %[gp]\t\n"
#else
      "mov %[gp], %%r14\t\n"
      ".intel_syntax noprefix\t\n"
#endif

      // Copy parameters
      // k (kept as k - 1: the first k-step is handled outside the loop)
      "mov r8, [r14 + 0]\t\n"
      "dec r8\t\n"
      // A
      "mov r9, [r14 + 8]\t\n"
      // B
      "mov r10, [r14 + 16]\t\n"
      // beta (address only; the value is re-read at the top of every block)
      "lea r15, [r14 + 24]\t\n"
      // C
      "mov r12, [r14 + 32]\t\n"
      // ldc
      "mov r13, [r14 + 40]\t\n"
      // b_block_cols
      "mov rdi, [r14 + 48]\t\n"
      // b_block_size (not used below)
      "mov rsi, [r14 + 56]\t\n"

      // Make copies of A and C
      "mov rax, r9\t\n"
      "mov rcx, r12\t\n"

      // rbx = outer (column block) counter
      "xor ebx, ebx\t\n"
      "loop_outter%=:\t\n"
      // r14 = k - 1 remaining k-steps after the peeled first one
      "mov r14, r8\t\n"
      "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
      // First B row of this block: 2 x 16 fp16 values widened to fp32
      "vcvtph2ps zmm25,YMMWORD PTR [r10 + 0]\t\n"
      "vcvtph2ps zmm26,YMMWORD PTR [r10 + 32]\t\n"
      // beta == 0 selects the path that never reads C. vcomiss also sets ZF
      // for an unordered compare, so a NaN beta takes that path as well.
      "vxorps xmm0, xmm0, xmm0\t\n"
      "vcomiss xmm31, xmm0\t\n"
      "jz zero_regs%=\t\n"

      // Setup values with beta multiplication
      "vmulps zmm0, zmm31, [r12 + 0]\t\n"
      "vmulps zmm1, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm2, zmm31, [r12 + 0]\t\n"
      "vmulps zmm3, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm4, zmm31, [r12 + 0]\t\n"
      "vmulps zmm5, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm6, zmm31, [r12 + 0]\t\n"
      "vmulps zmm7, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm8, zmm31, [r12 + 0]\t\n"
      "vmulps zmm9, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm10, zmm31, [r12 + 0]\t\n"
      "vmulps zmm11, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm12, zmm31, [r12 + 0]\t\n"
      "vmulps zmm13, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm14, zmm31, [r12 + 0]\t\n"
      "vmulps zmm15, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm16, zmm31, [r12 + 0]\t\n"
      "vmulps zmm17, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm18, zmm31, [r12 + 0]\t\n"
      "vmulps zmm19, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm20, zmm31, [r12 + 0]\t\n"
      "vmulps zmm21, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm22, zmm31, [r12 + 0]\t\n"
      "vmulps zmm23, zmm31, [r12 + 64]\t\n"

      // beta is no longer needed: reuse zmm31 to preload the first half of
      // the next B row, but only if another k-step follows.
      "test r14,r14\t\n"
      "jz skip_preload%=\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "skip_preload%=:\t\n"

      // First k-step, accumulated on top of beta * C
      "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm25,zmm24\t\n"
      "vfmadd231ps zmm1,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm25,zmm24\t\n"
      "vfmadd231ps zmm3,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm25,zmm24\t\n"
      "vfmadd231ps zmm5,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm25,zmm24\t\n"
      "vfmadd231ps zmm7,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm25,zmm24\t\n"
      "vfmadd231ps zmm9,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm25,zmm24\t\n"
      "vfmadd231ps zmm11,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm25,zmm24\t\n"
      "vfmadd231ps zmm13,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm25,zmm24\t\n"
      "vfmadd231ps zmm15,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n"
      "vfmadd231ps zmm16,zmm25,zmm24\t\n"
      "vfmadd231ps zmm17,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n"
      "vfmadd231ps zmm18,zmm25,zmm24\t\n"
      "vfmadd231ps zmm19,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n"
      "vfmadd231ps zmm20,zmm25,zmm24\t\n"
      "vfmadd231ps zmm21,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n"
      "vfmadd231ps zmm22,zmm25,zmm24\t\n"
      "vfmadd231ps zmm23,zmm26,zmm24\t\n"
      // Rewind the C cursor to the first row of the tile for the store phase
      "mov r12, rcx\t\n"
      // More k-steps pending -> join the loop at its increment; otherwise
      // (k == 1) step B past the consumed row and store the tile.
      "test r14,r14\t\n"
      "jnz next_inner%=\t\n"
      "add r10,64\t\n"
      "jmp dump_C%=\t\n"

      // beta == 0: C is not read; the accumulators are initialized directly
      // from the first k-step product.
      "zero_regs%=:\t\n"

      "test r14,r14\t\n"
      "jz skip_preload_b_zero%=\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "skip_preload_b_zero%=:\t\n"
      // The "add r12, r13" instructions in this path have no effect on the
      // result: r12 is not dereferenced here and is reloaded from rcx below.
      "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n"
      "vmulps zmm0,zmm25,zmm24\t\n"
      "vmulps zmm1,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n"
      "vmulps zmm2,zmm25,zmm24\t\n"
      "vmulps zmm3,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n"
      "vmulps zmm4,zmm25,zmm24\t\n"
      "vmulps zmm5,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n"
      "vmulps zmm6,zmm25,zmm24\t\n"
      "vmulps zmm7,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n"
      "vmulps zmm8,zmm25,zmm24\t\n"
      "vmulps zmm9,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n"
      "vmulps zmm10,zmm25,zmm24\t\n"
      "vmulps zmm11,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n"
      "vmulps zmm12,zmm25,zmm24\t\n"
      "vmulps zmm13,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n"
      "vmulps zmm14,zmm25,zmm24\t\n"
      "vmulps zmm15,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n"
      "vmulps zmm16,zmm25,zmm24\t\n"
      "vmulps zmm17,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n"
      "vmulps zmm18,zmm25,zmm24\t\n"
      "vmulps zmm19,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n"
      "vmulps zmm20,zmm25,zmm24\t\n"
      "vmulps zmm21,zmm26,zmm24\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n"
      "vmulps zmm22,zmm25,zmm24\t\n"
      "vmulps zmm23,zmm26,zmm24\t\n"
      "mov r12, rcx\t\n"
      "test r14,r14\t\n"
      "jnz next_inner%=\t\n"
      "add r10,64\t\n"
      "jmp dump_C%=\t\n"

      // Middle k-steps: consume the preloaded half row (zmm31 -> zmm25), load
      // the second half, and preload the first half of the following B row.
      "loop_inner%=:\t\n"

      "vmovaps zmm25,zmm31\t\n"
      "vcvtph2ps zmm26,YMMWORD PTR [r10 + 32]\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm25,zmm24\t\n"
      "vfmadd231ps zmm1,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm25,zmm24\t\n"
      "vfmadd231ps zmm3,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm25,zmm24\t\n"
      "vfmadd231ps zmm5,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm25,zmm24\t\n"
      "vfmadd231ps zmm7,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm25,zmm24\t\n"
      "vfmadd231ps zmm9,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm25,zmm24\t\n"
      "vfmadd231ps zmm11,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm25,zmm24\t\n"
      "vfmadd231ps zmm13,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm25,zmm24\t\n"
      "vfmadd231ps zmm15,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n"
      "vfmadd231ps zmm16,zmm25,zmm24\t\n"
      "vfmadd231ps zmm17,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n"
      "vfmadd231ps zmm18,zmm25,zmm24\t\n"
      "vfmadd231ps zmm19,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n"
      "vfmadd231ps zmm20,zmm25,zmm24\t\n"
      "vfmadd231ps zmm21,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n"
      "vfmadd231ps zmm22,zmm25,zmm24\t\n"
      "vfmadd231ps zmm23,zmm26,zmm24\t\n"

      // Advance A by 12 floats and B by one fp16 row; loop while more than
      // one k-step remains.
      "next_inner%=:\t\n"
      "add r9,48\t\n"
      "add r10,64\t\n"
      "dec r14\t\n"
      "jnz loop_inner%=\t\n"

      // Last k-step: same as the loop body but without the preload, so no
      // load is issued for a B row beyond the last one of this block.
      "vmovaps zmm25,zmm31\t\n"
      "vcvtph2ps zmm26,YMMWORD PTR [r10 + 32]\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm25,zmm24\t\n"
      "vfmadd231ps zmm1,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm25,zmm24\t\n"
      "vfmadd231ps zmm3,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm25,zmm24\t\n"
      "vfmadd231ps zmm5,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm25,zmm24\t\n"
      "vfmadd231ps zmm7,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm25,zmm24\t\n"
      "vfmadd231ps zmm9,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm25,zmm24\t\n"
      "vfmadd231ps zmm11,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm25,zmm24\t\n"
      "vfmadd231ps zmm13,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm25,zmm24\t\n"
      "vfmadd231ps zmm15,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n"
      "vfmadd231ps zmm16,zmm25,zmm24\t\n"
      "vfmadd231ps zmm17,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n"
      "vfmadd231ps zmm18,zmm25,zmm24\t\n"
      "vfmadd231ps zmm19,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n"
      "vfmadd231ps zmm20,zmm25,zmm24\t\n"
      "vfmadd231ps zmm21,zmm26,zmm24\t\n"
      "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n"
      "vfmadd231ps zmm22,zmm25,zmm24\t\n"
      "vfmadd231ps zmm23,zmm26,zmm24\t\n"
      "add r9,48\t\n"
      "add r10,64\t\n"
      // Dump C
      // (unaligned stores: 12 rows, 128 bytes per row, rows r13 bytes apart)
      "dump_C%=:\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm11\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm12\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm13\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm14\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm15\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm16\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm17\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm18\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm19\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm20\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm21\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm22\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm23\t\n"

      // next outer iteration
      // C moves right by 32 floats (128 bytes); A restarts from its base;
      // B (r10) simply continues where the previous block stopped.
      "add rcx, 128\t\n"
      "mov r12, rcx\t\n"
      "mov r9, rax\t\n"
      "inc rbx\t\n"
      "cmp rbx, rdi\t\n"
      "jl loop_outter%=\t\n"
      :
      : [gp] "rm"(gp)
      // Every register the template writes must be declared here, including
      // the vector registers and the flags; otherwise the compiler is free to
      // assume they survive the asm statement.
      : "r8",
        "r9",
        "r10",
        "r11",
        "r13",
        "r14",
        "rax",
        "rcx",
        "rsi",
        "rdi",
        "rbx",
        "r12",
        "r15",
        "zmm0",
        "zmm1",
        "zmm2",
        "zmm3",
        "zmm4",
        "zmm5",
        "zmm6",
        "zmm7",
        "zmm8",
        "zmm9",
        "zmm10",
        "zmm11",
        "zmm12",
        "zmm13",
        "zmm14",
        "zmm15",
        "zmm16",
        "zmm17",
        "zmm18",
        "zmm19",
        "zmm20",
        "zmm21",
        "zmm22",
        "zmm23",
        "zmm24",
        "zmm25",
        "zmm26",
        "zmm31",
        "cc",
        "memory");
}