in src/FbgemmFP16UKernelsAvx512.cc [1276:1532]
// 8x2 AVX-512 fp16 GEMM microkernel: computes C = beta*C + A*B for an
// 8-row panel of A against B blocks that are 2 zmm registers wide
// (2 * 16 = 32 fp32 lanes per k-step). B is stored in fp16 and converted
// to fp32 on load (vcvtph2ps); A and C are fp32. Accumulation happens in
// fp32 in zmm0..zmm15 (two accumulators per A row). When beta == 0 the
// kernel takes a path that never reads C, so C may be uninitialized.
//
// GemmParamsFP16 layout assumed from the offsets below (k@0, A@8, B@16,
// beta@24, C@32, ldc@40, b_block_cols@48, b_block_size@56) — TODO confirm
// against the struct definition elsewhere in the project.
//
// Register roles inside the asm:
//   r8  = k-1 (last k-iteration is peeled; see "dec r8" below)
//   r9  = A cursor (rax keeps the panel start for reuse per B block)
//   r10 = B cursor (fp16 data)
//   r15 = &beta
//   r12 = C cursor (rcx keeps the current B-block's C base)
//   r13 = ldc in bytes
//   rdi = b_block_cols (outer trip count), rbx = outer loop counter
//   zmm17/zmm18 = current converted B row; zmm31 = preloaded next B row
//   zmm16 = broadcast A scalar
//
// NOTE(review): this looks like auto-generated kernel code — prefer
// regenerating over hand-editing the instruction sequence.
void NOINLINE gemmkernel_8x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
asm volatile(
#if !defined(__clang__) || __clang_major__ >= 14
// Presumably compiled with Intel-syntax asm enabled (-masm=intel) for
// these compilers, so no syntax switch is needed — TODO confirm build flags.
"mov r14, %[gp]\t\n"
#else
// Older clang: read gp while still in AT&T syntax, then switch the rest of
// the block to Intel syntax without register prefixes.
"mov %[gp], %%r14\t\n"
".intel_syntax noprefix\t\n"
#endif
// Copy parameters
// k
"mov r8, [r14 + 0]\t\n"
// Peel the last k-iteration: the loop body preloads B[k+1], which would
// read past the block on the final step; the tail is duplicated after
// loop_inner without the preload.
"dec r8\t\n"
// A
"mov r9, [r14 + 8]\t\n"
// B
"mov r10, [r14 + 16]\t\n"
// beta
"lea r15, [r14 + 24]\t\n"
// C
"mov r12, [r14 + 32]\t\n"
// ldc
"mov r13, [r14 + 40]\t\n"
// b_block_cols
"mov rdi, [r14 + 48]\t\n"
// b_block_size
// NOTE(review): rsi (b_block_size) is loaded but never used below —
// presumably a codegen artifact; confirm before removing.
"mov rsi, [r14 + 56]\t\n"
// Make copies of A and C
"mov rax, r9\t\n"
"mov rcx, r12\t\n"
"xor ebx, ebx\t\n"
// One outer iteration per 2-zmm-wide column block of B.
"loop_outter%=:\t\n"
"mov r14, r8\t\n"
// zmm31 = broadcast(beta); zmm17/zmm18 = first B row converted fp16->fp32.
"vbroadcastss zmm31,DWORD PTR [r15]\t\n"
"vcvtph2ps zmm17,YMMWORD PTR [r10 + 0]\t\n"
"vcvtph2ps zmm18,YMMWORD PTR [r10 + 32]\t\n"
// If beta == 0, take the path that never reads C.
"vxorps xmm0, xmm0, xmm0\t\n"
"vcomiss xmm31, xmm0\t\n"
"jz zero_regs%=\t\n"
// Setup values with beta multiplication
// Initialize accumulators: zmm{2i},zmm{2i+1} = beta * C[row i], walking
// C one ldc-stride (r13) per row.
"vmulps zmm0, zmm31, [r12 + 0]\t\n"
"vmulps zmm1, zmm31, [r12 + 64]\t\n"
"add r12, r13\t\n"
"vmulps zmm2, zmm31, [r12 + 0]\t\n"
"vmulps zmm3, zmm31, [r12 + 64]\t\n"
"add r12, r13\t\n"
"vmulps zmm4, zmm31, [r12 + 0]\t\n"
"vmulps zmm5, zmm31, [r12 + 64]\t\n"
"add r12, r13\t\n"
"vmulps zmm6, zmm31, [r12 + 0]\t\n"
"vmulps zmm7, zmm31, [r12 + 64]\t\n"
"add r12, r13\t\n"
"vmulps zmm8, zmm31, [r12 + 0]\t\n"
"vmulps zmm9, zmm31, [r12 + 64]\t\n"
"add r12, r13\t\n"
"vmulps zmm10, zmm31, [r12 + 0]\t\n"
"vmulps zmm11, zmm31, [r12 + 64]\t\n"
"add r12, r13\t\n"
"vmulps zmm12, zmm31, [r12 + 0]\t\n"
"vmulps zmm13, zmm31, [r12 + 64]\t\n"
"add r12, r13\t\n"
"vmulps zmm14, zmm31, [r12 + 0]\t\n"
"vmulps zmm15, zmm31, [r12 + 64]\t\n"
// Software pipeline: if more k-iterations remain (r14 = k-1 > 0), preload
// the first half of the next B row into zmm31 (beta is no longer needed).
"test r14,r14\t\n"
"jz skip_preload%=\t\n"
"vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
"skip_preload%=:\t\n"
// First k-iteration, fused with the beta setup above: for each of the 8
// A rows, broadcast A[row] and accumulate into that row's two zmm regs.
"vbroadcastss zmm16,DWORD PTR [r9+0]\t\n"
"vfmadd231ps zmm0,zmm17,zmm16\t\n"
"vfmadd231ps zmm1,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+4]\t\n"
"vfmadd231ps zmm2,zmm17,zmm16\t\n"
"vfmadd231ps zmm3,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+8]\t\n"
"vfmadd231ps zmm4,zmm17,zmm16\t\n"
"vfmadd231ps zmm5,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+12]\t\n"
"vfmadd231ps zmm6,zmm17,zmm16\t\n"
"vfmadd231ps zmm7,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+16]\t\n"
"vfmadd231ps zmm8,zmm17,zmm16\t\n"
"vfmadd231ps zmm9,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+20]\t\n"
"vfmadd231ps zmm10,zmm17,zmm16\t\n"
"vfmadd231ps zmm11,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+24]\t\n"
"vfmadd231ps zmm12,zmm17,zmm16\t\n"
"vfmadd231ps zmm13,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+28]\t\n"
"vfmadd231ps zmm14,zmm17,zmm16\t\n"
"vfmadd231ps zmm15,zmm18,zmm16\t\n"
// Rewind C to the block base; if k == 1 the single (peeled) iteration is
// already done, so skip straight to the store.
"mov r12, rcx\t\n"
"test r14,r14\t\n"
"jnz next_inner%=\t\n"
"add r10,64\t\n"
"jmp dump_C%=\t\n"
// beta == 0 path: same fused first iteration, but accumulators are
// initialized with vmulps (A*B) instead of FMA on top of beta*C, and C is
// never read.
"zero_regs%=:\t\n"
"test r14,r14\t\n"
"jz skip_preload_b_zero%=\t\n"
"vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
"skip_preload_b_zero%=:\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+0]\t\n"
"vmulps zmm0,zmm17,zmm16\t\n"
"vmulps zmm1,zmm18,zmm16\t\n"
// NOTE(review): the "add r12, r13" steps on this path are dead — r12 is
// reset from rcx below and C is not accessed here; codegen artifact.
"add r12, r13\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+4]\t\n"
"vmulps zmm2,zmm17,zmm16\t\n"
"vmulps zmm3,zmm18,zmm16\t\n"
"add r12, r13\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+8]\t\n"
"vmulps zmm4,zmm17,zmm16\t\n"
"vmulps zmm5,zmm18,zmm16\t\n"
"add r12, r13\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+12]\t\n"
"vmulps zmm6,zmm17,zmm16\t\n"
"vmulps zmm7,zmm18,zmm16\t\n"
"add r12, r13\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+16]\t\n"
"vmulps zmm8,zmm17,zmm16\t\n"
"vmulps zmm9,zmm18,zmm16\t\n"
"add r12, r13\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+20]\t\n"
"vmulps zmm10,zmm17,zmm16\t\n"
"vmulps zmm11,zmm18,zmm16\t\n"
"add r12, r13\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+24]\t\n"
"vmulps zmm12,zmm17,zmm16\t\n"
"vmulps zmm13,zmm18,zmm16\t\n"
"add r12, r13\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+28]\t\n"
"vmulps zmm14,zmm17,zmm16\t\n"
"vmulps zmm15,zmm18,zmm16\t\n"
"mov r12, rcx\t\n"
"test r14,r14\t\n"
"jnz next_inner%=\t\n"
"add r10,64\t\n"
"jmp dump_C%=\t\n"
// Steady-state inner loop over k: promote the preloaded half-row
// (zmm31 -> zmm17), convert the other half (zmm18), preload the next
// row's first half into zmm31, then do the 8 broadcast+FMA pairs.
"loop_inner%=:\t\n"
"vmovaps zmm17,zmm31\t\n"
"vcvtph2ps zmm18,YMMWORD PTR [r10 + 32]\t\n"
"vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+0]\t\n"
"vfmadd231ps zmm0,zmm17,zmm16\t\n"
"vfmadd231ps zmm1,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+4]\t\n"
"vfmadd231ps zmm2,zmm17,zmm16\t\n"
"vfmadd231ps zmm3,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+8]\t\n"
"vfmadd231ps zmm4,zmm17,zmm16\t\n"
"vfmadd231ps zmm5,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+12]\t\n"
"vfmadd231ps zmm6,zmm17,zmm16\t\n"
"vfmadd231ps zmm7,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+16]\t\n"
"vfmadd231ps zmm8,zmm17,zmm16\t\n"
"vfmadd231ps zmm9,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+20]\t\n"
"vfmadd231ps zmm10,zmm17,zmm16\t\n"
"vfmadd231ps zmm11,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+24]\t\n"
"vfmadd231ps zmm12,zmm17,zmm16\t\n"
"vfmadd231ps zmm13,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+28]\t\n"
"vfmadd231ps zmm14,zmm17,zmm16\t\n"
"vfmadd231ps zmm15,zmm18,zmm16\t\n"
// Advance A by 8 floats (32 B) and B by 32 fp16 values (64 B) per k-step.
"next_inner%=:\t\n"
"add r9,32\t\n"
"add r10,64\t\n"
"dec r14\t\n"
"jnz loop_inner%=\t\n"
// Peeled final k-iteration: identical to the loop body but with no
// preload, so it never reads B past the end of the block.
"vmovaps zmm17,zmm31\t\n"
"vcvtph2ps zmm18,YMMWORD PTR [r10 + 32]\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+0]\t\n"
"vfmadd231ps zmm0,zmm17,zmm16\t\n"
"vfmadd231ps zmm1,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+4]\t\n"
"vfmadd231ps zmm2,zmm17,zmm16\t\n"
"vfmadd231ps zmm3,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+8]\t\n"
"vfmadd231ps zmm4,zmm17,zmm16\t\n"
"vfmadd231ps zmm5,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+12]\t\n"
"vfmadd231ps zmm6,zmm17,zmm16\t\n"
"vfmadd231ps zmm7,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+16]\t\n"
"vfmadd231ps zmm8,zmm17,zmm16\t\n"
"vfmadd231ps zmm9,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+20]\t\n"
"vfmadd231ps zmm10,zmm17,zmm16\t\n"
"vfmadd231ps zmm11,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+24]\t\n"
"vfmadd231ps zmm12,zmm17,zmm16\t\n"
"vfmadd231ps zmm13,zmm18,zmm16\t\n"
"vbroadcastss zmm16,DWORD PTR [r9+28]\t\n"
"vfmadd231ps zmm14,zmm17,zmm16\t\n"
"vfmadd231ps zmm15,zmm18,zmm16\t\n"
"add r9,32\t\n"
"add r10,64\t\n"
// Dump C
// Store all 8 rows (2 zmm = 128 B each) back to C, striding by ldc.
// Unaligned stores (vmovups): no alignment assumption on C or ldc.
"dump_C%=:\t\n"
"vmovups zmmword PTR [r12 + 0], zmm0\t\n"
"vmovups zmmword PTR [r12 + 64], zmm1\t\n"
"add r12, r13\t\n"
"vmovups zmmword PTR [r12 + 0], zmm2\t\n"
"vmovups zmmword PTR [r12 + 64], zmm3\t\n"
"add r12, r13\t\n"
"vmovups zmmword PTR [r12 + 0], zmm4\t\n"
"vmovups zmmword PTR [r12 + 64], zmm5\t\n"
"add r12, r13\t\n"
"vmovups zmmword PTR [r12 + 0], zmm6\t\n"
"vmovups zmmword PTR [r12 + 64], zmm7\t\n"
"add r12, r13\t\n"
"vmovups zmmword PTR [r12 + 0], zmm8\t\n"
"vmovups zmmword PTR [r12 + 64], zmm9\t\n"
"add r12, r13\t\n"
"vmovups zmmword PTR [r12 + 0], zmm10\t\n"
"vmovups zmmword PTR [r12 + 64], zmm11\t\n"
"add r12, r13\t\n"
"vmovups zmmword PTR [r12 + 0], zmm12\t\n"
"vmovups zmmword PTR [r12 + 64], zmm13\t\n"
"add r12, r13\t\n"
"vmovups zmmword PTR [r12 + 0], zmm14\t\n"
"vmovups zmmword PTR [r12 + 64], zmm15\t\n"
// next outer iteration
// Advance C by one B-block width (128 B = 2 zmm), rewind A to the panel
// start (B's cursor r10 is already past this block), and loop while
// rbx < b_block_cols.
"add rcx, 128\t\n"
"mov r12, rcx\t\n"
"mov r9, rax\t\n"
"inc rbx\t\n"
"cmp rbx, rdi\t\n"
"jl loop_outter%=\t\n"
:
: [gp] "rm"(gp)
// NOTE(review): zmm0-zmm31 (and flags) are modified but not declared as
// clobbers — this relies on the compiler keeping no live vector values
// across the asm; confirm against the generator/upstream convention.
// NOTE(review): r11 is declared clobbered but never used above.
: "r8",
"r9",
"r10",
"r11",
"r13",
"r14",
"rax",
"rcx",
"rsi",
"rdi",
"rbx",
"r12",
"r15",
"memory");
}