in src/FbgemmFP16UKernelsAvx512_256.cc [1077:1390]
// GEMM micro-kernel: updates an 11-row by 16-float tile of C from fp32 A and
// fp16 B, once per block of B columns:
//
//   beta compares equal to 0 : C_tile = A_tile * B_block
//   otherwise                : C_tile = beta * C_tile + A_tile * B_block
//
// Fields read from *gp (byte offsets exactly as used by the loads below):
//   +0  k              number of k-steps; the code runs k-1 "pipelined" steps
//                      plus one final step, so k is expected to be >= 1
//                      (k == 0 makes `dec r8` wrap around)
//   +8  A              fp32 input, read sequentially: 11 floats (44 bytes)
//                      per k-step; rewound to its base for every column block
//   +16 B              fp16 input, read sequentially: 16 halfs (32 bytes) per
//                      k-step; NOT rewound between column blocks
//   +24 beta           fp32 scalar, read in place through its address
//   +32 C              fp32 output tile base
//   +40 ldc            distance between consecutive C rows, in BYTES (it is
//                      added to the row pointer unscaled)
//   +48 b_block_cols   number of column blocks; each one advances C by
//                      64 bytes (16 floats)
//   +56 b_block_size   loaded into rsi but never read by this kernel
//
// Register roles inside the asm:
//   r8  = k - 1                      r9  = A cursor     rax = A base
//   r10 = B cursor                   r12 = C row cursor rcx = C block base
//   r13 = ldc (bytes)                r15 = &beta
//   rdi = b_block_cols               rbx = column-block counter
//   r14 = gp on entry, then the inner-loop down-counter
//   ymm0..ymm21  accumulators: row i lives in ymm(2i) / ymm(2i+1)
//   ymm22        current A element broadcast to all lanes
//   ymm23/ymm24  current B step converted to fp32 (low / high 8 values)
//   ymm31        beta at first, afterwards the preloaded low 8 B values of
//                the NEXT k-step (software pipelining)
//
// The outer loop is bottom-tested, so its body runs once even when
// b_block_cols is 0.
// NOTE(review): the loop-back uses the signed `jl`; b_block_cols is presumably
// an unsigned count — confirm it can never exceed INT64_MAX.
void NOINLINE gemmkernel_11x2_Avx512_256_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
asm volatile(
// The first form is used when the compiler emits %[gp] in Intel syntax
// itself; the fallback moves gp with an AT&T-syntax instruction and then
// switches the assembler to Intel syntax by hand.
#if !defined(__clang__) || __clang_major__ >= 14
"mov r14, %[gp]\t\n"
#else
"mov %[gp], %%r14\t\n"
".intel_syntax noprefix\t\n"
#endif
// Copy parameters
// k  (kept as k - 1: the last k-step is peeled off after the inner loop)
"mov r8, [r14 + 0]\t\n"
"dec r8\t\n"
// A
"mov r9, [r14 + 8]\t\n"
// B
"mov r10, [r14 + 16]\t\n"
// beta  (address only; the value is broadcast from memory per column block)
"lea r15, [r14 + 24]\t\n"
// C
"mov r12, [r14 + 32]\t\n"
// ldc
"mov r13, [r14 + 40]\t\n"
// b_block_cols
"mov rdi, [r14 + 48]\t\n"
// b_block_size
"mov rsi, [r14 + 56]\t\n"
// Make copies of A and C
"mov rax, r9\t\n"
"mov rcx, r12\t\n"
// rbx = column-block counter, starts at 0
"xor ebx, ebx\t\n"
"loop_outter%=:\t\n"
// r14 no longer holds gp from here on: it becomes the inner down-counter.
"mov r14, r8\t\n"
"vbroadcastss ymm31,DWORD PTR [r15]\t\n"
// Convert the first k-step of B (2 x 8 halfs) to fp32.
"vcvtph2ps ymm23,XMMWORD PTR [r10 + 0]\t\n"
"vcvtph2ps ymm24,XMMWORD PTR [r10 + 16]\t\n"
// beta == 0 ?  If so, C is never read and the accumulators start from A*B.
// NOTE(review): vcomiss also sets ZF for an unordered compare, so a NaN beta
// appears to take the zero path as well — confirm this is intended.
"vxorps xmm0, xmm0, xmm0\t\n"
"vcomiss xmm31, xmm0\t\n"
"jz zero_regs%=\t\n"
// Setup values with beta multiplication
// (accumulators = beta * C, walking r12 down the 11 rows of the tile)
"vmulps ymm0, ymm31, [r12 + 0]\t\n"
"vmulps ymm1, ymm31, [r12 + 32]\t\n"
"add r12, r13\t\n"
"vmulps ymm2, ymm31, [r12 + 0]\t\n"
"vmulps ymm3, ymm31, [r12 + 32]\t\n"
"add r12, r13\t\n"
"vmulps ymm4, ymm31, [r12 + 0]\t\n"
"vmulps ymm5, ymm31, [r12 + 32]\t\n"
"add r12, r13\t\n"
"vmulps ymm6, ymm31, [r12 + 0]\t\n"
"vmulps ymm7, ymm31, [r12 + 32]\t\n"
"add r12, r13\t\n"
"vmulps ymm8, ymm31, [r12 + 0]\t\n"
"vmulps ymm9, ymm31, [r12 + 32]\t\n"
"add r12, r13\t\n"
"vmulps ymm10, ymm31, [r12 + 0]\t\n"
"vmulps ymm11, ymm31, [r12 + 32]\t\n"
"add r12, r13\t\n"
"vmulps ymm12, ymm31, [r12 + 0]\t\n"
"vmulps ymm13, ymm31, [r12 + 32]\t\n"
"add r12, r13\t\n"
"vmulps ymm14, ymm31, [r12 + 0]\t\n"
"vmulps ymm15, ymm31, [r12 + 32]\t\n"
"add r12, r13\t\n"
"vmulps ymm16, ymm31, [r12 + 0]\t\n"
"vmulps ymm17, ymm31, [r12 + 32]\t\n"
"add r12, r13\t\n"
"vmulps ymm18, ymm31, [r12 + 0]\t\n"
"vmulps ymm19, ymm31, [r12 + 32]\t\n"
"add r12, r13\t\n"
"vmulps ymm20, ymm31, [r12 + 0]\t\n"
"vmulps ymm21, ymm31, [r12 + 32]\t\n"
// beta is no longer needed: reuse ymm31 to preload the next k-step of B,
// but only if there is a next step (k > 1).
"test r14,r14\t\n"
"jz skip_preload%=\t\n"
"vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
"skip_preload%=:\t\n"
// First k-step, accumulating on top of beta * C.
"vbroadcastss ymm22,DWORD PTR [r9+0]\t\n"
"vfmadd231ps ymm0,ymm23,ymm22\t\n"
"vfmadd231ps ymm1,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+4]\t\n"
"vfmadd231ps ymm2,ymm23,ymm22\t\n"
"vfmadd231ps ymm3,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+8]\t\n"
"vfmadd231ps ymm4,ymm23,ymm22\t\n"
"vfmadd231ps ymm5,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+12]\t\n"
"vfmadd231ps ymm6,ymm23,ymm22\t\n"
"vfmadd231ps ymm7,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+16]\t\n"
"vfmadd231ps ymm8,ymm23,ymm22\t\n"
"vfmadd231ps ymm9,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+20]\t\n"
"vfmadd231ps ymm10,ymm23,ymm22\t\n"
"vfmadd231ps ymm11,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+24]\t\n"
"vfmadd231ps ymm12,ymm23,ymm22\t\n"
"vfmadd231ps ymm13,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+28]\t\n"
"vfmadd231ps ymm14,ymm23,ymm22\t\n"
"vfmadd231ps ymm15,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+32]\t\n"
"vfmadd231ps ymm16,ymm23,ymm22\t\n"
"vfmadd231ps ymm17,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+36]\t\n"
"vfmadd231ps ymm18,ymm23,ymm22\t\n"
"vfmadd231ps ymm19,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+40]\t\n"
"vfmadd231ps ymm20,ymm23,ymm22\t\n"
"vfmadd231ps ymm21,ymm24,ymm22\t\n"
// Rewind the C row cursor to the top of the tile for the final store.
"mov r12, rcx\t\n"
// k == 1: nothing left to accumulate; step B past this k-step and store.
"test r14,r14\t\n"
"jnz next_inner%=\t\n"
"add r10,32\t\n"
"jmp dump_C%=\t\n"
// beta == 0 path: the first k-step initialises the accumulators with plain
// multiplies, so C is not read. The `add r12, r13` lines below have no
// effect on the result: r12 is reset from rcx before it is used again.
"zero_regs%=:\t\n"
"test r14,r14\t\n"
"jz skip_preload_b_zero%=\t\n"
"vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
"skip_preload_b_zero%=:\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+0]\t\n"
"vmulps ymm0,ymm23,ymm22\t\n"
"vmulps ymm1,ymm24,ymm22\t\n"
"add r12, r13\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+4]\t\n"
"vmulps ymm2,ymm23,ymm22\t\n"
"vmulps ymm3,ymm24,ymm22\t\n"
"add r12, r13\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+8]\t\n"
"vmulps ymm4,ymm23,ymm22\t\n"
"vmulps ymm5,ymm24,ymm22\t\n"
"add r12, r13\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+12]\t\n"
"vmulps ymm6,ymm23,ymm22\t\n"
"vmulps ymm7,ymm24,ymm22\t\n"
"add r12, r13\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+16]\t\n"
"vmulps ymm8,ymm23,ymm22\t\n"
"vmulps ymm9,ymm24,ymm22\t\n"
"add r12, r13\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+20]\t\n"
"vmulps ymm10,ymm23,ymm22\t\n"
"vmulps ymm11,ymm24,ymm22\t\n"
"add r12, r13\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+24]\t\n"
"vmulps ymm12,ymm23,ymm22\t\n"
"vmulps ymm13,ymm24,ymm22\t\n"
"add r12, r13\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+28]\t\n"
"vmulps ymm14,ymm23,ymm22\t\n"
"vmulps ymm15,ymm24,ymm22\t\n"
"add r12, r13\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+32]\t\n"
"vmulps ymm16,ymm23,ymm22\t\n"
"vmulps ymm17,ymm24,ymm22\t\n"
"add r12, r13\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+36]\t\n"
"vmulps ymm18,ymm23,ymm22\t\n"
"vmulps ymm19,ymm24,ymm22\t\n"
"add r12, r13\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+40]\t\n"
"vmulps ymm20,ymm23,ymm22\t\n"
"vmulps ymm21,ymm24,ymm22\t\n"
"mov r12, rcx\t\n"
"test r14,r14\t\n"
"jnz next_inner%=\t\n"
"add r10,32\t\n"
"jmp dump_C%=\t\n"
// Steady-state k-step. On entry r9/r10 already point at this step (they are
// advanced at next_inner). ymm31 holds this step's low 8 B values from the
// previous preload; the high 8 are converted now and the next step's low 8
// are preloaded into ymm31.
"loop_inner%=:\t\n"
"vmovaps ymm23,ymm31\t\n"
"vcvtph2ps ymm24,XMMWORD PTR [r10 + 16]\t\n"
"vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+0]\t\n"
"vfmadd231ps ymm0,ymm23,ymm22\t\n"
"vfmadd231ps ymm1,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+4]\t\n"
"vfmadd231ps ymm2,ymm23,ymm22\t\n"
"vfmadd231ps ymm3,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+8]\t\n"
"vfmadd231ps ymm4,ymm23,ymm22\t\n"
"vfmadd231ps ymm5,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+12]\t\n"
"vfmadd231ps ymm6,ymm23,ymm22\t\n"
"vfmadd231ps ymm7,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+16]\t\n"
"vfmadd231ps ymm8,ymm23,ymm22\t\n"
"vfmadd231ps ymm9,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+20]\t\n"
"vfmadd231ps ymm10,ymm23,ymm22\t\n"
"vfmadd231ps ymm11,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+24]\t\n"
"vfmadd231ps ymm12,ymm23,ymm22\t\n"
"vfmadd231ps ymm13,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+28]\t\n"
"vfmadd231ps ymm14,ymm23,ymm22\t\n"
"vfmadd231ps ymm15,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+32]\t\n"
"vfmadd231ps ymm16,ymm23,ymm22\t\n"
"vfmadd231ps ymm17,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+36]\t\n"
"vfmadd231ps ymm18,ymm23,ymm22\t\n"
"vfmadd231ps ymm19,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+40]\t\n"
"vfmadd231ps ymm20,ymm23,ymm22\t\n"
"vfmadd231ps ymm21,ymm24,ymm22\t\n"
// Advance to the next k-step: 11 floats of A, 16 halfs of B.
"next_inner%=:\t\n"
"add r9,44\t\n"
"add r10,32\t\n"
"dec r14\t\n"
"jnz loop_inner%=\t\n"
// Final k-step, peeled out of the loop: identical to loop_inner except that
// it does not preload, so it reads nothing beyond this step's 32 bytes of B.
"vmovaps ymm23,ymm31\t\n"
"vcvtph2ps ymm24,XMMWORD PTR [r10 + 16]\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+0]\t\n"
"vfmadd231ps ymm0,ymm23,ymm22\t\n"
"vfmadd231ps ymm1,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+4]\t\n"
"vfmadd231ps ymm2,ymm23,ymm22\t\n"
"vfmadd231ps ymm3,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+8]\t\n"
"vfmadd231ps ymm4,ymm23,ymm22\t\n"
"vfmadd231ps ymm5,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+12]\t\n"
"vfmadd231ps ymm6,ymm23,ymm22\t\n"
"vfmadd231ps ymm7,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+16]\t\n"
"vfmadd231ps ymm8,ymm23,ymm22\t\n"
"vfmadd231ps ymm9,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+20]\t\n"
"vfmadd231ps ymm10,ymm23,ymm22\t\n"
"vfmadd231ps ymm11,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+24]\t\n"
"vfmadd231ps ymm12,ymm23,ymm22\t\n"
"vfmadd231ps ymm13,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+28]\t\n"
"vfmadd231ps ymm14,ymm23,ymm22\t\n"
"vfmadd231ps ymm15,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+32]\t\n"
"vfmadd231ps ymm16,ymm23,ymm22\t\n"
"vfmadd231ps ymm17,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+36]\t\n"
"vfmadd231ps ymm18,ymm23,ymm22\t\n"
"vfmadd231ps ymm19,ymm24,ymm22\t\n"
"vbroadcastss ymm22,DWORD PTR [r9+40]\t\n"
"vfmadd231ps ymm20,ymm23,ymm22\t\n"
"vfmadd231ps ymm21,ymm24,ymm22\t\n"
"add r9,44\t\n"
"add r10,32\t\n"
// Dump C
// (unaligned stores of the 22 accumulators, two per C row, 11 rows)
"dump_C%=:\t\n"
"vmovups ymmword PTR [r12 + 0], ymm0\t\n"
"vmovups ymmword PTR [r12 + 32], ymm1\t\n"
"add r12, r13\t\n"
"vmovups ymmword PTR [r12 + 0], ymm2\t\n"
"vmovups ymmword PTR [r12 + 32], ymm3\t\n"
"add r12, r13\t\n"
"vmovups ymmword PTR [r12 + 0], ymm4\t\n"
"vmovups ymmword PTR [r12 + 32], ymm5\t\n"
"add r12, r13\t\n"
"vmovups ymmword PTR [r12 + 0], ymm6\t\n"
"vmovups ymmword PTR [r12 + 32], ymm7\t\n"
"add r12, r13\t\n"
"vmovups ymmword PTR [r12 + 0], ymm8\t\n"
"vmovups ymmword PTR [r12 + 32], ymm9\t\n"
"add r12, r13\t\n"
"vmovups ymmword PTR [r12 + 0], ymm10\t\n"
"vmovups ymmword PTR [r12 + 32], ymm11\t\n"
"add r12, r13\t\n"
"vmovups ymmword PTR [r12 + 0], ymm12\t\n"
"vmovups ymmword PTR [r12 + 32], ymm13\t\n"
"add r12, r13\t\n"
"vmovups ymmword PTR [r12 + 0], ymm14\t\n"
"vmovups ymmword PTR [r12 + 32], ymm15\t\n"
"add r12, r13\t\n"
"vmovups ymmword PTR [r12 + 0], ymm16\t\n"
"vmovups ymmword PTR [r12 + 32], ymm17\t\n"
"add r12, r13\t\n"
"vmovups ymmword PTR [r12 + 0], ymm18\t\n"
"vmovups ymmword PTR [r12 + 32], ymm19\t\n"
"add r12, r13\t\n"
"vmovups ymmword PTR [r12 + 0], ymm20\t\n"
"vmovups ymmword PTR [r12 + 32], ymm21\t\n"
// next outer iteration
// (C moves 64 bytes to the next 16-float column block, A rewinds to its
// base, B keeps advancing from where the inner loop left it)
"add rcx, 64\t\n"
"mov r12, rcx\t\n"
"mov r9, rax\t\n"
"inc rbx\t\n"
"cmp rbx, rdi\t\n"
"jl loop_outter%=\t\n"
:
: [gp] "rm"(gp)
// General-purpose registers written by the template (r11 is listed although
// the template never touches it; kept as in the original list).
: "r8",
"r9",
"r10",
"r11",
"r13",
"r14",
"rax",
"rcx",
"rsi",
"rdi",
"rbx",
"r12",
"r15",
// Vector registers written by the template: ymm0-ymm24 and ymm31. They are
// named by their xmm aliases; the compiler must not keep live values in
// them across the asm and must preserve any that the ABI makes callee-saved.
"xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"xmm8",
"xmm9",
"xmm10",
"xmm11",
"xmm12",
"xmm13",
"xmm14",
"xmm15",
"xmm16",
"xmm17",
"xmm18",
"xmm19",
"xmm20",
"xmm21",
"xmm22",
"xmm23",
"xmm24",
"xmm31",
// Flags are modified by dec/test/cmp/add/xor/vcomiss.
"cc",
"memory");
}