in source/intel/asm/crc32c_sse42_asm.c [126:204]
/*
 * Feeds exactly 1024 bytes starting at `input` through the SSE4.2 `crc32q`
 * instruction and returns the updated checksum.
 *
 * The 1024 bytes are processed as three interleaved stripes so that three
 * independent crc32q dependency chains are in flight at once:
 *
 *   stripe0: bytes   0..343  (344 bytes), accumulated in rcx, seeded with `crc`
 *   stripe1: bytes 344..679  (336 bytes), accumulated in r11, seeded with 0
 *   stripe2: bytes 680..1023 (344 bytes), accumulated in r10, seeded with 0
 *
 * A 5-iteration loop consumes 64 bytes of every stripe per pass (320 bytes per
 * stripe); the unrolled tail then consumes the remaining 24 / 16 / 24 bytes.
 * FOLD_K1K2 finally merges the three partial values.
 * NOTE(review): FOLD_K1K2 is defined outside this block; it is assumed to
 * leave the combined result in %[crc] and to use r8, r9 and xmm1-xmm4 as
 * scratch, which is why those registers appear in the clobber list.
 *
 * input: pointer to at least 1024 readable bytes (only a local copy of the
 *        pointer is advanced; the caller's pointer is unaffected).
 * crc:   running checksum value on entry.
 * Returns the checksum value after the 1024 bytes have been folded in.
 */
static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) {
    __asm__ __volatile__(
        "enter_1024_%=:"
        "xor %%r11, %%r11 # zero all 64 bits in r11, will track crc1 \n"
        "xor %%r10, %%r10 # zero all 64 bits in r10, will track crc2 \n"
        "movl $5, %%r8d # Loop 5 times through 64 byte chunks in 3 parallel stripes \n"
        "loop_1024_%=:"
        /* Prefetch 128 bytes ahead of the current position in each stripe. */
        "prefetcht0 128(%[in]) # \n"
        "prefetcht0 472(%[in]) # \n"
        "prefetcht0 808(%[in]) # \n"
        /* 8 x 8 bytes per stripe, interleaved across the three accumulators. */
        "crc32q 0(%[in]), %%rcx # crc0: stripe0 \n"
        "crc32q 344(%[in]), %%r11 # crc1: stripe1 \n"
        "crc32q 680(%[in]), %%r10 # crc2: stripe2 \n"
        "crc32q 8(%[in]), %%rcx # crc0 \n"
        "crc32q 352(%[in]), %%r11 # crc1 \n"
        "crc32q 688(%[in]), %%r10 # crc2 \n"
        "crc32q 16(%[in]), %%rcx # crc0 \n"
        "crc32q 360(%[in]), %%r11 # crc1 \n"
        "crc32q 696(%[in]), %%r10 # crc2 \n"
        "crc32q 24(%[in]), %%rcx # crc0 \n"
        "crc32q 368(%[in]), %%r11 # crc1 \n"
        "crc32q 704(%[in]), %%r10 # crc2 \n"
        "crc32q 32(%[in]), %%rcx # crc0 \n"
        "crc32q 376(%[in]), %%r11 # crc1 \n"
        "crc32q 712(%[in]), %%r10 # crc2 \n"
        "crc32q 40(%[in]), %%rcx # crc0 \n"
        "crc32q 384(%[in]), %%r11 # crc1 \n"
        "crc32q 720(%[in]), %%r10 # crc2 \n"
        "crc32q 48(%[in]), %%rcx # crc0 \n"
        "crc32q 392(%[in]), %%r11 # crc1 \n"
        "crc32q 728(%[in]), %%r10 # crc2 \n"
        "crc32q 56(%[in]), %%rcx # crc0 \n"
        "crc32q 400(%[in]), %%r11 # crc1 \n"
        "crc32q 736(%[in]), %%r10 # crc2 \n"
        "add $64, %[in] # \n"
        "sub $1, %%r8d # \n"
        "jnz loop_1024_%= # \n"
        /* Tail: [in] has advanced by 320 bytes. Stripes 0 and 2 still have
         * 24 bytes left, stripe 1 only 16, hence no third crc1 step. */
        "crc32q 0(%[in]), %%rcx # crc0 \n"
        "crc32q 344(%[in]), %%r11 # crc1 \n"
        "crc32q 680(%[in]), %%r10 # crc2 \n"
        "crc32q 8(%[in]), %%rcx # crc0 \n"
        "crc32q 352(%[in]), %%r11 # crc1 \n"
        "crc32q 688(%[in]), %%r10 # crc2 \n"
        "crc32q 16(%[in]), %%rcx # crc0 \n"
        "crc32q 696(%[in]), %%r10 # crc2 \n"
        /* Magic constants used to fold the crc stripes into ecx. */
        FOLD_K1K2(1024, $0xe417f38a, $0x8f158014)
        /* Output registers.
         * [crc] is both read and written, so it is a read/write operand pinned
         * to rcx ("+c"). [in] is advanced by the add instruction, so it must
         * also be a read/write operand ("+d", rdx) rather than an input-only
         * operand, which the asm would not be allowed to modify. */
        : [ crc ] "+c"(crc), [ in ] "+d"(input)
        /* No input-only operands: both values are carried by the read/write
         * operands above. */
        :
        /* Additional clobbered registers.
         * "cc": add/sub/xor modify the flags.
         * "memory": the asm reads the buffer behind [in], which is not
         * expressed as an operand; without this the compiler may reorder or
         * elide stores to that buffer around the asm statement. */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc", "memory");
    return crc;
}