static inline uint32_t s_crc32c_sse42_clmul_1024()

in source/intel/asm/crc32c_sse42_asm.c [126:204]


static inline uint32_t s_crc32c_sse42_clmul_1024(const uint8_t *input, uint32_t crc) {
    __asm__ __volatile__(
        "enter_1024_%=:"

        "xor          %%r11, %%r11    # zero all 64 bits in r11, will track crc1 \n"
        "xor          %%r10, %%r10    # zero all 64 bits in r10, will track crc2 \n"

        "movl            $5, %%r8d    # Loop 5 times through 64 byte chunks in 3 parallel stripes \n"

        "loop_1024_%=:"

        "prefetcht0  128(%[in])       # \n"
        "prefetcht0  472(%[in])       # \n"
        "prefetcht0  808(%[in])       # \n"

        "crc32q    0(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q  344(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q  680(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q    8(%[in]), %%rcx    # crc0 \n"
        "crc32q  352(%[in]), %%r11    # crc1 \n"
        "crc32q  688(%[in]), %%r10    # crc2 \n"

        "crc32q   16(%[in]), %%rcx    # crc0 \n"
        "crc32q  360(%[in]), %%r11    # crc1 \n"
        "crc32q  696(%[in]), %%r10    # crc2 \n"

        "crc32q   24(%[in]), %%rcx    # crc0 \n"
        "crc32q  368(%[in]), %%r11    # crc1 \n"
        "crc32q  704(%[in]), %%r10    # crc2 \n"

        "crc32q   32(%[in]), %%rcx    # crc0 \n"
        "crc32q  376(%[in]), %%r11    # crc1 \n"
        "crc32q  712(%[in]), %%r10    # crc2 \n"

        "crc32q   40(%[in]), %%rcx    # crc0 \n"
        "crc32q  384(%[in]), %%r11    # crc1 \n"
        "crc32q  720(%[in]), %%r10    # crc2 \n"

        "crc32q   48(%[in]), %%rcx    # crc0 \n"
        "crc32q  392(%[in]), %%r11    # crc1 \n"
        "crc32q  728(%[in]), %%r10    # crc2 \n"

        "crc32q   56(%[in]), %%rcx    # crc0 \n"
        "crc32q  400(%[in]), %%r11    # crc1 \n"
        "crc32q  736(%[in]), %%r10    # crc2 \n"

        "add            $64, %[in]    # \n"
        "sub             $1, %%r8d    # \n"
        "jnz loop_1024_%=             # \n"

        "crc32q    0(%[in]), %%rcx    # crc0 \n"
        "crc32q  344(%[in]), %%r11    # crc1 \n"
        "crc32q  680(%[in]), %%r10    # crc2 \n"

        "crc32q    8(%[in]), %%rcx    # crc0 \n"
        "crc32q  352(%[in]), %%r11    # crc1 \n"
        "crc32q  688(%[in]), %%r10    # crc2 \n"

        "crc32q   16(%[in]), %%rcx    # crc0 \n"
        "crc32q  696(%[in]), %%r10    # crc2 \n"

        FOLD_K1K2(1024, $0xe417f38a, $0x8f158014) /* Magic Constants used to fold crc stripes into ecx

                            output registers
                            [crc] is an input and and output so it is marked read/write (i.e. "+c")
                            we clobber the register for [input] (via add instruction) so we must also
                            tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber */
        : "+c"(crc), "+d"(input)

        /* input registers */
        /* the numeric values match the position of the output registers */
        : [ crc ] "c"(crc), [ in ] "d"(input)

        /* additional clobbered registers */
        /* "cc" is the flags - we add and sub, so the flags are also clobbered */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");
    return crc;
}