static inline uint32_t s_crc32c_sse42_clmul_3072()

in source/intel/asm/crc32c_sse42_asm.c [214:285]


static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t crc) {
    __asm__ __volatile__(
        "enter_3072_%=:"

        "xor          %%r11, %%r11    # zero all 64 bits in r11, will track crc1 \n"
        "xor          %%r10, %%r10    # zero all 64 bits in r10, will track crc2 \n"

        "movl           $16, %%r8d    # Loop 16 times through 64 byte chunks in 3 parallel stripes \n"

        "loop_3072_%=:"

        "prefetcht0  128(%[in])       # \n"
        "prefetcht0 1152(%[in])       # \n"
        "prefetcht0 2176(%[in])       # \n"

        "crc32q    0(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1024(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2048(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q    8(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1032(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2056(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   16(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1040(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2064(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   24(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1048(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2072(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   32(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1056(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2080(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   40(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1064(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2088(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   48(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1072(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2096(%[in]), %%r10    # crc2: stripe2 \n"

        "crc32q   56(%[in]), %%rcx    # crc0: stripe0 \n"
        "crc32q 1080(%[in]), %%r11    # crc1: stripe1 \n"
        "crc32q 2104(%[in]), %%r10    # crc2: stripe2 \n"

        "add            $64, %[in]    # \n"
        "sub             $1, %%r8d    # \n"
        "jnz loop_3072_%=             # \n"

        FOLD_K1K2(
            3072,
            $0xa51b6135,
            $0x170076fa) /* Magic Constants used to fold crc stripes into ecx

                            output registers
                            [crc] is an input and and output so it is marked read/write (i.e. "+c")
                            we clobber the register for [input] (via add instruction) so we must also
                            tag it read/write (i.e. "+d") in the list of outputs to tell gcc about the clobber*/
        : "+c"(crc), "+d"(input)

        /* input registers
           the numeric values match the position of the output registers */
        : [ crc ] "c"(crc), [ in ] "d"(input)

        /* additional clobbered registers
          "cc" is the flags - we add and sub, so the flags are also clobbered */
        : "%r8", "%r9", "%r11", "%r10", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "cc");

    return crc;
}