uint32_t aws_checksums_crc32c_hw()

in source/intel/asm/crc32c_sse42_asm.c [297:373]


uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {

    if (AWS_UNLIKELY(!detection_performed)) {
        detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
        /* Simply setting the flag true to skip HW detection next time
           Not using memory barriers since the worst that can
           happen is a fallback to the non HW accelerated code. */
        detection_performed = true;
    }

    uint32_t crc = ~previousCrc32;

    /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
    if (AWS_UNLIKELY(length < 8)) {
        while (length-- > 0) {
            __asm__("loop_small_%=: CRC32B (%[in]), %[crc]" : "+c"(crc) : [ crc ] "c"(crc), [ in ] "r"(input));
            input++;
        }
        return ~crc;
    }

    /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
    int input_alignment = (unsigned long int)input & 0x7;

    /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
    int leading = (8 - input_alignment) & 0x7;

    /* reduce the length by the leading unaligned bytes we are about to process */
    length -= leading;

    /* spin through the leading unaligned input bytes (if any) one-by-one */
    while (leading-- > 0) {
        __asm__("loop_leading_%=: CRC32B (%[in]), %[crc]" : "+c"(crc) : [ crc ] "c"(crc), [ in ] "r"(input));
        input++;
    }

    /* Using likely to keep this code inlined */
    if (AWS_LIKELY(detected_clmul)) {

        while (AWS_LIKELY(length >= 3072)) {
            /* Compute crc32c on each block, chaining each crc result */
            crc = s_crc32c_sse42_clmul_3072(input, crc);
            input += 3072;
            length -= 3072;
        }
        while (AWS_LIKELY(length >= 1024)) {
            /* Compute crc32c on each block, chaining each crc result */
            crc = s_crc32c_sse42_clmul_1024(input, crc);
            input += 1024;
            length -= 1024;
        }
        while (AWS_LIKELY(length >= 256)) {
            /* Compute crc32c on each block, chaining each crc result */
            crc = s_crc32c_sse42_clmul_256(input, crc);
            input += 256;
            length -= 256;
        }
    }

    /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
    while (AWS_LIKELY(length >= 8)) {
        /* Hardcoding %rcx register (i.e. "+c") to allow use of qword instruction */
        __asm__ __volatile__("loop_8_%=: CRC32Q (%[in]), %%rcx" : "+c"(crc) : [ crc ] "c"(crc), [ in ] "r"(input));
        input += 8;
        length -= 8;
    }

    /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
    while (length-- > 0) {
        __asm__ __volatile__("loop_trailing_%=: CRC32B (%[in]), %[crc]"
                             : "+c"(crc)
                             : [ crc ] "c"(crc), [ in ] "r"(input));
        input++;
    }

    return ~crc;
}