in source/intel/asm/crc32c_sse42_asm.c [297:373]
/**
 * Computes the CRC32c of a buffer using the SSE4.2 CRC32 instructions, switching to the
 * CLMUL-based block routines for large inputs when the CPU reports CLMUL support.
 *
 * @param input          bytes to checksum
 * @param length         number of bytes readable at input; values <= 0 return previousCrc32 unchanged
 * @param previousCrc32  CRC value to continue from
 * @return the updated CRC32c value
 */
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    if (AWS_UNLIKELY(!detection_performed)) {
        detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
        /* Simply setting the flag true to skip HW detection next time.
           Not using memory barriers: a thread that observes the flags out of order
           at worst skips the CLMUL block routines and uses the plain CRC32 loops below. */
        detection_performed = true;
    }

    /* The running CRC is kept bit-inverted while processing and inverted back on return */
    uint32_t crc = ~previousCrc32;

    /*
     * Note on the inline asm below: every statement reads the buffer through the pointer in
     * %[in], which the compiler cannot see from a plain "r" operand. The "memory" clobber
     * tells it the asm accesses memory, so pending stores to the buffer are not reordered
     * past (or dropped before) the CRC instructions.
     */

    /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
    if (AWS_UNLIKELY(length < 8)) {
        while (length-- > 0) {
            __asm__("loop_small_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input) : "memory");
            input++;
        }
        return ~crc;
    }

    /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits.
       uintptr_t is used because unsigned long is narrower than a pointer on LLP64 targets. */
    int input_alignment = (int)((uintptr_t)input & 0x7);

    /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
    int leading = (8 - input_alignment) & 0x7;

    /* Reduce the length by the leading unaligned bytes we are about to process
       (length >= 8 here, so it stays positive) */
    length -= leading;

    /* Spin through the leading unaligned input bytes (if any) one-by-one */
    while (leading-- > 0) {
        __asm__("loop_leading_%=: CRC32B (%[in]), %[crc]" : [ crc ] "+c"(crc) : [ in ] "r"(input) : "memory");
        input++;
    }

    /* Using likely to keep this code inlined */
    if (AWS_LIKELY(detected_clmul)) {
        /* Largest block size first; each call consumes one block and chains the crc result */
        while (AWS_LIKELY(length >= 3072)) {
            crc = s_crc32c_sse42_clmul_3072(input, crc);
            input += 3072;
            length -= 3072;
        }
        while (AWS_LIKELY(length >= 1024)) {
            crc = s_crc32c_sse42_clmul_1024(input, crc);
            input += 1024;
            length -= 1024;
        }
        while (AWS_LIKELY(length >= 256)) {
            crc = s_crc32c_sse42_clmul_256(input, crc);
            input += 256;
            length -= 256;
        }
    }

    /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
    while (AWS_LIKELY(length >= 8)) {
        /* Hardcoding %rcx register (i.e. "+c") to allow use of qword instruction:
           crc is a 32-bit variable, so a named operand would be printed as %ecx */
        __asm__ __volatile__("loop_8_%=: CRC32Q (%[in]), %%rcx" : "+c"(crc) : [ in ] "r"(input) : "memory");
        input += 8;
        length -= 8;
    }

    /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
    while (length-- > 0) {
        __asm__ __volatile__("loop_trailing_%=: CRC32B (%[in]), %[crc]"
                             : [ crc ] "+c"(crc)
                             : [ in ] "r"(input)
                             : "memory");
        input++;
    }

    return ~crc;
}