in source/intel/visualc/visualc_crc32c_sse42.c [24:72]
uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32) {
    uint32_t crc = ~previousCrc32;
    int length_to_process = length;

    slice_ptr_type temp = (slice_ptr_type)data;
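    /* slice_ptr_type / slice_ptr_int_type are presumably defined earlier in this file as
     * uint64_t * / uint64_t on a 64-bit (_M_X64) build and uint32_t * / uint32_t otherwise,
     * so one "slice" is one machine word. */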
    /* To eke good performance out of the Intel implementation, we need to only hit the hardware
     * once we are aligned on the byte boundaries we are using. So, peel off a byte at a time until
     * we are 8-byte aligned (64-bit arch) or 4-byte aligned (32-bit arch).
     *
     * First calculate how many bytes we need to burn before we are aligned.
     * For a 64-bit arch this is:
     *     (8 - <how far we are past a boundary>) mod 8
     * 32-bit:
     *     (4 - <how far we are past a boundary>) mod 4 */
    uint8_t alignment_offset =
        (sizeof(slice_ptr_int_type) - ((slice_ptr_int_type)temp % sizeof(slice_ptr_int_type))) %
        sizeof(slice_ptr_int_type);
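    /* Worked example (64-bit build): for a buffer whose start address is 5 bytes past a boundary,
     * alignment_offset = (8 - 5) % 8 = 3, so three single-byte steps bring the pointer onto an
     * 8-byte boundary; an already-aligned pointer gives (8 - 0) % 8 = 0. */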
    /* For every byte we need to burn off, just do them a byte at a time.
     * Increment the temp pointer by one byte at a time until we get it on an alignment boundary. */
    while (alignment_offset != 0 && length_to_process) {
        uint8_t *byte_pos = (uint8_t *)temp;
        crc = (uint32_t)_mm_crc32_u8(crc, *byte_pos++);
        temp = (slice_ptr_type)byte_pos;

        --alignment_offset;
        --length_to_process;
    }
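    /* If the input is shorter than the alignment gap, length_to_process reaches zero first and the
     * whole buffer has already been consumed byte by byte; slices and remainder below are then 0. */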
    /* Now whatever is left is properly aligned on a boundary. */
    uint32_t slices = length_to_process / sizeof(temp);
    uint32_t remainder = length_to_process % sizeof(temp);
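    /* Note: sizeof(temp) is the size of the pointer itself, which on both supported targets matches
     * the slice width (8 on _M_X64, 4 on a 32-bit build). Example: 26 bytes remaining on a 64-bit
     * build yields 3 eight-byte slices and a 2-byte remainder. */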
    while (slices--) {
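        /* Each iteration folds one machine-word slice into the running CRC. _mm_crc32_u64 takes and
         * returns unsigned __int64, and the upper 32 bits of its result are always zero, so the
         * narrowing cast back to uint32_t loses nothing. */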
#if defined(_M_X64)
        crc = (uint32_t)_mm_crc32_u64(crc, *temp++);
#else
        crc = _mm_crc32_u32(crc, *temp++);
#endif
    }
    /* Process the remaining parts that can't be done on the slice size. */
    uint8_t *remainderPos = (uint8_t *)temp;

    while (remainder--) {
        crc = (uint32_t)_mm_crc32_u8(crc, *remainderPos++);
    }
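    /* The final bit inversion undoes the ~previousCrc32 applied at entry; together they implement
     * the standard reflected CRC32-C input/output convention and let callers chain partial results. */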
    return ~crc;
}
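
/* Usage sketch (assumed calling convention, not part of this source file): start with a previous
 * CRC of 0 and feed buffers in sequence, passing each return value back in as previousCrc32:
 *
 *     uint32_t crc = 0;
 *     crc = aws_checksums_crc32c_hw(buf1, len1, crc);
 *     crc = aws_checksums_crc32c_hw(buf2, len2, crc);
 */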