in cas_object/src/byte_grouping/bg4_prediction.rs [94:188]
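/// Streams `data` through `calc_u128_popcnt` (which is expected to return the population count of
/// each byte of its u128 argument, one count per byte lane) and folds the results into the
/// per-position histograms via `apply_perbyte_popcounts`. `offset` is the position of `data[0]`
/// in the enclosing stream; as noted below, it only matters modulo 4.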
fn add_data_impl(&mut self, offset: usize, data: &[u8], calc_u128_popcnt: impl Fn(u128) -> u128) {
    if data.is_empty() {
        return;
    }
    let mut ptr = data.as_ptr();
    let mut remaining = data.len();

    // Just copy it in and run it if we have a small amount.
    if remaining <= 16 {
        unsafe {
            let mut buffer = [0u8; 16];
            core::ptr::copy_nonoverlapping(ptr, buffer.as_mut_ptr(), remaining);
            let per_byte_popcnt = calc_u128_popcnt(u128::from_le_bytes(buffer));
            self.apply_perbyte_popcounts(per_byte_popcnt, offset, (0, remaining));
        }
        return;
    }
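
    // For larger inputs, the work is split in three: peel off an unaligned head, stream the
    // aligned body, then zero-pad and process the tail.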
    // How many bytes from the start of `data` do we need to move in order to get to an alignment
    // boundary for aligned reads of u128 values?
    let n_align_bytes = ptr.align_offset(core::mem::align_of::<u128>());

    // It's okay to compute one offset value shared by every u128 value: the offset is only used
    // modulo 4 to put counts into the correct histograms, and each aligned step advances the true
    // offset by 16, a multiple of 4.
    let u128_common_offset = offset + n_align_bytes;
    // Process the first bytes that are possibly unaligned.
    if n_align_bytes != 0 {
        let head_bytes = size_of::<u128>() - n_align_bytes;

        // Copy the first `n_align_bytes` bytes of the data into the end of a zeroed temp buffer,
        // starting at index `head_bytes`, so that each byte's buffer index stays consistent with
        // `u128_common_offset` modulo 4.
        let mut buffer = [0u8; 16];
        unsafe {
            core::ptr::copy_nonoverlapping(ptr, buffer.as_mut_ptr().add(head_bytes), n_align_bytes);
            let per_byte_popcnt = calc_u128_popcnt(u128::from_le_bytes(buffer));
            self.apply_perbyte_popcounts(per_byte_popcnt, u128_common_offset, (head_bytes, 16));
            ptr = ptr.add(n_align_bytes);
        }
        remaining -= n_align_bytes;
    }
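
    // Worked example with hypothetical values, assuming `apply_perbyte_popcounts` groups buffer
    // index `i` by `(offset_arg + i) % 4`: if `ptr % 16 == 13`, then `n_align_bytes == 3` and
    // `head_bytes == 13`, so `data[0..3]` lands in `buffer[13..16]`. Buffer index 13 then maps to
    // group `(u128_common_offset + 13) % 4 == (offset + 16) % 4 == offset % 4`, exactly where the
    // absolute position of `data[0]` belongs.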
    // Body: aligned reads, several at once. A block of 4 seems to benchmark the fastest.
    const BLOCK_SIZE: usize = 4;

    while remaining >= BLOCK_SIZE * 16 {
        unsafe {
            // Force the compiler to first perform an aligned read by casting to u128, then handle
            // the endianness just for consistency. The latter part should be a no-op on
            // little-endian machines.
            let raw_input = *(ptr as *const [u128; BLOCK_SIZE]);

            let mut popcnt_v = [0u128; BLOCK_SIZE];

            // We can add the counts directly here, as long as 9 * BLOCK_SIZE < 256 so each byte's
            // count doesn't overflow into the next byte over.
            for i in 0..raw_input.len() {
                // Ensure we're handling endianness correctly. The endian-switching calls should
                // optimize out on little-endian machines.
                *popcnt_v.get_unchecked_mut(i) =
                    calc_u128_popcnt(u128::from_le_bytes(raw_input.get_unchecked(i).to_ne_bytes()));
            }

            // Now fold each per-byte popcount into the aggregated histograms.
            for i in 0..raw_input.len() {
                self.apply_perbyte_popcounts(*popcnt_v.get_unchecked(i), u128_common_offset, (0, 16));
            }

            ptr = ptr.add(BLOCK_SIZE * 16);
            remaining -= BLOCK_SIZE * 16;
        }
    }
    // Body remainder: aligned reads, one u128 at a time.
    while remaining >= 16 {
        unsafe {
            // Force the compiler to first perform an aligned read by casting to u128, then handle
            // the endianness just for consistency. The latter part should be a no-op on
            // little-endian machines.
            let raw_input = *(ptr as *const u128);
            let per_byte_popcnt = calc_u128_popcnt(u128::from_le_bytes(raw_input.to_ne_bytes()));
            self.apply_perbyte_popcounts(per_byte_popcnt, u128_common_offset, (0, 16));
            ptr = ptr.add(16);
            remaining -= 16;
        }
    }
    // Tail: copy the final bytes into a zero-padded buffer.
    if remaining > 0 {
        unsafe {
            let mut buffer = [0u8; 16];
            core::ptr::copy_nonoverlapping(ptr, buffer.as_mut_ptr(), remaining);
            let per_byte_popcnt = calc_u128_popcnt(u128::from_le_bytes(buffer));
            self.apply_perbyte_popcounts(per_byte_popcnt, u128_common_offset, (0, remaining));
        }
    }
}
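
// For reference, one portable way to implement the `calc_u128_popcnt` argument is a SWAR
// (SIMD-within-a-register) popcount that keeps each byte's count in its own lane. This is a sketch
// of a hypothetical helper, not something defined in this file; callers may pass a
// hardware-accelerated version with the same `Fn(u128) -> u128` shape instead.
fn scalar_per_byte_popcnt(x: u128) -> u128 {
    const M1: u128 = 0x5555_5555_5555_5555_5555_5555_5555_5555; // 0b01010101 repeated
    const M2: u128 = 0x3333_3333_3333_3333_3333_3333_3333_3333; // 0b00110011 repeated
    const M4: u128 = 0x0f0f_0f0f_0f0f_0f0f_0f0f_0f0f_0f0f_0f0f; // 0b00001111 repeated

    // Sum bits within each 2-bit pair, then each nibble, then each byte; the masks keep every
    // partial sum inside its own byte, so the result holds 16 independent counts in 0..=8.
    let x = x - ((x >> 1) & M1);
    let x = (x & M2) + ((x >> 2) & M2);
    (x + (x >> 4)) & M4
}
// Usage would then look like `self.add_data_impl(offset, data, scalar_per_byte_popcnt)`.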