fn add_data_impl()

in cas_object/src/byte_grouping/bg4_prediction.rs [94:188]


    /// Runs `data` through `calc_u128_popcnt` sixteen bytes at a time and hands every result to
    /// `self.apply_perbyte_popcounts`.
    ///
    /// * `offset` - offset associated with `data[0]`; it is forwarded (possibly shifted by a value that keeps
    ///   it congruent modulo 4) to `apply_perbyte_popcounts`.  NOTE(review): this relies on that helper using
    ///   the offset only modulo 4 to choose a histogram -- confirm against its definition.
    /// * `data` - the bytes to process; an empty slice is a no-op.
    /// * `calc_u128_popcnt` - maps a `u128` assembled from 16 little-endian input bytes to the value passed on
    ///   to `apply_perbyte_popcounts`.
    ///
    /// The bulk of the input is read with aligned `u128` loads.  Bytes before the first aligned address and
    /// bytes after the last full word go through a zero-padded 16-byte scratch buffer, and the byte range
    /// passed to `apply_perbyte_popcounts` marks which buffer positions hold real data.
    fn add_data_impl(&mut self, offset: usize, data: &[u8], calc_u128_popcnt: impl Fn(u128) -> u128) {
        // Number of input bytes covered by a single u128.
        const WORD_BYTES: usize = core::mem::size_of::<u128>();

        if data.is_empty() {
            return;
        }

        // Just copy it in and run it if we have a small amount.
        if data.len() <= WORD_BYTES {
            let mut buffer = [0u8; WORD_BYTES];
            buffer[..data.len()].copy_from_slice(data);
            let per_byte_popcnt = calc_u128_popcnt(u128::from_le_bytes(buffer));
            self.apply_perbyte_popcounts(per_byte_popcnt, offset, (0, data.len()));
            return;
        }

        // How many bytes from the start of data do we need to move in order to get to an alignment boundary
        // for aligned reads of u128 values?
        let n_align_bytes = data.as_ptr().align_offset(core::mem::align_of::<u128>());

        // `align_offset` is allowed to return `usize::MAX` ("cannot align"), and only performance -- never
        // correctness -- may depend on its answer.  Any value that is not a valid in-word offset therefore
        // takes a slower path that needs no alignment at all: every 16-byte chunk goes through the scratch
        // buffer.  Each chunk starts a multiple of 16 bytes after `data[0]`, so `offset` itself stays
        // congruent modulo 4 for all of them.
        if n_align_bytes >= WORD_BYTES {
            for chunk in data.chunks(WORD_BYTES) {
                let mut buffer = [0u8; WORD_BYTES];
                buffer[..chunk.len()].copy_from_slice(chunk);
                let per_byte_popcnt = calc_u128_popcnt(u128::from_le_bytes(buffer));
                self.apply_perbyte_popcounts(per_byte_popcnt, offset, (0, chunk.len()));
            }
            return;
        }

        // Okay to compute one offset value for each u128 value, as it's just used
        // modulo 4 to put things in the correct histograms.
        let u128_common_offset = offset + n_align_bytes;

        // `n_align_bytes < WORD_BYTES < data.len()` at this point, so the split cannot panic.
        let (head, mut rest) = data.split_at(n_align_bytes);

        // Process the first bytes that are possibly unaligned.
        if !head.is_empty() {
            let head_bytes = WORD_BYTES - n_align_bytes;

            // Place the leading bytes at the *end* of the scratch buffer.  Buffer position
            // `head_bytes + j` then holds `data[j]`, and since `n_align_bytes + head_bytes == 16` the
            // common offset plus the buffer position is congruent to `offset + j` modulo 4.
            let mut buffer = [0u8; WORD_BYTES];
            buffer[head_bytes..].copy_from_slice(head);

            let per_byte_popcnt = calc_u128_popcnt(u128::from_le_bytes(buffer));
            self.apply_perbyte_popcounts(per_byte_popcnt, u128_common_offset, (head_bytes, WORD_BYTES));
        }

        // Body: aligned reads, several at once.  4 seems to benchmark the fastest.
        const BLOCK_SIZE: usize = 4;
        const BLOCK_BYTES: usize = BLOCK_SIZE * WORD_BYTES;
        while rest.len() >= BLOCK_BYTES {
            // Force the compiler to perform aligned reads by loading through a u128 array pointer.
            //
            // SAFETY: `rest` holds at least `BLOCK_BYTES` readable bytes (loop condition).  Its start is
            // `data.as_ptr() + n_align_bytes`, which `align_offset` reported as aligned for u128, advanced
            // only by multiples of 16 bytes, so it is still suitably aligned for `[u128; BLOCK_SIZE]`.
            let raw_input = unsafe { *(rest.as_ptr() as *const [u128; BLOCK_SIZE]) };

            // First compute all the per-word results, kept separate per word ...
            let mut popcnt_v = [0u128; BLOCK_SIZE];
            for (popcnt, raw) in popcnt_v.iter_mut().zip(raw_input.iter()) {
                // Ensure we're handling endianness correctly.  Should optimize out endian switching calls
                // on little-endian machines.
                *popcnt = calc_u128_popcnt(u128::from_le_bytes(raw.to_ne_bytes()));
            }

            // ... then translate them out to the aggregated state, in input order.
            for popcnt in popcnt_v.iter().copied() {
                self.apply_perbyte_popcounts(popcnt, u128_common_offset, (0, WORD_BYTES));
            }

            rest = &rest[BLOCK_BYTES..];
        }

        // Body: aligned reads, one word at a time, for what the blocked loop left over.
        while rest.len() >= WORD_BYTES {
            // SAFETY: `rest` holds at least `WORD_BYTES` readable bytes (loop condition) and its start is
            // u128-aligned for the same reason as in the blocked loop above.
            let raw_input = unsafe { *(rest.as_ptr() as *const u128) };

            // The endianness conversion should be a no-op on little-endian machines.
            let per_byte_popcnt = calc_u128_popcnt(u128::from_le_bytes(raw_input.to_ne_bytes()));
            self.apply_perbyte_popcounts(per_byte_popcnt, u128_common_offset, (0, WORD_BYTES));

            rest = &rest[WORD_BYTES..];
        }

        // Tail: copy final bytes into a zero-padded buffer
        if !rest.is_empty() {
            let mut buffer = [0u8; WORD_BYTES];
            buffer[..rest.len()].copy_from_slice(rest);
            let per_byte_popcnt = calc_u128_popcnt(u128::from_le_bytes(buffer));
            self.apply_perbyte_popcounts(per_byte_popcnt, u128_common_offset, (0, rest.len()));
        }
    }