// deduplication/src/chunking.rs
use std::cmp::min;
use std::io::{Read, Seek, SeekFrom};
use bytes::Bytes;
use crate::constants::{MAXIMUM_CHUNK_MULTIPLIER, MINIMUM_CHUNK_DIVISOR, TARGET_CHUNK_SIZE};
use crate::Chunk;
/// Chunk generator over an input stream. Do not use directly;
/// use `chunk_target_default` instead.
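///
/// A minimal streaming sketch (illustrative only; `input_blocks` is a
/// hypothetical iterator of `&[u8]` blocks):
///
/// ```ignore
/// let mut chunker = Chunker::default();
/// let mut chunks = Vec::new();
/// for block in input_blocks {
///     chunks.extend(chunker.next_block(block, false));
/// }
/// if let Some(last) = chunker.finish() {
///     chunks.push(last);
/// }
/// ```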
pub struct Chunker {
// configs
hash: gearhash::Hasher<'static>,
minimum_chunk: usize,
maximum_chunk: usize,
mask: u64,
// generator state
chunkbuf: Vec<u8>,
}
impl Default for Chunker {
fn default() -> Self {
Self::new(*TARGET_CHUNK_SIZE)
}
}
impl Chunker {
pub fn new(target_chunk_size: usize) -> Self {
assert_eq!(target_chunk_size.count_ones(), 1);
// Some of the logic only works if the target_chunk_size is greater than the
// window size of the hash.
assert!(target_chunk_size > 64);
// Note the strict less-than: combined with count_ones() == 1 above,
// this limits target_chunk_size to at most 2^31.
assert!(target_chunk_size < u32::MAX as usize);
let mask = (target_chunk_size - 1) as u64;
// We shift the mask left because the low-order bits of the gear hash are
// influenced by only the last few bytes of the window; the high-order bits
// depend on the full window, so we shift the mask all the way left.
let mask = mask << mask.leading_zeros();
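// Worked example: for target_chunk_size = 65536 the mask starts as 0xFFFF
// (16 one bits); leading_zeros() is 48, so the final mask is
// 0xFFFF_0000_0000_0000. The boundary test then depends only on the top 16
// bits of the hash and fires, on average, once every 65536 bytes.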
let minimum_chunk = target_chunk_size / *MINIMUM_CHUNK_DIVISOR;
let maximum_chunk = target_chunk_size * *MAXIMUM_CHUNK_MULTIPLIER;
assert!(maximum_chunk > minimum_chunk);
let hash = gearhash::Hasher::default();
Chunker {
hash,
minimum_chunk,
maximum_chunk,
mask,
// generator state init
chunkbuf: Vec::with_capacity(maximum_chunk),
}
}
/// Create a chunker with a custom minimum chunk size.
/// Only used by the partitioner, which has special requirements.
fn new_with_min(target_chunk_size: usize, min_chunk_size: usize) -> Self {
let mut chunker = Self::new(target_chunk_size);
chunker.minimum_chunk = min_chunk_size;
chunker
}
/// Looks for the next chunk boundary in the data, treating any buffered state
/// as logically preceding `data`. If a boundary cannot be found in the
/// currently available data, then None is returned.
#[inline]
pub fn next_boundary(&mut self, data: &[u8]) -> Option<usize> {
const HASH_WINDOW_SIZE: usize = 64;
let n_bytes = data.len();
if n_bytes == 0 {
return None;
}
let mut cur_chunk_len = self.chunkbuf.len();
let mut consume_len = 0;
let mut create_chunk = false;
// Skip ahead to near the minimum chunk size. Since the hash has a window
// size of 64, we can only safely skip minimum_chunk - 64 - 1 bytes before
// we must start hashing.
if cur_chunk_len + HASH_WINDOW_SIZE < self.minimum_chunk {
let max_advance = min(self.minimum_chunk - cur_chunk_len - HASH_WINDOW_SIZE - 1, n_bytes);
consume_len += max_advance;
cur_chunk_len += max_advance;
}
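// Worked example: with an empty buffer and minimum_chunk = 8192, we advance
// 8192 - 64 - 1 = 8127 bytes before hashing, so the earliest boundary the
// hash can report is at offset 8128 = minimum_chunk - 64.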
// If we have a lot of data, don't read all the way to the end when we'll stop reading
// at the maximum chunk boundary.
let read_end = n_bytes.min(consume_len + self.maximum_chunk - cur_chunk_len);
let mut bytes_to_next_boundary;
if let Some(boundary) = self.hash.next_match(&data[consume_len..read_end], self.mask) {
// If we trigger a boundary before the end, create a chunk.
bytes_to_next_boundary = boundary;
create_chunk = true;
} else {
bytes_to_next_boundary = read_end - consume_len;
}
// if we hit maximum chunk we must create a chunk
if bytes_to_next_boundary + cur_chunk_len >= self.maximum_chunk {
bytes_to_next_boundary = self.maximum_chunk - cur_chunk_len;
create_chunk = true;
}
consume_len += bytes_to_next_boundary;
if create_chunk {
self.hash.set_hash(0); // Reset for the next time.
Some(consume_len)
} else {
None
}
}
/// Process more data; this is a continuation of any data passed in on previous calls.
///
/// Returns the next chunk, if available, and the amount of data that was digested.
///
/// If is_final is true, then it is assumed that no more data after this block will come,
/// and any data currently present and at the end will be put into a final chunk.
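///
/// A minimal driving loop (illustrative; `next_block` below does exactly this):
///
/// ```ignore
/// let mut pos = 0;
/// while pos < data.len() {
///     let (maybe_chunk, consumed) = chunker.next(&data[pos..], true);
///     if let Some(chunk) = maybe_chunk {
///         // ... process chunk ...
///     }
///     pos += consumed;
/// }
/// ```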
pub fn next(&mut self, data: &[u8], is_final: bool) -> (Option<Chunk>, usize) {
let (data, consume): (Bytes, usize) = {
if let Some(next_boundary) = self.next_boundary(data) {
if self.chunkbuf.is_empty() {
(Bytes::copy_from_slice(&data[..next_boundary]), next_boundary)
} else {
self.chunkbuf.extend_from_slice(&data[..next_boundary]);
(std::mem::take(&mut self.chunkbuf).into(), next_boundary)
}
} else if is_final {
// Put the rest of the data in the chunkbuf.
if self.chunkbuf.is_empty() {
(Bytes::copy_from_slice(data), data.len())
} else {
self.chunkbuf.extend_from_slice(data);
(std::mem::take(&mut self.chunkbuf).into(), data.len())
}
} else {
self.chunkbuf.extend_from_slice(data);
return (None, data.len());
}
};
// Edge case: finalizing with nothing buffered yields no chunk.
if data.is_empty() {
return (None, 0);
}
(Some(Chunk::new(data)), consume)
}
/// Keeps chunking until no more chunks can be reliably produced, returning a
/// vector of the resulting chunks.
pub fn next_block(&mut self, data: &[u8], is_final: bool) -> Vec<Chunk> {
let mut ret = Vec::new();
let mut pos = 0;
loop {
debug_assert!(pos <= data.len());
if pos == data.len() {
return ret;
}
let (maybe_chunk, bytes_consumed) = self.next(&data[pos..], is_final);
if let Some(chunk) = maybe_chunk {
ret.push(chunk);
}
pos += bytes_consumed;
}
}
/// Keeps chunking until no more chunks can be reliably produced, returning a
/// vector of the resulting chunks.
///
/// The data is inserted here as a Bytes object, which means that no copying of the data
/// is performed except at the boundaries. The resulting chunks then end up each holding
/// a reference to the original data object, which will not be deallocated until all the
/// original bytes are gone.
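///
/// Zero-copy sketch (illustrative; each chunk holds a reference-counted
/// slice of `data` rather than a copy):
///
/// ```ignore
/// let data = Bytes::from(vec![0u8; 1 << 20]);
/// let mut chunker = Chunker::default();
/// let chunks = chunker.next_block_bytes(&data, true);
/// ```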
pub fn next_block_bytes(&mut self, data: &Bytes, is_final: bool) -> Vec<Chunk> {
let mut ret = Vec::new();
let mut pos = 0;
// If there is leftover buffered data, perform the first cut using the
// copying `next` path so the buffered prefix is included in the chunk.
if !self.chunkbuf.is_empty() {
let (maybe_chunk, skip_idx) = self.next(data, is_final);
if let Some(chunk) = maybe_chunk {
ret.push(chunk);
}
pos = skip_idx;
}
while pos < data.len() {
let maybe_next_boundary = self.next_boundary(&data[pos..]);
if let Some(chunk_size) = maybe_next_boundary {
let next_pos = pos + chunk_size;
ret.push(Chunk::new(data.slice(pos..next_pos)));
pos = next_pos;
} else {
// No more chunks in this block.
if is_final {
ret.push(Chunk::new(data.slice(pos..)));
} else {
self.chunkbuf.extend_from_slice(&data[pos..]);
}
break;
}
}
ret
}
/// Finishes chunking, returning the final chunk if one remains.
pub fn finish(&mut self) -> Option<Chunk> {
self.next(&[], true).0
}
}
/// Find valid partition points in a file where we can
/// chunk in parallel. Returns the start points of each partition
/// (i.e. file offset 0 is always the first entry, and `file_size`
/// is never in the result).
/// Note that reader position is modified and not restored.
///
/// partition_scan_bytes is the number of bytes to scan at each
/// proposed partition boundary in search of a valid chunk.
///
/// Due to a known issue in how we do chunking, note that these
/// partitions are not 100% guaranteed to align. See the
/// parallel_chunking.pdf for details.
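///
/// Illustrative sketch (hypothetical sizes; see the tests for a working call):
///
/// ```ignore
/// let mut reader = std::io::Cursor::new(&file_bytes);
/// let starts = find_partitions(&mut reader, file_bytes.len(), 65536, 1 << 20, 1 << 16)?;
/// // Each partition [starts[i], starts[i + 1]) can be chunked in parallel.
/// ```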
pub fn find_partitions<R: Read + Seek>(
reader: &mut R,
file_size: usize,
target_chunk_size: usize,
min_partition_size: usize,
partition_scan_bytes: usize,
) -> std::io::Result<Vec<usize>> {
assert!(min_partition_size > 0);
let mut partitions: Vec<usize> = Vec::new();
partitions.push(0);
// The minimum chunk must be larger than the hash window size. Given how the
// chunker works, the effective minimum chunk size can be as small as
// minimum_chunk - 64.
let minimum_chunk = target_chunk_size / *MINIMUM_CHUNK_DIVISOR;
let maximum_chunk = target_chunk_size * *MAXIMUM_CHUNK_MULTIPLIER;
assert!(minimum_chunk > 64);
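// If a single maximum-size chunk could span a whole minimum-size partition,
// parallel chunking gains little, so return just the trivial partition.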
if maximum_chunk >= min_partition_size {
return Ok(partitions);
}
let mut buf = vec![0u8; partition_scan_bytes];
let mut curpos: usize = 0;
// We jump curpos forward by min_partition_size, read partition_scan_bytes
// bytes, and try to find a valid partition boundary within them.
//
// We also require at least min_partition_size bytes remaining past curpos
// so that we do not create a tiny final partition.
while curpos < file_size {
curpos += min_partition_size;
// there are not enough bytes to make a full partition
// or not enough bytes to scan for a partition
if curpos + min_partition_size >= file_size || curpos + partition_scan_bytes >= file_size {
break;
}
// read and chunk the scan bytes
reader.seek(SeekFrom::Start(curpos as u64))?;
reader.read_exact(&mut buf)?;
let mut chunker = Chunker::new_with_min(target_chunk_size, 0);
// TODO: there is a definite optimization here
// as we really only need the chunk lengths and not the data
let chunks = chunker.next_block(&buf, false);
// The validity check below inspects adjacent chunk pairs starting at index
// 2, so fewer than 3 chunks can never yield a partition (and indexing
// chunks[1] below would panic on a single chunk).
if chunks.len() < 3 {
continue;
}
// Skip past the first two chunks; the first boundary found in a scan window
// is not reliable.
let mut offset = chunks[0].data.len() + chunks[1].data.len();
for i in 2..chunks.len() {
let cprev = chunks[i - 1].data.len();
let c = chunks[i].data.len();
offset += chunks[i].data.len();
if cprev > minimum_chunk
&& cprev < maximum_chunk - minimum_chunk
&& c > minimum_chunk
&& c < maximum_chunk - minimum_chunk
{
// we have a valid partition at this position
partitions.push(curpos + offset);
break;
}
}
}
Ok(partitions)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use std::io::Cursor;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use super::*;
/// A helper to create random test data using a specified `seed` and `len`.
/// Using a fixed seed ensures tests are reproducible.
fn make_test_data(seed: u64, len: usize) -> Vec<u8> {
let mut rng = StdRng::seed_from_u64(seed);
let mut data = vec![0; len];
rng.fill(&mut data[..]);
data
}
fn check_chunks_equal(chunks: &[Chunk], data: &[u8]) {
// Validate all the chunks are exact.
let mut new_vec = Vec::with_capacity(10000);
for c in chunks.iter() {
new_vec.extend_from_slice(&c.data[..]);
}
assert!(new_vec == data);
}
// A wrapper around two Chunker instances that exposes next_block while
// internally running both next_block and next_block_bytes and verifying
// that their outputs are identical.
#[derive(Default)]
struct ChunkerTestWrapper {
chunker_chunks: Chunker,
chunker_bytes: Chunker,
}
impl ChunkerTestWrapper {
fn new(target_chunk_size: usize) -> Self {
ChunkerTestWrapper {
chunker_chunks: Chunker::new(target_chunk_size),
chunker_bytes: Chunker::new(target_chunk_size),
}
}
fn next_block(&mut self, data: &[u8], is_final: bool) -> Vec<Chunk> {
let chunks = self.chunker_chunks.next_block(data, is_final);
let bytes_chunks = self.chunker_bytes.next_block_bytes(&Bytes::copy_from_slice(data), is_final);
// Check that the two match.
assert_eq!(chunks.len(), bytes_chunks.len());
for (c1, c2) in chunks.iter().zip(bytes_chunks.iter()) {
assert_eq!(c1.data, c2.data);
}
chunks
}
fn next_chunk(&mut self, data: &[u8], is_final: bool) -> (Option<Chunk>, usize) {
let (chunk, consumed) = self.chunker_chunks.next(data, is_final);
let (bytes_chunk, bytes_consumed) = self.chunker_bytes.next(&Bytes::copy_from_slice(data), is_final);
// Check that the two match.
if let Some(c) = &chunk {
assert_eq!(c.data, bytes_chunk.unwrap().data);
} else {
assert!(bytes_chunk.is_none());
}
assert_eq!(consumed, bytes_consumed);
(chunk, consumed)
}
}
#[test]
fn test_empty_data_no_chunk_until_final() {
let mut chunker = ChunkerTestWrapper::new(128);
// Passing empty slice without final => no chunk
let (chunk, consumed) = chunker.next_chunk(&[], false);
assert!(chunk.is_none());
assert_eq!(consumed, 0);
// Passing empty slice again with is_final = true => no leftover data, so no chunk
let (chunk, consumed) = chunker.next_chunk(&[], true);
assert!(chunk.is_none());
assert_eq!(consumed, 0);
}
#[test]
fn test_data_smaller_than_minimum_no_boundary() {
let mut chunker = ChunkerTestWrapper::new(128);
// Create a small random data buffer: 63 bytes, less than the 64-byte hash window.
let data = make_test_data(0, 63);
// We expect no chunk until we finalize, because there's not enough data
// to trigger a boundary, nor to reach the maximum chunk size.
let (chunk, consumed) = chunker.next_chunk(&data, false);
assert!(chunk.is_none());
assert_eq!(consumed, data.len());
// Now finalize: we expect a chunk with the leftover data
let (chunk, consumed) = chunker.next_chunk(&[], true);
assert!(chunk.is_some());
assert_eq!(consumed, 0);
let chunk = chunk.unwrap();
assert_eq!(chunk.data.len(), 63);
assert_eq!(&chunk.data[..], &data[..], "Chunk should contain exactly what was passed in");
}
#[test]
fn test_multiple_chunks_produced() {
let mut chunker = ChunkerTestWrapper::new(128);
// Produce 10000 bytes of random data
let data = make_test_data(42, 10000);
// Pass everything at once, final = true
let chunks = chunker.next_block(&data, true);
assert!(!chunks.is_empty());
check_chunks_equal(&chunks, &data);
}
#[test]
fn test_repeated_calls_partial_consumption() {
// We'll feed the data in many small, irregular pieces to exercise partial consumption
let data = make_test_data(42, 10000);
let mut chunks_1 = Vec::new();
let mut pos = 0;
let mut chunker = ChunkerTestWrapper::new(128);
while pos < data.len() {
for i in 0..16 {
let next_pos = (pos + i).min(data.len());
chunks_1.append(&mut chunker.next_block(&data[pos..next_pos], next_pos == data.len()));
pos = next_pos;
}
}
check_chunks_equal(&chunks_1, &data);
// Now, rechunk with all at once and make sure it's equal.
let chunks_2 = ChunkerTestWrapper::new(128).next_block(&data, true);
assert_eq!(chunks_1, chunks_2);
}
#[test]
fn test_exact_maximum_chunk() {
// If the data hits the maximum chunk size exactly, we should force a boundary.
// For target_chunk_size = 512, the maximum chunk size is
// MAXIMUM_CHUNK_MULTIPLIER * 512, so this constant data should split into
// exactly 8 maximum-size chunks.
let mut chunker = ChunkerTestWrapper::new(512);
// Use constant data
let data = vec![0; 8 * *MAXIMUM_CHUNK_MULTIPLIER * 512];
let chunks = chunker.next_block(&data, true);
assert_eq!(chunks.len(), 8);
for c in chunks.iter() {
assert_eq!(c.data.len(), *MAXIMUM_CHUNK_MULTIPLIER * 512);
}
}
#[test]
fn test_partition() {
for _i in 1..5 {
let data = make_test_data(42, 1000000);
let mut chunker = Chunker::new(1024);
let chunks = chunker.next_block(&data, true);
let mut chunk_offsets = HashSet::new();
let mut offset = 0;
eprintln!("{:?}", chunker.minimum_chunk);
for i in 0..chunks.len() {
chunk_offsets.insert(offset);
offset += chunks[i].data.len();
}
let partitions =
find_partitions(&mut Cursor::new(&mut data.as_slice()), data.len(), 1024, 100000, 10000).unwrap();
assert!(partitions.len() > 1);
for i in 0..partitions.len() {
assert!(chunk_offsets.contains(&partitions[i]));
}
}
}
/// Simple SplitMix64-based deterministic random number generator.
/// Portable to C, Python, etc. (see https://prng.di.unimi.it/splitmix64.c)
fn splitmix64_next(state: &mut u64) -> u64 {
*state = state.wrapping_add(0x9E3779B97F4A7C15);
let mut z = *state;
z = (z ^ (z >> 30)).wrapping_mul(0xBF58476D1CE4E5B9);
z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB);
z ^ (z >> 31)
}
fn create_random_data(n: usize, seed: u64) -> Vec<u8> {
// These tests need to be reproducible across different environments, so the
// reference tables below are generated with a simple SplitMix64 rng that can
// be ported as-is without other dependencies.
let mut ret = Vec::with_capacity(n + 7);
let mut state = seed;
while ret.len() < n {
let next_u64 = splitmix64_next(&mut state);
ret.extend_from_slice(&next_u64.to_le_bytes());
}
// May have extra bytes on the end since we append in blocks of 8; trim to n.
ret.resize(n, 0);
ret
}
fn get_chunk_boundaries(chunks: &[Chunk]) -> Vec<usize> {
chunks
.iter()
.scan(0, |state, chunk| {
*state += chunk.data.len();
Some(*state)
})
.collect()
}
#[test]
fn test_chunk_boundaries() {
let data = create_random_data(256000, 1);
// Now, run the chunks through the default chunker.
let chunks = ChunkerTestWrapper::default().next_block(&data, true);
// Get the boundaries indices as determined by the size of the chunks above.
let ref_chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);
// Test that the boundaries are identical across different feed block sizes.
for add_size in [1, 37, 255] {
let mut chunker = Chunker::default();
// Add repeatedly in blocks of add_size, appending to alt_chunks
let mut alt_chunks = Vec::with_capacity(chunks.len());
let mut pos = 0;
while pos < data.len() {
let next_pos = (pos + add_size).min(data.len());
let next_chunk = chunker.next_block(&data[pos..next_pos], next_pos == data.len());
alt_chunks.extend(next_chunk);
pos = next_pos;
}
let alt_boundaries = get_chunk_boundaries(&alt_chunks);
assert_eq!(alt_boundaries, ref_chunk_boundaries);
}
}
#[test]
fn test_correctness_1mb_random_data() {
// Test this data.
let data = create_random_data(1000000, 0);
// Uncomment these to create the lines below:
// eprintln!("(data[0], {});", data[0] as usize);
// eprintln!("(data[127], {});", data[127] as usize);
// eprintln!("(data[111111], {});", data[111111] as usize);
assert_eq!(data[0], 175);
assert_eq!(data[127], 132);
assert_eq!(data[111111], 118);
// Now, run the chunks through the default chunker.
let chunks = ChunkerTestWrapper::default().next_block(&data, true);
// Get the boundaries indices as determined by the size of the chunks above.
let chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);
// Uncomment this to create the line below.
// eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})");
assert_eq!(
chunk_boundaries,
vec![
84493, 134421, 144853, 243318, 271793, 336457, 467529, 494581, 582000, 596735, 616815, 653164, 678202,
724510, 815591, 827760, 958832, 991092, 1000000
]
);
}
#[test]
fn test_correctness_1mb_const_data() {
// Test this data.
let data = vec![59u8; 1000000];
// Now, run the chunks through the default chunker.
let chunks = ChunkerTestWrapper::default().next_block(&data, true);
// Get the boundaries indices as determined by the size of the chunks above.
let chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);
// Uncomment this to create the line below.
// eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})");
assert_eq!(chunk_boundaries, vec![131072, 262144, 393216, 524288, 655360, 786432, 917504, 1000000])
}
fn get_triggering_base_data(n: usize, padding: usize) -> Vec<u8> {
// This pattern is known to trigger the boundary detection in the chunker, so repeat it to test the
// correctness of the minimum chunk size processing.
let mut data = vec![
154, 52, 42, 34, 159, 75, 126, 224, 70, 236, 12, 196, 79, 236, 178, 124, 127, 50, 99, 178, 44, 176, 174,
126, 250, 235, 205, 174, 252, 122, 35, 10, 20, 101, 214, 69, 193, 8, 115, 105, 158, 228, 120, 111, 136,
162, 198, 251, 211, 183, 253, 252, 164, 147, 63, 16, 186, 162, 117, 23, 170, 36, 205, 187, 174, 76, 210,
174, 211, 175, 12, 173, 145, 59, 2, 70, 222, 181, 159, 227, 182, 156, 189, 51, 226, 106, 24, 50, 183, 157,
140, 10, 8, 23, 212, 70, 10, 234, 23, 33, 219, 254, 39, 236, 70, 49, 191, 116, 9, 115, 15, 101, 26, 159,
165, 220, 15, 170, 56, 125, 92, 163, 94, 235, 38, 40, 49, 81,
];
// Add padding so we can comprehensively test the nuances of boundaries.
data.resize(data.len() + padding, 0u8);
// Repeat the above pattern until we've filled out n bytes.
while data.len() < n {
let n_take = (n - data.len()).min(data.len());
data.extend_from_within(0..n_take);
}
data
}
#[test]
fn test_correctness_100kb_hitting_data() {
// To exercise all the nuances of minimum chunk boundary handling (with the
// correct chunks as well), run through every padding option and check each
// one. This gives a pattern that triggers a boundary once per repetition,
// with a varying number of bytes between triggers.
let mut data_sample_at_11111 = [0u8; 128];
let mut ref_cb = vec![Vec::new(); 128];
data_sample_at_11111[0] = 236;
ref_cb[0] = vec![8256, 16448, 24640, 32832, 41024, 49216, 57408, 65536];
data_sample_at_11111[1] = 50;
ref_cb[1] = vec![8191, 16447, 24703, 32959, 41215, 49471, 57727, 65536];
data_sample_at_11111[2] = 36;
ref_cb[2] = vec![8254, 16574, 24894, 33214, 41534, 49854, 58174, 65536];
data_sample_at_11111[3] = 116;
ref_cb[3] = vec![8317, 16570, 24823, 33076, 41329, 49582, 57835, 65536];
data_sample_at_11111[4] = 126;
ref_cb[4] = vec![8248, 16564, 24880, 33196, 41512, 49828, 58144, 65536];
data_sample_at_11111[5] = 145;
ref_cb[5] = vec![8310, 16556, 24802, 33048, 41294, 49540, 57786, 65536];
data_sample_at_11111[6] = 235;
ref_cb[6] = vec![8238, 16546, 24854, 33162, 41470, 49778, 58086, 65536];
data_sample_at_11111[7] = 228;
ref_cb[7] = vec![8299, 16534, 24769, 33004, 41239, 49474, 57709, 65536];
data_sample_at_11111[8] = 70;
ref_cb[8] = vec![8224, 16520, 24816, 33112, 41408, 49704, 58000, 65536];
data_sample_at_11111[9] = 178;
ref_cb[9] = vec![8284, 16504, 24724, 32944, 41164, 49384, 57604, 65536];
data_sample_at_11111[10] = 173;
ref_cb[10] = vec![8206, 16486, 24766, 33046, 41326, 49606, 57886, 65536];
data_sample_at_11111[11] = 0;
ref_cb[11] = vec![8265, 16466, 24667, 32868, 41069, 49270, 57471, 65536];
data_sample_at_11111[12] = 252;
ref_cb[12] = vec![8324, 16452, 24704, 32832, 41084, 49212, 57464, 65536];
data_sample_at_11111[13] = 159;
ref_cb[13] = vec![8242, 16561, 24880, 33199, 41518, 49837, 58156, 65536];
data_sample_at_11111[14] = 69;
ref_cb[14] = vec![8300, 16536, 24772, 33008, 41244, 49480, 57716, 65536];
data_sample_at_11111[15] = 219;
ref_cb[15] = vec![8215, 16509, 24803, 33097, 41391, 49685, 57979, 65536];
data_sample_at_11111[16] = 126;
ref_cb[16] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
data_sample_at_11111[17] = 10;
ref_cb[17] = vec![8329, 16457, 24714, 32842, 41099, 49227, 57484, 65536];
data_sample_at_11111[18] = 124;
ref_cb[18] = vec![8240, 16562, 24884, 33206, 41528, 49850, 58172, 65536];
data_sample_at_11111[19] = 24;
ref_cb[19] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536];
data_sample_at_11111[20] = 196;
ref_cb[20] = vec![8204, 16492, 24780, 33068, 41356, 49644, 57932, 65536];
data_sample_at_11111[21] = 106;
ref_cb[21] = vec![8259, 16454, 24649, 32844, 41039, 49234, 57429, 65536];
data_sample_at_11111[22] = 196;
ref_cb[22] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536];
data_sample_at_11111[23] = 183;
ref_cb[23] = vec![8218, 16523, 24828, 33133, 41438, 49743, 58048, 65536];
data_sample_at_11111[24] = 124;
ref_cb[24] = vec![8128, 16328, 24536, 32744, 40952, 49160, 57368, 65536];
data_sample_at_11111[25] = 70;
ref_cb[25] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536];
data_sample_at_11111[26] = 126;
ref_cb[26] = vec![8226, 16542, 24858, 33174, 41490, 49806, 58122, 65536];
data_sample_at_11111[27] = 191;
ref_cb[27] = vec![8279, 16494, 24709, 32924, 41139, 49354, 57569, 65536];
data_sample_at_11111[28] = 69;
ref_cb[28] = vec![8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536];
data_sample_at_11111[29] = 163;
ref_cb[29] = vec![8128, 16392, 24713, 33034, 41355, 49676, 57997, 65536];
data_sample_at_11111[30] = 252;
ref_cb[30] = vec![8280, 16496, 24712, 32928, 41144, 49360, 57576, 65536];
data_sample_at_11111[31] = 0;
ref_cb[31] = vec![8332, 16600, 24868, 33136, 41404, 49672, 57940, 65536];
data_sample_at_11111[32] = 173;
ref_cb[32] = vec![8224, 16544, 24864, 33184, 41504, 49824, 58144, 65536];
data_sample_at_11111[33] = 42;
ref_cb[33] = vec![8275, 16486, 24697, 32908, 41119, 49330, 57541, 65536];
data_sample_at_11111[34] = 70;
ref_cb[34] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536];
data_sample_at_11111[35] = 174;
ref_cb[35] = vec![8214, 16527, 24840, 33153, 41466, 49779, 58092, 65536];
data_sample_at_11111[36] = 235;
ref_cb[36] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536];
data_sample_at_11111[37] = 186;
ref_cb[37] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536];
data_sample_at_11111[38] = 0;
ref_cb[38] = vec![8198, 16498, 24798, 33098, 41398, 49698, 57998, 65536];
data_sample_at_11111[39] = 157;
ref_cb[39] = vec![8247, 16597, 24947, 33297, 41647, 49997, 58347, 65536];
data_sample_at_11111[40] = 126;
ref_cb[40] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536];
data_sample_at_11111[41] = 49;
ref_cb[41] = vec![8345, 16626, 24907, 33188, 41469, 49750, 58031, 65536];
data_sample_at_11111[42] = 36;
ref_cb[42] = vec![8224, 16554, 24884, 33214, 41544, 49874, 58204, 65536];
data_sample_at_11111[43] = 0;
ref_cb[43] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
data_sample_at_11111[44] = 236;
ref_cb[44] = vec![8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536];
data_sample_at_11111[45] = 105;
ref_cb[45] = vec![8195, 16499, 24803, 33107, 41411, 49715, 58019, 65536];
data_sample_at_11111[46] = 0;
ref_cb[46] = vec![8242, 16594, 24946, 33298, 41650, 50002, 58354, 65536];
data_sample_at_11111[47] = 24;
ref_cb[47] = vec![8289, 16514, 24739, 32964, 41189, 49414, 57639, 65536];
data_sample_at_11111[48] = 126;
ref_cb[48] = vec![8336, 16608, 24880, 33152, 41424, 49696, 57968, 65536];
data_sample_at_11111[49] = 0;
ref_cb[49] = vec![8206, 16525, 24844, 33163, 41482, 49801, 58120, 65536];
data_sample_at_11111[50] = 70;
ref_cb[50] = vec![8252, 16618, 24984, 33350, 41716, 50082, 58448, 65536];
data_sample_at_11111[51] = 236;
ref_cb[51] = vec![8298, 16532, 24766, 33000, 41234, 49468, 57702, 65536];
data_sample_at_11111[52] = 0;
ref_cb[52] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
data_sample_at_11111[53] = 12;
ref_cb[53] = vec![8209, 16337, 24680, 32808, 41151, 49279, 57622, 65536];
data_sample_at_11111[54] = 236;
ref_cb[54] = vec![8254, 16626, 24998, 33370, 41742, 50114, 58486, 65536];
data_sample_at_11111[55] = 0;
ref_cb[55] = vec![8299, 16534, 24769, 33004, 41239, 49474, 57709, 65536];
data_sample_at_11111[56] = 173;
ref_cb[56] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
data_sample_at_11111[57] = 196;
ref_cb[57] = vec![8204, 16529, 24854, 33179, 41504, 49829, 58154, 65536];
data_sample_at_11111[58] = 0;
ref_cb[58] = vec![8248, 16618, 24988, 33358, 41728, 50098, 58468, 65536];
data_sample_at_11111[59] = 159;
ref_cb[59] = vec![8292, 16520, 24748, 32976, 41204, 49432, 57660, 65536];
data_sample_at_11111[60] = 178;
ref_cb[60] = vec![8336, 16608, 24880, 33152, 41424, 49696, 57968, 65536];
data_sample_at_11111[61] = 0;
ref_cb[61] = vec![8191, 16507, 24823, 33139, 41455, 49771, 58087, 65536];
data_sample_at_11111[62] = 10;
ref_cb[62] = vec![8234, 16594, 24954, 33314, 41674, 50034, 58394, 65536];
data_sample_at_11111[63] = 101;
ref_cb[63] = vec![8277, 16490, 24703, 32916, 41129, 49342, 57555, 65536];
data_sample_at_11111[64] = 0;
ref_cb[64] = vec![8320, 16576, 24832, 33088, 41344, 49600, 57856, 65536];
data_sample_at_11111[65] = 15;
ref_cb[65] = vec![8363, 16662, 24961, 33260, 41559, 49858, 58157, 65536];
data_sample_at_11111[66] = 147;
ref_cb[66] = vec![8212, 16554, 24896, 33238, 41580, 49922, 58264, 65536];
data_sample_at_11111[67] = 0;
ref_cb[67] = vec![8254, 16639, 25024, 33409, 41794, 50179, 58564, 65536];
data_sample_at_11111[68] = 0;
ref_cb[68] = vec![8296, 16528, 24760, 32992, 41224, 49456, 57688, 65536];
data_sample_at_11111[69] = 227;
ref_cb[69] = vec![8338, 16612, 24886, 33160, 41434, 49708, 57982, 65536];
data_sample_at_11111[70] = 126;
ref_cb[70] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
data_sample_at_11111[71] = 0;
ref_cb[71] = vec![8223, 16581, 24939, 33297, 41655, 50013, 58371, 65536];
data_sample_at_11111[72] = 101;
ref_cb[72] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536];
data_sample_at_11111[73] = 186;
ref_cb[73] = vec![8305, 16546, 24787, 33028, 41269, 49510, 57751, 65536];
data_sample_at_11111[74] = 52;
ref_cb[74] = vec![8346, 16628, 24910, 33192, 41474, 49756, 58038, 65536];
data_sample_at_11111[75] = 0;
ref_cb[75] = vec![8387, 16515, 24830, 32958, 41273, 49401, 57716, 65536];
data_sample_at_11111[76] = 70;
ref_cb[76] = vec![8224, 16588, 24952, 33316, 41680, 50044, 58408, 65536];
data_sample_at_11111[77] = 228;
ref_cb[77] = vec![8264, 16464, 24664, 32864, 41064, 49264, 57464, 65536];
data_sample_at_11111[78] = 0;
ref_cb[78] = vec![8128, 16338, 24578, 32818, 41058, 49298, 57538, 65536];
data_sample_at_11111[79] = 0;
ref_cb[79] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
data_sample_at_11111[80] = 50;
ref_cb[80] = vec![8384, 16704, 25024, 33344, 41664, 49984, 58304, 65536];
data_sample_at_11111[81] = 214;
ref_cb[81] = vec![8215, 16575, 24935, 33295, 41655, 50015, 58375, 65536];
data_sample_at_11111[82] = 0;
ref_cb[82] = vec![8254, 16654, 25054, 33454, 41854, 50254, 58654, 65536];
data_sample_at_11111[83] = 0;
ref_cb[83] = vec![8293, 16522, 24751, 32980, 41209, 49438, 57667, 65536];
data_sample_at_11111[84] = 50;
ref_cb[84] = vec![8128, 16388, 24656, 32924, 41192, 49460, 57728, 65536];
data_sample_at_11111[85] = 69;
ref_cb[85] = vec![8371, 16678, 24985, 33292, 41599, 49906, 58213, 65536];
data_sample_at_11111[86] = 0;
ref_cb[86] = vec![8196, 16324, 24674, 32802, 41152, 49280, 57630, 65536];
data_sample_at_11111[87] = 0;
ref_cb[87] = vec![8234, 16619, 25004, 33389, 41774, 50159, 58544, 65536];
data_sample_at_11111[88] = 70;
ref_cb[88] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
data_sample_at_11111[89] = 136;
ref_cb[89] = vec![8128, 16339, 24585, 32831, 41077, 49323, 57569, 65536];
data_sample_at_11111[90] = 0;
ref_cb[90] = vec![8348, 16632, 24916, 33200, 41484, 49768, 58052, 65536];
data_sample_at_11111[91] = 0;
ref_cb[91] = vec![8386, 16708, 25030, 33352, 41674, 49996, 58318, 65536];
data_sample_at_11111[92] = 101;
ref_cb[92] = vec![8204, 16564, 24924, 33284, 41644, 50004, 58364, 65536];
data_sample_at_11111[93] = 36;
ref_cb[93] = vec![8241, 16639, 25037, 33435, 41833, 50231, 58629, 65536];
data_sample_at_11111[94] = 196;
ref_cb[94] = vec![8278, 16492, 24706, 32920, 41134, 49348, 57562, 65536];
data_sample_at_11111[95] = 0;
ref_cb[95] = vec![8315, 16566, 24817, 33068, 41319, 49570, 57821, 65536];
data_sample_at_11111[96] = 0;
ref_cb[96] = vec![8352, 16640, 24928, 33216, 41504, 49792, 58080, 65536];
data_sample_at_11111[97] = 24;
ref_cb[97] = vec![8389, 16714, 25039, 33364, 41689, 50014, 58339, 65536];
data_sample_at_11111[98] = 8;
ref_cb[98] = vec![8200, 16562, 24924, 33286, 41648, 50010, 58372, 65536];
data_sample_at_11111[99] = 0;
ref_cb[99] = vec![8236, 16635, 25034, 33433, 41832, 50231, 58630, 65536];
data_sample_at_11111[100] = 0;
ref_cb[100] = vec![8272, 16480, 24688, 32896, 41104, 49312, 57520, 65536];
data_sample_at_11111[101] = 125;
ref_cb[101] = vec![8308, 16552, 24796, 33040, 41284, 49528, 57772, 65536];
data_sample_at_11111[102] = 173;
ref_cb[102] = vec![8344, 16624, 24904, 33184, 41464, 49744, 58024, 65536];
data_sample_at_11111[103] = 126;
ref_cb[103] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
data_sample_at_11111[104] = 0;
ref_cb[104] = vec![8416, 16544, 24888, 33016, 41360, 49488, 57832, 65536];
data_sample_at_11111[105] = 0;
ref_cb[105] = vec![8219, 16607, 24995, 33383, 41771, 50159, 58547, 65536];
data_sample_at_11111[106] = 159;
ref_cb[106] = vec![8254, 16678, 25102, 33526, 41950, 50374, 58798, 65536];
data_sample_at_11111[107] = 210;
ref_cb[107] = vec![8289, 16514, 24739, 32964, 41189, 49414, 57639, 65536];
data_sample_at_11111[108] = 178;
ref_cb[108] = vec![8324, 16584, 24844, 33104, 41364, 49624, 57884, 65536];
data_sample_at_11111[109] = 0;
ref_cb[109] = vec![8359, 16654, 24949, 33244, 41539, 49834, 58129, 65536];
data_sample_at_11111[110] = 0;
ref_cb[110] = vec![8394, 16724, 25054, 33384, 41714, 50044, 58374, 65536];
data_sample_at_11111[111] = 170;
ref_cb[111] = vec![8429, 16794, 25159, 33524, 41889, 50254, 58619, 65536];
data_sample_at_11111[112] = 173;
ref_cb[112] = vec![8224, 16624, 25024, 33424, 41824, 50224, 58624, 65536];
data_sample_at_11111[113] = 235;
ref_cb[113] = vec![8258, 16452, 24646, 32840, 41034, 49228, 57422, 65536];
data_sample_at_11111[114] = 0;
ref_cb[114] = vec![8292, 16520, 24748, 32976, 41204, 49432, 57660, 65536];
data_sample_at_11111[115] = 0;
ref_cb[115] = vec![8326, 16588, 24850, 33112, 41374, 49636, 57898, 65536];
data_sample_at_11111[116] = 0;
ref_cb[116] = vec![8360, 16656, 24952, 33248, 41544, 49840, 58136, 65536];
data_sample_at_11111[117] = 24;
ref_cb[117] = vec![8394, 16724, 25054, 33384, 41714, 50044, 58374, 65536];
data_sample_at_11111[118] = 228;
ref_cb[118] = vec![8428, 16792, 25156, 33520, 41884, 50248, 58612, 65536];
data_sample_at_11111[119] = 0;
ref_cb[119] = vec![8215, 16613, 25011, 33409, 41807, 50205, 58603, 65536];
data_sample_at_11111[120] = 0;
ref_cb[120] = vec![8248, 16680, 25112, 33544, 41976, 50408, 58840, 65536];
data_sample_at_11111[121] = 0;
ref_cb[121] = vec![8281, 16498, 24715, 32932, 41149, 49366, 57583, 65536];
data_sample_at_11111[122] = 101;
ref_cb[122] = vec![8314, 16564, 24814, 33064, 41314, 49564, 57814, 65536];
data_sample_at_11111[123] = 174;
ref_cb[123] = vec![8347, 16630, 24913, 33196, 41479, 49762, 58045, 65536];
data_sample_at_11111[124] = 126;
ref_cb[124] = vec![8380, 16696, 25012, 33328, 41644, 49960, 58276, 65536];
data_sample_at_11111[125] = 0;
ref_cb[125] = vec![8413, 16762, 25111, 33460, 41809, 50158, 58507, 65536];
data_sample_at_11111[126] = 0;
ref_cb[126] = vec![8192, 16574, 24956, 33338, 41720, 50102, 58484, 65536];
data_sample_at_11111[127] = 0;
ref_cb[127] = vec![8224, 16639, 25054, 33469, 41884, 50299, 58714, 65536];
// Now run the loop with this reference data.
for i in 0..128 {
let data = get_triggering_base_data(65536, i);
// This check is here so that the tests written against this chunker
// can verify that the test data input is correct.
assert_eq!(data[11111], data_sample_at_11111[i]);
// Uncomment to create the line above.
// eprintln!("data_sample_at_11111[{i}]={};", data[11111]);
// Now, run the chunks through the default chunker.
let chunks = ChunkerTestWrapper::default().next_block(&data, true);
// Get the boundaries indices as determined by the size of the chunks above.
let chunk_boundaries: Vec<usize> = get_chunk_boundaries(&chunks);
// Uncomment this to generate the table above.
// eprintln!("ref_cb[{i}]=vec!{chunk_boundaries:?};");
assert_eq!(chunk_boundaries, ref_cb[i]);
}
// eprintln!("assert_eq!(chunk_boundaries, vec!{chunk_boundaries:?})");
// assert_eq!(chunk_boundaries, vec![131072, 262144, 393216, 524288, 655360, 786432, 917504, 1000000])
}
}