in dedupe_estimator.cpp [317:383]
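// Content-defined chunking over `file` using a rolling gear hash: splits the
// stream into chunks of roughly MIN_LEN..MAX_LEN bytes, counts each distinct
// chunk in chunk_ctr, records per chunk the file it was first seen in into
// chunks_seen, and returns the total number of bytes chunked.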
size_t add_chunks(std::ifstream& file, chunk_ctr_type& chunk_ctr, size_t file_num, std::vector<size_t>& chunks_seen, bool tensor_compression) {
    char buffer[1024 * 1024];
    uint64_t h = 0;                   // rolling gear hash state
    size_t buflen = 0;                // bytes in the current chunk so far
    std::streamsize prev_offset = 0;  // start of the current chunk within buffer
    std::string buf;                  // chunk bytes carried across read boundaries
    size_t total_len = 0;             // total bytes chunked
    size_t ctr = 0;                   // number of chunks emitted
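    // Stream the file in 1 MiB blocks; chunk boundaries are content-defined,
    // so a single chunk may span several reads.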
    while (file) {
        file.read(buffer, sizeof(buffer));
        std::streamsize bytesread = file.gcount();
        if (bytesread == 0) break;
        for (std::streamsize i = 0; i < bytesread; ++i) {
            // Index with an unsigned byte: plain char may be signed, and a
            // negative index into GEAR_HASH_TABLE would be undefined behavior.
            h = (h << 1) ^ GEAR_HASH_TABLE[static_cast<uint8_t>(buffer[i])];
            buflen++;
            // Cut at a content-defined boundary (hash matches MASK once the
            // chunk reaches MIN_LEN), or force a cut at MAX_LEN.
            if ((buflen >= MIN_LEN && (h & MASK) == 0) || buflen >= MAX_LEN) {
                buf.append(buffer + prev_offset, buffer + i + 1);
                prev_offset = i + 1;
                size_t chunk_hash = std::hash<std::string>()(buf);
                // insert into chunk counter
                chunk_ctr_type::const_iterator iter = chunk_ctr.find(chunk_hash);
                if (iter == chunk_ctr.end()) {
                    // new chunk: record its sizes and which file it first appeared in
                    chunk_info info;
                    info.len = buflen;
                    if (tensor_compression) {
                        info.compressed_len = get_lz4_compressed_size_split(buf);
                    } else {
                        info.compressed_len = get_lz4_compressed_size(buf);
                    }
                    info.first_seen = file_num;
                    chunk_ctr[chunk_hash] = info;
                    chunks_seen.push_back(file_num);
                } else {
                    // duplicate chunk: record where it was first seen
                    chunks_seen.push_back(iter->second.first_seen);
                }
                total_len += buflen;
                // reset per-chunk state
                buflen = 0;
                buf.clear();
                ctr += 1;
                if (ctr % 10000 == 0) {
                    std::cout << "\tProcessed bytes: " << total_len << std::endl;
                }
            }
        }
        // carry any unfinished chunk bytes over to the next read
        buf.append(buffer + prev_offset, buffer + bytesread);
        prev_offset = 0;
    }
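    // Bytes left over at EOF form one last chunk, which may be shorter than MIN_LEN.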
    // insert final hash into chunk counter
    if (buflen > 0) {
        size_t chunk_hash = std::hash<std::string>()(buf);
        chunk_ctr_type::const_iterator iter = chunk_ctr.find(chunk_hash);
        if (iter == chunk_ctr.end()) {
            chunk_info info;
            info.len = buflen;
            // compute the compressed length the same way as the main loop does
            if (tensor_compression) {
                info.compressed_len = get_lz4_compressed_size_split(buf);
            } else {
                info.compressed_len = get_lz4_compressed_size(buf);
            }
            info.first_seen = file_num;
            chunk_ctr[chunk_hash] = info;
            chunks_seen.push_back(file_num);
        } else {
            chunks_seen.push_back(iter->second.first_seen);
        }
        total_len += buflen;
    }
    return total_len;
}
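
// A minimal usage sketch. This caller is hypothetical, not part of
// dedupe_estimator.cpp; chunk_ctr_type is assumed to be a map keyed by
// size_t chunk hashes (e.g. std::unordered_map<size_t, chunk_info>), as
// implied by the find()/operator[] usage above, and "model.bin" is a
// placeholder path:
//
//   chunk_ctr_type chunk_ctr;
//   std::vector<size_t> chunks_seen;
//   std::ifstream f("model.bin", std::ios::binary);
//   size_t nbytes = add_chunks(f, chunk_ctr, /*file_num=*/0, chunks_seen,
//                              /*tensor_compression=*/false);
//   // chunk_ctr.size() unique chunks out of chunks_seen.size() total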