size_t add_chunks()

in dedupe_estimator.cpp [317:383]


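// Streams `file` in 1 MiB reads and splits it into content-defined chunks
// using a gear rolling hash. Each chunk is keyed by std::hash of its bytes;
// the first time a chunk appears, its length and LZ4-compressed size are
// recorded along with the file that produced it, and `chunks_seen` logs the
// first-seen file for every chunk so callers can estimate cross-file
// deduplication. Returns the total number of bytes consumed.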
size_t add_chunks(std::ifstream& file, chunk_ctr_type& chunk_ctr, size_t file_num, std::vector<size_t> &chunks_seen, bool tensor_compression) {
  char buffer[1024 * 1024];         // 1 MiB read buffer
  uint64_t h = 0;                   // gear rolling hash state
  size_t buflen = 0;                // bytes accumulated in the current chunk
  std::streamsize prev_offset = 0;  // first unconsumed byte within buffer
  std::string buf;                  // bytes of the current chunk (spans reads)
  size_t total_len = 0;             // total bytes chunked so far
  size_t ctr = 0;                   // chunks emitted, for progress reporting

  // Loop on the stream state rather than eof() so a read error cannot spin
  // forever; gcount() reports how many bytes the last read() delivered.
  while (file) {
    file.read(buffer, sizeof(buffer));
    std::streamsize bytesread = file.gcount();

    for (std::streamsize i = 0; i < bytesread; ++i) {
      // Gear rolling hash: shift the state and mix in the table entry for the
      // next byte. Index with an unsigned byte; a plain (often signed) char
      // could index GEAR_HASH_TABLE with a negative value.
      h = (h << 1) ^ GEAR_HASH_TABLE[static_cast<unsigned char>(buffer[i])];
      buflen++;
      // Cut a chunk when the masked hash bits are all zero (roughly once per
      // 2^popcount(MASK) bytes) once the chunk has reached MIN_LEN, or force
      // a cut at MAX_LEN.
      if ((buflen >= MIN_LEN && (h & MASK) == 0) || buflen >= MAX_LEN) {
        buf.append(buffer + prev_offset, buffer + i + 1);
        prev_offset = i + 1;
        size_t chunk_hash = std::hash<std::string>()(buf);

        // Insert into the chunk counter; remember which file first contributed this chunk.
        chunk_ctr_type::const_iterator iter = chunk_ctr.find(chunk_hash);
        if (iter == chunk_ctr.end()) {
          chunk_info info;
          info.len = buflen;
          if (tensor_compression) {
            info.compressed_len = get_lz4_compressed_size_split(buf);
          } else {
            info.compressed_len = get_lz4_compressed_size(buf);
          }
          info.first_seen = file_num;
          chunk_ctr[chunk_hash] = info;
          chunks_seen.push_back(file_num);
        } else {
          chunks_seen.push_back(iter->second.first_seen);
        }
        total_len += buflen;
        // reset buffer
        buflen = 0;
        buf.clear();
        ctr += 1;
        if (ctr % 10000 == 0) {
          std::cout << "\tProcessed bytes: " << total_len << std::endl;
        }
      }
    }

    // Carry any tail bytes (an incomplete chunk) across to the next read.
    buf.append(buffer + prev_offset, buffer + bytesread);
    prev_offset = 0;
  }

  // Insert the final, partial chunk (if any) into the chunk counter,
  // recording its compressed size as well to mirror the loop above.
  if (!buf.empty()) {
    size_t chunk_hash = std::hash<std::string>()(buf);
    chunk_ctr_type::const_iterator iter = chunk_ctr.find(chunk_hash);
    if (iter == chunk_ctr.end()) {
      chunk_info info;
      info.len = buflen;
      if (tensor_compression) {
        info.compressed_len = get_lz4_compressed_size_split(buf);
      } else {
        info.compressed_len = get_lz4_compressed_size(buf);
      }
      info.first_seen = file_num;
      chunk_ctr[chunk_hash] = info;
      chunks_seen.push_back(file_num);
    } else {
      chunks_seen.push_back(iter->second.first_seen);
    }
  }
  total_len += buflen;
  return total_len;
}
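
A minimal driver sketch showing how add_chunks might be wired up. The chunk_info struct and chunk_ctr_type alias below are illustrative stand-ins inferred from the function body, not the actual definitions; those live elsewhere in dedupe_estimator.cpp, as do GEAR_HASH_TABLE, MASK, MIN_LEN, MAX_LEN, and the get_lz4_compressed_size helpers.

#include <cstddef>
#include <fstream>
#include <iostream>
#include <unordered_map>
#include <vector>

// Illustrative stand-ins; not the real definitions from dedupe_estimator.cpp.
struct chunk_info {
  size_t len = 0;
  size_t compressed_len = 0;
  size_t first_seen = 0;
};
using chunk_ctr_type = std::unordered_map<size_t, chunk_info>;

size_t add_chunks(std::ifstream& file, chunk_ctr_type& chunk_ctr, size_t file_num,
                  std::vector<size_t>& chunks_seen, bool tensor_compression);

int main(int argc, char** argv) {
  chunk_ctr_type chunk_ctr;
  std::vector<size_t> chunks_seen;
  size_t total_bytes = 0;

  // Feed each file on the command line through the chunker.
  for (int i = 1; i < argc; ++i) {
    std::ifstream file(argv[i], std::ios::binary);
    if (!file) continue;
    total_bytes += add_chunks(file, chunk_ctr, static_cast<size_t>(i - 1),
                              chunks_seen, /*tensor_compression=*/false);
  }

  // Distinct chunks only: summing their lengths gives the deduplicated size.
  size_t unique_bytes = 0;
  for (const auto& kv : chunk_ctr) unique_bytes += kv.second.len;
  if (unique_bytes > 0) {
    std::cout << "dedupe ratio: "
              << static_cast<double>(total_bytes) / unique_bytes << std::endl;
  }
  return 0;
}

Opening the stream with std::ios::binary matters here: in text mode on some platforms, newline translation would change the bytes the gear hash sees and shift every chunk boundary.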