in dedupe_estimator.cpp [493:553]
// Entry point: estimates chunk-level dedupe savings across a set of files.
// All files share one chunk table (hs) so cross-file duplicate chunks are
// counted only once. Exit code: 0 on success, 1 on usage error / help.
int main(int argc, char* argv[]) {
    // Show usage when no arguments are given or help is explicitly requested.
    if (argc < 2 ||
        (argc == 2 && std::string(argv[1]) == "-h") ||
        (argc == 2 && std::string(argv[1]) == "--help")) {
        std::cout << "Usage: dedupe_estimator [-t] FILE1 [FILE2 ...]\n\n";
        // FIX: original text read "...can be saved.\nusing Hugging Face's
        // dedupped storage architecture." — stray period mid-sentence and a
        // typo ("dedupped"); the program's own output spells it "deduped".
        std::cout << "Estimates the amount of chunk level dedupe available in a\n"
                     "collection of files. All files will be chunked together\n"
                     "so if there are multiple versions of the file, all versions\n"
                     "should be provided together to see how much can be saved\n"
                     "using Hugging Face's deduped storage architecture.\n\n"
                     "The chunking algorithm used here is **not** the same\n"
                     "as the one being deployed, but it should provide a\n"
                     "reasonable estimate.\n\n";
        // FIX: original concatenated "mode" and "for" with no space,
        // printing "...compression modefor any neural network file".
        std::cout << "The -t flag will enable an experimental tensor compression mode "
                  << "for any neural network file\n\n";
        return 1;
    }

    bool tensor_compression = false;
    if (std::string(argv[1]) == "-t") {
        tensor_compression = true;
        std::cerr << "Experimental tensor compression mode on" << std::endl;
        // Shift past the flag so the loop below sees files starting at argv[1].
        argv += 1;
        argc -= 1;
        // FIX: "-t" with no files previously processed nothing and exited 0;
        // report a usage error instead.
        if (argc < 2) {
            std::cerr << "Error: no input files given after -t" << std::endl;
            return 1;
        }
    }

    chunk_ctr_type hs;     // chunk id -> {len, compressed_len}, shared across all files
    size_t total_len = 0;  // total raw bytes read from every input file

    for (int i = 1; i < argc; ++i) {
        std::vector<size_t> chunks_seen;
        std::ifstream file(argv[i], std::ios::binary);
        if (!file.is_open()) {
            // Best-effort: report and keep going with the remaining files.
            std::cerr << "Error opening file: " << argv[i] << std::endl;
            continue;
        }
        std::cerr << "Processing file " << argv[i] << std::endl;
        total_len += add_chunks(file, hs, i - 1, chunks_seen, tensor_compression);
        // For every file after the first, write a PPM visualizing which
        // chunks were already seen (dedupe hits) versus newly added.
        if (i != 1) {
            std::vector<RGB> colors = generate_color_sequence(chunks_seen);
            write_ppm(colors, std::string(argv[i]) + ".dedupe_image.ppm");
        }
        // NOTE: explicit file.close() removed — the ifstream destructor
        // closes the stream (RAII) when `file` goes out of scope.
    }

    // Sum the unique (deduped) chunk sizes accumulated over all files.
    size_t chunk_bytes = 0;
    size_t compressed_chunk_bytes = 0;
    for (chunk_ctr_type::const_iterator iter = hs.begin(); iter != hs.end(); ++iter) {
        chunk_bytes += iter->second.len;
        compressed_chunk_bytes += iter->second.compressed_len;
    }

    std::cout << "Total bytes in all files: " << total_len << std::endl;
    std::cout << "Total deduped bytes: " << chunk_bytes << std::endl;
    std::cout << "Total deduped compressed bytes: " << compressed_chunk_bytes << std::endl;
    return 0;
}