de/estimate.py (35 lines of code) (raw):

import os
import subprocess
import tempfile

from .core import estimate


def estimate_de(paths):
    """Run the ``estimate`` routine from ``.core`` over *paths*.

    Args:
        paths: Iterable of path-like objects; each one is converted with
            ``str()`` before being handed to ``estimate``.

    Returns:
        A dict with the keys ``"total_len"``, ``"chunk_bytes"`` and
        ``"compressed_chunk_bytes"``, filled in order from the three values
        returned by ``estimate``.
    """
    string_paths = [str(path) for path in paths]
    total_bytes, chunk_bytes, compressed_chunk_bytes = estimate(string_paths)
    return {
        "total_len": total_bytes,
        "chunk_bytes": chunk_bytes,
        "compressed_chunk_bytes": compressed_chunk_bytes,
    }


def _parse_transmitted_bytes(stderr):
    """Extract the byte count from the ``Transmitted <n> bytes ...`` line."""
    # Scan from the end: the summary is normally the last line, but trailing
    # blank lines or extra log output must not break (or silently corrupt)
    # the parse.
    for line in reversed(stderr.splitlines()):
        fields = line.split()
        if len(fields) >= 2 and fields[0] == "Transmitted":
            try:
                return int(fields[1])
            except ValueError:
                continue
    raise ValueError(
        f"no 'Transmitted <n> bytes' line found in xtool stderr: {stderr!r}"
    )


def estimate_xtool(paths):
    """Run ``xtool dedup`` on *paths* and report the transmitted byte count.

    The access token is read from the ``XTOOL_TOKEN`` environment variable,
    and the subprocess runs with ``DEFAULT_MIN_N_CHUNKS_PER_RANGE`` set to
    ``"1"`` in its environment.

    Args:
        paths: Iterable of path-like objects; each one is converted with
            ``str()`` and appended to the command line.

    Returns:
        A dict with the single key ``"transmitted_xtool_bytes"``.

    Raises:
        KeyError: If ``XTOOL_TOKEN`` is not set in the environment.
        subprocess.CalledProcessError: If ``xtool`` exits with a non-zero
            status.
        ValueError: If the transmitted byte count cannot be found in the
            stderr output of ``xtool``.
    """
    # The temporary file only serves as the ``-o`` target for xtool; its
    # contents are not read here and it is removed when the block exits.
    with tempfile.NamedTemporaryFile(suffix=".json") as tmp:
        env = os.environ.copy()
        env["DEFAULT_MIN_N_CHUNKS_PER_RANGE"] = "1"
        cmd = [
            "xtool",
            "--repo-type",
            "dataset",
            "--repo-id",
            "kszucs/pq",
            "--token",
            os.environ["XTOOL_TOKEN"],
            "dedup",
            "-s",
            "-o",
            tmp.name,
            *map(str, paths),
        ]
        result = subprocess.run(
            cmd, check=True, capture_output=True, text=True, env=env
        )

    # stderr looks like:
    # 'Dedupping 26 files...\nUsing lz4 compression\n\n\nClean results:\nTransmitted 3180990288 bytes in total.\n'
    transmitted = _parse_transmitted_bytes(result.stderr)
    return {"transmitted_xtool_bytes": transmitted}