in utils/hf_dataset_subsampling.py [0:0]
def get_total_byte_size(dataset):
    """Return the sum of the ``bytes_len`` column of *dataset*.

    Args:
        dataset: Object whose ``data`` attribute can be indexed with
            ``"bytes_len"`` to obtain an Arrow array / chunked array
            (presumably a Hugging Face ``datasets.Dataset`` -- confirm
            against callers).

    Returns:
        The column total converted to a plain Python number. ``0`` is
        returned when the aggregation result is null (Arrow's ``sum``
        yields null for an empty or all-null column), so callers never
        receive ``None``.
    """
    # Import the submodule explicitly: ``pyarrow.compute`` is only reachable
    # as ``pa.compute`` if something else has already imported it.
    import pyarrow.compute as pc

    total = pc.sum(dataset.data["bytes_len"]).as_py()
    # A null Arrow scalar converts to None; report "no bytes" as 0 instead.
    return 0 if total is None else total