in utils/hf_dataset_subsampling.py [0:0]
def get_size_per_example(texts: List[str]) -> Dict:
    """Return the encoded byte length of each text in a batch.

    Args:
        texts: Strings to measure.

    Returns:
        A dict with the single key ``"bytes_len"`` whose value is a list
        containing, for every input text in order, the number of bytes
        produced by ``text.encode()`` (Python's default encoding for
        ``str.encode`` is UTF-8).
    """
    # Byte length, not character count: multi-byte characters count as
    # more than one.
    return {"bytes_len": [len(entry.encode()) for entry in texts]}