in benchmarks/gray_sort_benchmark.py [0:0]
def main():
    """Command-line entry point: parse options, derive defaults, run the plan.

    Registers the benchmark's options on a ``Driver``, parses them, fills in
    any sizing option the user left unset from the per-node CPU/memory figures
    and the executor count, then builds the plan with ``gray_sort_benchmark``
    and hands it to ``driver.run``.

    Raises:
        ValueError: if ``--input_paths`` is given without
            ``--num_sort_partitions``.
    """
    # NOTE(review): presumably verifies the external sort-benchmark tooling is
    # available before any work is planned -- confirm against SortBenchTool.
    SortBenchTool.ensure_installed()

    driver = Driver()
    driver.add_argument("-R", "--record_nbytes", type=int, default=100)
    driver.add_argument("-K", "--key_nbytes", type=int, default=10)
    driver.add_argument("-T", "--total_data_nbytes", type=int, default=None)
    driver.add_argument("-B", "--gensort_batch_nbytes", type=int, default=512 * MB)
    driver.add_argument("-n", "--num_data_partitions", type=int, default=None)
    driver.add_argument("-t", "--num_sort_partitions", type=int, default=None)
    driver.add_argument("-i", "--input_paths", nargs="+", default=[])
    driver.add_argument("-e", "--shuffle_engine", default="duckdb", choices=("duckdb", "arrow"))
    driver.add_argument("-s", "--sort_engine", default="duckdb", choices=("duckdb", "arrow", "polars"))
    driver.add_argument("-H", "--hive_partitioning", action="store_true")
    driver.add_argument("-V", "--validate_results", action="store_true")
    driver.add_argument("-C", "--shuffle_cpu_limit", type=int, default=ShuffleNode.default_cpu_limit)
    driver.add_argument(
        "-M",
        "--shuffle_memory_limit",
        type=int,
        default=ShuffleNode.default_memory_limit,
    )
    driver.add_argument("-TC", "--sort_cpu_limit", type=int, default=8)
    driver.add_argument("-TM", "--sort_memory_limit", type=int, default=None)
    # psutil.cpu_count() returns None when the count cannot be determined;
    # fall back to the logical count, then to 1, so the arithmetic below
    # never sees None.
    default_cpus_per_node = psutil.cpu_count(logical=False) or psutil.cpu_count() or 1
    driver.add_argument("-NC", "--cpus_per_node", type=int, default=default_cpus_per_node)
    driver.add_argument("-NM", "--memory_per_node", type=int, default=psutil.virtual_memory().total)
    driver.add_argument("-CP", "--parquet_compression", default=None)
    driver.add_argument("-LV", "--parquet_compression_level", type=int, default=None)

    user_args, driver_args = driver.parse_arguments()

    # Explicit raise rather than `assert`: assertions are removed under
    # `python -O`, which would silently skip this check.
    if user_args.input_paths and user_args.num_sort_partitions is None:
        raise ValueError("--num_sort_partitions is required when --input_paths is given")

    # An executor count below 1 is treated as a single executor.
    num_executors = max(1, driver_args.num_executors)
    total_num_cpus = num_executors * user_args.cpus_per_node
    memory_per_cpu = user_args.memory_per_node // user_args.cpus_per_node

    # The "arrow" sort engine is always given exactly one CPU, overriding
    # whatever --sort_cpu_limit says.
    if user_args.sort_engine == "arrow":
        user_args.sort_cpu_limit = 1

    # Used only for sizing below; user_args.sort_memory_limit itself is passed
    # through unchanged (possibly None).
    sort_memory_limit = user_args.sort_memory_limit or user_args.sort_cpu_limit * memory_per_cpu

    # Unset (or zero) sizing options fall back to values derived from the
    # available resources.
    user_args.total_data_nbytes = user_args.total_data_nbytes or num_executors * user_args.memory_per_node
    # Clamp to 1 so a single-CPU setup does not end up with zero partitions.
    user_args.num_data_partitions = user_args.num_data_partitions or max(1, total_num_cpus // 2)
    # Enough partitions to occupy every CPU, and enough that each partition
    # is at most a quarter of the per-task sort memory; never fewer than one.
    user_args.num_sort_partitions = user_args.num_sort_partitions or max(
        1,
        total_num_cpus // user_args.sort_cpu_limit,
        user_args.total_data_nbytes // (sort_memory_limit // 4),
    )

    # Every parsed user option is forwarded as a keyword argument.
    plan = gray_sort_benchmark(**vars(user_args))
    driver.run(plan)