# in tpch/tpchgen.py [0:0]
def generate_tpch(scale_factor: int, partitions: int):
    """Generate TPC-H data by running the tpch-docker image, then print the elapsed time.

    Each invocation is handed to ``run_and_log_output`` together with a log
    file path under ``/tmp``. The container mounts ```pwd`/data`` as ``/data``.

    Args:
        scale_factor: Value passed to the generator's ``-s`` flag.
        partitions: When exactly ``1``, a single command is run directly in
            the calling thread. Otherwise one command per part
            (``1..partitions``) is built with ``-C {partitions} -S {part}``
            (presumably chunk count / chunk index -- confirm against the
            image's generator options) and the commands are run on a
            thread pool.

    Note:
        In the multi-partition path, an exception raised by a worker is
        printed and not re-raised, so the remaining parts still run. In the
        single-partition path nothing is caught here, so whatever
        ``run_and_log_output`` raises propagates to the caller.
    """
    began = time.time()

    if partitions != 1:
        # Pool size follows the CPU count reported by the OS.
        worker_limit = os.cpu_count()

        # One (command, log file) pair per part; parts are numbered from 1.
        jobs = []
        for part in range(1, partitions + 1):
            jobs.append(
                (
                    f"docker run -v `pwd`/data:/data -t --rm ghcr.io/scalytics/tpch-docker:main -vf -s {scale_factor} -C {partitions} -S {part}",
                    f"/tmp/tpchgen-part{part}.log",
                )
            )

        with concurrent.futures.ThreadPoolExecutor(max_workers=worker_limit) as pool:
            pending = [
                pool.submit(run_and_log_output, cmd, log_path)
                for cmd, log_path in jobs
            ]
            # Drain results as they finish so each failure is reported
            # without stopping the other parts.
            for finished in concurrent.futures.as_completed(pending):
                try:
                    finished.result()
                except Exception as exc:
                    print(f"Command failed with exception: {exc}")
    else:
        single_command = f"docker run -v `pwd`/data:/data -t --rm ghcr.io/scalytics/tpch-docker:main -vf -s {scale_factor}"
        run_and_log_output(single_command, "/tmp/tpchgen.log")

    finished_at = time.time()
    print(f"Generated CSV data in {round(finished_at - began, 2)} seconds")