in tpch/tpchgen.py [0:0]
def convert_tpch(scale_factor: int, partitions: int):
    """Convert the TPC-H ``.tbl`` files under ``data/`` to Parquet.

    With ``partitions == 1`` each table in ``table_names`` is converted from
    ``data/<table>.tbl`` to a single ``data/<table>.parquet`` file.

    Otherwise ``data/<table>.parquet`` is created as a directory for each
    table, and one ``part<N>.parquet`` file is written into it per input
    chunk ``data/<table>.tbl.<N>`` (N running from 1 to ``partitions``).
    The ``nation`` and ``region`` tables are read from a single
    ``data/<table>.tbl`` file and written as ``part1.parquet`` only.

    The elapsed wall-clock time is printed when the conversion finishes.

    Args:
        scale_factor: Accepted for interface compatibility; not used by
            this function.
        partitions: Number of input chunks per table.
    """
    began = time.time()
    session = SessionContext()
    single_file = partitions == 1

    for name in table_names:
        source = f"data/{name}.tbl"

        if single_file:
            # convert to parquet
            target = f"data/{name}.parquet"
            convert_tbl_to_parquet(session, name, source, "tbl", target)
            continue

        # Partitioned layout: the .parquet path is a directory of part files.
        run(f"mkdir -p data/{name}.parquet")

        if name in ("nation", "region"):
            # nation and region are special cases and do not generate multiple files
            target = f"data/{name}.parquet/part1.parquet"
            convert_tbl_to_parquet(session, name, source, "tbl", target)
            continue

        for chunk in range(1, partitions + 1):
            chunk_source = f"data/{name}.tbl.{chunk}"
            chunk_ext = f"tbl.{chunk}"
            chunk_target = f"data/{name}.parquet/part{chunk}.parquet"
            convert_tbl_to_parquet(session, name, chunk_source, chunk_ext, chunk_target)

    finished = time.time()
    elapsed = round(finished - began, 2)
    print(f"Converted CSV to Parquet in {elapsed} seconds")