in tpcds/tpcdsgen.py [0:0]
def convert_tpcds(scale_factor: int, partitions: int):
    """Convert the TPC-DS ``.dat`` inputs under ``data/`` to Parquet.

    For every name in ``table_names``:

    * ``partitions == 1``: ``data/<table>.dat`` is converted to
      ``data/<table>.parquet``.
    * otherwise: the directory ``data/<table>.parquet/`` is created and each
      existing ``data/<table>.dat/part-<N>.dat`` (N from 1 to ``partitions``
      inclusive) is converted to ``data/<table>.parquet/part<N>.parquet``.
      Partition files that do not exist are skipped without error.

    The elapsed time is printed once all tables have been processed.

    Args:
        scale_factor: Not read by this function; kept in the signature so
            existing callers keep working.
        partitions: Number of partitions to look for per table.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments during a long conversion.
    started = time.perf_counter()
    ctx = SessionContext()

    if partitions == 1:
        # One input path per table, one Parquet output per table.
        for table in table_names:
            convert_dat_to_parquet(
                ctx,
                table,
                f"data/{table}.dat",
                "dat",
                f"data/{table}.parquet",
            )
    else:
        for table in table_names:
            output_dir = f"data/{table}.parquet"
            # Equivalent of `mkdir -p`, done in-process instead of through a
            # shell command.
            os.makedirs(output_dir, exist_ok=True)

            for part in range(1, partitions + 1):
                source_file = f"data/{table}.dat/part-{part}.dat"
                # NOTE(review): missing partition files are skipped silently;
                # presumably not every table yields every partition — confirm.
                if not os.path.exists(source_file):
                    continue
                convert_dat_to_parquet(
                    ctx,
                    table,
                    source_file,
                    "dat",
                    f"{output_dir}/part{part}.parquet",
                )

    elapsed = time.perf_counter() - started
    print(f"Converted CSV to Parquet in {round(elapsed, 2)} seconds")