in tpcds/tpcdsgen.py [0:0]
def convert_dat_to_parquet(ctx: SessionContext, table: str, dat_filename: str, file_extension: str, parquet_filename: str):
print(f"Converting {dat_filename} to {parquet_filename} ...")
table_schema = all_schemas[table].copy()
# Pre-collect the output columns so we can ignore the null field we add
# in to handle the trailing | in the file
output_cols = [r.name for r in table_schema]
# Trailing | requires extra field for in processing
table_schema.append(pyarrow.field("some_null", pyarrow.null(), nullable=True))
schema = pyarrow.schema(table_schema)
df = ctx.read_csv(dat_filename, schema=schema, has_header=False, file_extension=file_extension, delimiter="|")
df = df.select_columns(*output_cols)
df.write_parquet(parquet_filename, compression="snappy")
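
# A minimal usage sketch, not part of the original file: it assumes the
# datafusion/pyarrow imports and the all_schemas mapping defined elsewhere in
# tpcds/tpcdsgen.py; the table name and file paths below are illustrative only.
from datafusion import SessionContext

ctx = SessionContext()
convert_dat_to_parquet(
    ctx,
    table="store_sales",
    dat_filename="data/store_sales.dat",
    file_extension=".dat",
    parquet_filename="data/store_sales.parquet",
)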