def convert_dat_to_parquet()

in tpcds/tpcdsgen.py [0:0]


def convert_dat_to_parquet(
    ctx: SessionContext,
    table: str,
    dat_filename: str,
    file_extension: str,
    parquet_filename: str,
):
    """Convert a ``|``-delimited data file into a snappy-compressed Parquet file.

    The column layout is looked up in ``all_schemas`` under ``table``. Each
    row of the input ends with a trailing ``|``, so the file is read with one
    extra null-typed column that is dropped again before writing.

    Args:
        ctx: Session context whose ``read_csv`` is used to load the input.
        table: Key into ``all_schemas`` selecting the field list to apply.
        dat_filename: Path of the delimited input file.
        file_extension: Passed through to ``read_csv`` as ``file_extension``.
        parquet_filename: Destination handed to ``write_parquet``.
    """
    print(f"Converting {dat_filename} to {parquet_filename} ...")

    # Work on a copy so the shared entry in all_schemas is never mutated.
    read_fields = all_schemas[table].copy()

    # Remember the real column names *before* the padding column is added;
    # only these are kept in the Parquet output.
    wanted_columns = [field.name for field in read_fields]

    # The trailing "|" on every row parses as one additional (empty) field,
    # so the read schema needs a throwaway nullable column to absorb it.
    trailing_pad = pyarrow.field("some_null", pyarrow.null(), nullable=True)
    read_fields.append(trailing_pad)

    # Read with the padded schema, drop the padding column, and write out.
    ctx.read_csv(
        dat_filename,
        schema=pyarrow.schema(read_fields),
        has_header=False,
        file_extension=file_extension,
        delimiter="|",
    ).select_columns(*wanted_columns).write_parquet(
        parquet_filename, compression="snappy"
    )