def convert_tbl_to_parquet()

in tpch/tpchgen.py [0:0]


def convert_tbl_to_parquet(
    ctx: SessionContext,
    table: str,
    tbl_filename: str,
    file_extension: str,
    parquet_filename: str,
):
    """Convert a pipe-delimited ``.tbl`` file into a snappy-compressed Parquet file.

    The column layout is looked up in ``all_schemas[table]``; each entry is
    indexed as ``(name, arrow_type)``. Column names are lower-cased and every
    real column is declared non-nullable.

    Args:
        ctx: Session context used to read the delimited file.
        table: Key into ``all_schemas`` selecting the column definitions.
        tbl_filename: Path of the header-less, ``|``-delimited input file.
        file_extension: Extension passed through to ``ctx.read_csv``.
        parquet_filename: Destination path handed to ``write_parquet``.

    Raises:
        KeyError: If ``table`` has no entry in ``all_schemas``
            (assuming ``all_schemas`` is a plain mapping).
    """
    print(f"Converting {tbl_filename} to {parquet_filename} ...")

    # Schema handling follows the DataFusion Python tpch example.
    real_fields = []
    for column_def in all_schemas[table]:
        column_name = column_def[0].lower()
        real_fields.append(pyarrow.field(column_name, column_def[1], nullable=False))

    # Capture the genuine column names up front, so the placeholder column
    # introduced below can be dropped again after the read.
    wanted_columns = [field.name for field in real_fields]

    # Every row ends with a trailing "|", which the reader sees as one extra
    # (always empty) column; give it a nullable null-typed slot in the schema.
    trailing_placeholder = pyarrow.field("some_null", pyarrow.null(), nullable=True)
    read_schema = pyarrow.schema(real_fields + [trailing_placeholder])

    raw_frame = ctx.read_csv(
        tbl_filename,
        schema=read_schema,
        has_header=False,
        file_extension=file_extension,
        delimiter="|",
    )
    raw_frame.select_columns(*wanted_columns).write_parquet(parquet_filename, compression="snappy")