def CompactParquetFiles()

in dataproc/compact_parquet_files.py [0:0]


from pyspark.sql import SparkSession


def CompactParquetFiles(source, destination, numberOfPartitions):
    # Log the job parameters for traceability in the driver output
    print("CompactParquetFiles: source:             ", source)
    print("CompactParquetFiles: destination:        ", destination)
    print("CompactParquetFiles: numberOfPartitions: ", str(numberOfPartitions))

    spark = SparkSession \
        .builder \
        .appName("CompactParquetFiles") \
        .getOrCreate()

    # Read every Parquet file under the source path into a single DataFrame
    df = spark.read.parquet(source)

    # Repartition to the requested number of output files and overwrite the
    # destination as Parquet; repartition() alone already yields exactly
    # numberOfPartitions partitions, so no additional coalesce() is needed
    df \
        .repartition(numberOfPartitions) \
        .write \
        .mode("overwrite") \
        .parquet(destination)

    spark.stop()
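

A minimal driver sketch showing how this function could be invoked as a PySpark job. The argument handling, example gs:// paths, and the import line below are assumptions for illustration, not part of the original module.

import sys

from compact_parquet_files import CompactParquetFiles  # hypothetical import; not needed if this block sits in the same file

# Hypothetical entry point: source path, destination path, and partition count
# are passed as job arguments when the script is submitted.
if __name__ == "__main__":
    source = sys.argv[1]                    # e.g. "gs://my-bucket/raw/events/"
    destination = sys.argv[2]               # e.g. "gs://my-bucket/compacted/events/"
    numberOfPartitions = int(sys.argv[3])   # e.g. 16
    CompactParquetFiles(source, destination, numberOfPartitions)

On Dataproc such a script is typically submitted with gcloud dataproc jobs submit pyspark, passing the three job arguments after the -- separator; the exact cluster and bucket names would depend on the deployment.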