in dataproc/compact_parquet_files.py
from pyspark.sql import SparkSession


def CompactParquetFiles(source, destination, numberOfPartitions):
    """Read Parquet files from `source`, compact them into
    `numberOfPartitions` output files, and write them to `destination`."""
    print("CompactParquetFiles: source: ", source)
    print("CompactParquetFiles: destination: ", destination)
    print("CompactParquetFiles: numberOfPartitions: ", str(numberOfPartitions))

    spark = SparkSession \
        .builder \
        .appName("CompactParquetFiles") \
        .getOrCreate()

    # Read the (potentially many small) Parquet files from the source path
    df = spark.read.parquet(source)

    # Repartition to the requested number of output files and write as Parquet.
    # The original chained .repartition() and .coalesce() with the same value;
    # coalesce is a no-op immediately after repartitioning to the same count,
    # so a single repartition (full shuffle, evenly sized files) suffices.
    df \
        .repartition(numberOfPartitions) \
        .write \
        .mode("overwrite") \
        .parquet(destination)

    spark.stop()
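
# Usage sketch (an assumption, not part of the original file): one way this
# helper might be invoked when submitted as a Dataproc PySpark job. The
# positional argument order and the example paths are illustrative only.
if __name__ == "__main__":
    import sys
    # e.g. spark-submit compact_parquet_files.py gs://bucket/raw gs://bucket/compacted 8
    CompactParquetFiles(sys.argv[1], sys.argv[2], int(sys.argv[3]))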