in scala-spark-sdk/src/main/scala/software/amazon/sagemaker/featurestore/sparksdk/helpers/DataFrameRepartitioner.scala [38:52]
def repartition(inputDataFrame: DataFrame): DataFrame = {
  val sparkContext    = inputDataFrame.sparkSession.sparkContext
  val parallelism     = getParallelism(inputDataFrame)
  val maxPartitions   = sparkContext.getConf.get("spark.sql.shuffle.partitions", DEFAULT_SHUFFLE_PARTITIONS).toInt
  val partitionsCount = inputDataFrame.rdd.getNumPartitions
  // Repartitioning triggers a full shuffle and is costly, so only do it when:
  // 1. There are too few partitions, i.e. fewer than the available parallelism.
  // 2. There are too many partitions, i.e. more than the configured shuffle partition count.
  if (partitionsCount < parallelism || partitionsCount > maxPartitions) {
    inputDataFrame.repartition(parallelism)
  } else {
    inputDataFrame
  }
}
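
For illustration, here is a minimal, runnable sketch of the same decision logic. `getParallelism` and `DEFAULT_SHUFFLE_PARTITIONS` are not shown in this excerpt, so the stub definitions below (the cluster's default parallelism, and Spark's stock default of 200 shuffle partitions) are assumptions for the sketch, not the SDK's actual implementations. `RepartitionSketch` is likewise a hypothetical wrapper object, not part of the SDK.

import org.apache.spark.sql.{DataFrame, SparkSession}

object RepartitionSketch {

  // Assumed stand-in for the SDK constant; Spark's stock default for
  // spark.sql.shuffle.partitions is "200", but the SDK's value may differ.
  private val DEFAULT_SHUFFLE_PARTITIONS = "200"

  // Assumed stand-in for the SDK helper: use the cluster's default parallelism.
  private def getParallelism(df: DataFrame): Int =
    df.sparkSession.sparkContext.defaultParallelism

  // Same decision logic as the excerpt above.
  def repartition(inputDataFrame: DataFrame): DataFrame = {
    val sparkContext    = inputDataFrame.sparkSession.sparkContext
    val parallelism     = getParallelism(inputDataFrame)
    val maxPartitions   = sparkContext.getConf.get("spark.sql.shuffle.partitions", DEFAULT_SHUFFLE_PARTITIONS).toInt
    val partitionsCount = inputDataFrame.rdd.getNumPartitions
    if (partitionsCount < parallelism || partitionsCount > maxPartitions) {
      inputDataFrame.repartition(parallelism)
    } else {
      inputDataFrame
    }
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[4]").appName("repartition-sketch").getOrCreate()
    import spark.implicits._

    // Squeeze a small DataFrame into a single partition to trigger the
    // "too few partitions" branch (1 < defaultParallelism of 4).
    val df = (1 to 1000).toDF("value").coalesce(1)
    println(s"before: ${df.rdd.getNumPartitions}")               // 1
    println(s"after:  ${repartition(df).rdd.getNumPartitions}")  // 4

    spark.stop()
  }
}

One design note: repartition(parallelism) shuffles in both directions, so the "too many partitions" branch also pays for a full shuffle. A coalesce could shrink the partition count without one, but unlike a full repartition it would not rebalance skewed partitions.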