def repartition()

in scala-spark-sdk/src/main/scala/software/amazon/sagemaker/featurestore/sparksdk/helpers/DataFrameRepartitioner.scala [38:52]


  /**
   * Conditionally repartitions the input DataFrame to the cluster's default parallelism.
   *
   * Repartitioning forces a full shuffle, so it is only performed when the current
   * partition count is outside a healthy range:
   *   1. Too few partitions — fewer than the cluster parallelism (underutilizes executors).
   *   2. Too many partitions — more than "spark.sql.shuffle.partitions" (excess task overhead).
   * Otherwise the DataFrame is returned unchanged.
   *
   * @param inputDataFrame DataFrame whose partitioning is inspected.
   * @return the DataFrame repartitioned to `getParallelism(inputDataFrame)` partitions,
   *         or the original DataFrame when no repartition is needed.
   */
  def repartition(inputDataFrame: DataFrame): DataFrame = {
    val sparkSession    = inputDataFrame.sparkSession
    val parallelism     = getParallelism(inputDataFrame)
    // Read "spark.sql.shuffle.partitions" through the session's RuntimeConfig rather than
    // sparkContext.getConf: it is a SQL conf, and values set at runtime via
    // spark.conf.set(...) are not reflected in the SparkContext's startup configuration.
    // The supplied default keeps the previous fallback behavior.
    val maxPartitions   = sparkSession.conf.get("spark.sql.shuffle.partitions", DEFAULT_SHUFFLE_PARTITIONS).toInt
    val partitionsCount = inputDataFrame.rdd.getNumPartitions

    // Repartitioning is a very costly operation, only do this when:
    // 1. Too few partitions, even less than parallelism
    // 2. Too many partitions, even greater than number of shuffle partitions
    if (partitionsCount < parallelism || partitionsCount > maxPartitions) {
      inputDataFrame.repartition(parallelism)
    } else {
      inputDataFrame
    }
  }