in datafu-spark/src/main/scala/datafu/spark/DataFrameOps.scala [36:117]
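// Implicit syntax layer: exposes the SparkDFUtils helpers as methods directly on
// DataFrame. Bringing the enclosing object into scope (import datafu.spark.DataFrameOps._)
// activates the implicit class below.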
// Lets a single Column be passed where a Seq[Column] is expected (e.g. dedupWithCombiner).
implicit def columnToColumns(c: Column): Seq[Column] = Seq(c)

implicit class someDataFrameUtils(df: DataFrame) {
  // Keeps a single row per groupCol value, the first according to orderCols.
  def dedupWithOrder(groupCol: Column, orderCols: Column*): DataFrame =
    SparkDFUtils.dedupWithOrder(df, groupCol, orderCols: _*)

  // Keeps the top n rows per groupCol value, ordered by orderCols.
  def dedupTopN(n: Int, groupCol: Column, orderCols: Column*): DataFrame =
    SparkDFUtils.dedupTopN(df, n, groupCol, orderCols: _*)
  // Same result as dedupWithOrder, but implemented with an aggregation that uses
  // map-side combining, which tends to behave better when groupCol is skewed.
  def dedupWithCombiner(groupCol: Seq[Column],
                        orderByCol: Seq[Column],
                        desc: Boolean = true,
                        moreAggFunctions: Seq[Column] = Nil,
                        columnsFilter: Seq[String] = Nil,
                        columnsFilterKeep: Boolean = true): DataFrame =
    SparkDFUtils.dedupWithCombiner(df,
                                   groupCol,
                                   orderByCol,
                                   desc,
                                   moreAggFunctions,
                                   columnsFilter,
                                   columnsFilterKeep)
  // Drops duplicates by grouping on every column except ignoredColumn, which is
  // aggregated with aggFunction (max by default).
  def dedupByAllExcept(ignoredColumn: String,
                       aggFunction: String => Column = org.apache.spark.sql.functions.max): DataFrame =
    SparkDFUtils.dedupByAllExcept(df, ignoredColumn, aggFunction)
  // Flattens a struct column so its fields become top-level columns.
  def flatten(colName: String): DataFrame = SparkDFUtils.flatten(df, colName)

  // Renames the columns to the names given in newScheme.
  def changeSchema(newScheme: String*): DataFrame =
    SparkDFUtils.changeSchema(df, newScheme: _*)
  // Joins this DataFrame's point column (colSingle) against a table of
  // [colRangeStart, colRangeEnd] ranges. Note that ^ is bitwise XOR in Scala,
  // so the default DECREASE_FACTOR of 2 ^ 8 evaluates to 10, not 256.
  def joinWithRange(colSingle: String,
                    dfRange: DataFrame,
                    colRangeStart: String,
                    colRangeEnd: String,
                    DECREASE_FACTOR: Long = 2 ^ 8): DataFrame =
    SparkDFUtils.joinWithRange(df,
                               colSingle,
                               dfRange,
                               colRangeStart,
                               colRangeEnd,
                               DECREASE_FACTOR)
  // Same range join, but also deduplicates the matches so each point keeps a
  // single range; dedupSmallRange chooses whether the smaller range wins.
  def joinWithRangeAndDedup(colSingle: String,
                            dfRange: DataFrame,
                            colRangeStart: String,
                            colRangeEnd: String,
                            DECREASE_FACTOR: Long = 2 ^ 8,
                            dedupSmallRange: Boolean = true): DataFrame =
    SparkDFUtils.joinWithRangeAndDedup(df,
                                       colSingle,
                                       dfRange,
                                       colRangeStart,
                                       colRangeEnd,
                                       DECREASE_FACTOR,
                                       dedupSmallRange)
  // Skew-aware join: the most frequent values of joinCol are handled with a
  // broadcast join, the remaining values with a regular join.
  def broadcastJoinSkewed(skewed: DataFrame,
                          joinCol: String,
                          numberCustsToBroadcast: Int,
                          filterCnt: Option[Long] = None,
                          joinType: String = "inner"): DataFrame =
    SparkDFUtils.broadcastJoinSkewed(df,
                                     skewed,
                                     joinCol,
                                     numberCustsToBroadcast,
                                     filterCnt,
                                     joinType)
  // Salted join for skewed keys; numShards controls how finely the data is sharded.
  def joinSkewed(notSkewed: DataFrame,
                 joinExprs: Column,
                 numShards: Int = 1000,
                 joinType: String = "inner"): DataFrame =
    SparkDFUtils.joinSkewed(df, notSkewed, joinExprs, numShards, joinType)
  // Explodes an array column into separate columns named after the given alias.
  def explodeArray(arrayCol: Column, alias: String) =
    SparkDFUtils.explodeArray(df, arrayCol, alias)
  // Keeps up to maxSize randomly sampled rows per groupCol value. Unlike the
  // other methods here, the DataFrame to dedup is passed explicitly, shadowing
  // the wrapped df.
  def dedupRandomN(df: DataFrame, groupCol: Column, maxSize: Int): DataFrame =
    SparkDFUtils.dedupRandomN(df, groupCol, maxSize)
}
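
A minimal usage sketch (not part of the file above; the events DataFrame and its user_id / event_time columns are hypothetical):

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import datafu.spark.DataFrameOps._

// keep the most recent row per user
def latestPerUser(events: DataFrame): DataFrame =
  events.dedupWithOrder(col("user_id"), col("event_time").desc)

// keep the three most recent rows per user
def top3PerUser(events: DataFrame): DataFrame =
  events.dedupTopN(3, col("user_id"), col("event_time").desc)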