in measure/src/main/scala/org/apache/griffin/measure/execution/impl/ProfilingMeasure.scala [236:276]
/**
 * Builds the flat list of profiling columns for a single field.
 *
 * The result alternates a literal label column with the expression aliased to that same label,
 * in this fixed order: data type, total, min/max/avg length, min, max, avg, standard deviation,
 * variance, kurtosis, distinct count (optional), null count.
 *
 * @param field field to profile; its name selects the column, its data type is reported
 * @param roundScale scale handed to `bround` for avg, stddev, variance and kurtosis
 * @param approxDistinctCount when true, `approx_count_distinct` is used and the label carries
 *                            `ApproxPrefix`; otherwise `countDistinct` is used
 * @param dataSetSample the distinct-count pair is emitted only when this equals 1
 * @return label and expression columns as one flat sequence
 */
private def getProfilingExprs(
    field: StructField,
    roundScale: Int,
    approxDistinctCount: Boolean,
    dataSetSample: Double): Seq[Column] = {
  val fieldName = field.name
  val fieldType = field.dataType
  val value = col(fieldName)
  // These two columns are referenced purely by the names the helper functions return.
  // NOTE(review): presumably they are added to the dataset before aggregation - confirm.
  val valueLength = col(lengthColFn(fieldName))
  val nullFlag = col(nullsInColFn(fieldName))

  // All rounded statistics share the same scale.
  def rounded(aggregate: Column): Column = bround(aggregate, roundScale)

  val generalStats = Seq(
    lit(DataTypeStr),
    lit(fieldType.catalogString).as(DataTypeStr),
    lit(Total),
    sum(lit(1)).as(Total))

  val lengthStats = Seq(
    lit(MinColLength),
    min(valueLength).as(MinColLength),
    lit(MaxColLength),
    max(valueLength).as(MaxColLength),
    lit(AvgColLength),
    avg(valueLength).as(AvgColLength))

  // forNumericFn receives the column type, the aggregate and the label.
  // NOTE(review): presumably it guards these aggregates for non-numeric types - confirm.
  val valueStats = Seq(
    lit(Min),
    forNumericFn(fieldType, min(value), Min),
    lit(Max),
    forNumericFn(fieldType, max(value), Max),
    lit(Avg),
    forNumericFn(fieldType, rounded(avg(value)), Avg),
    lit(StdDeviation),
    forNumericFn(fieldType, rounded(stddev(value)), StdDeviation),
    lit(Variance),
    forNumericFn(fieldType, rounded(variance(value)), Variance),
    lit(Kurtosis),
    forNumericFn(fieldType, rounded(kurtosis(value)), Kurtosis))

  // The distinct count is only reported when dataSetSample is exactly 1.
  val distinctStats: Seq[Column] =
    if (dataSetSample != 1) {
      Nil
    } else if (approxDistinctCount) {
      Seq(
        lit(s"$ApproxPrefix$DistinctCount"),
        approx_count_distinct(value).as(s"$ApproxPrefix$DistinctCount"))
    } else {
      Seq(lit(DistinctCount), countDistinct(value).as(DistinctCount))
    }

  val nullStats = Seq(lit(NullCount), sum(nullFlag).as(NullCount))

  generalStats ++ lengthStats ++ valueStats ++ distinctStats ++ nullStats
}