in src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala [374:441]
/**
 * Gathers the statistics that are extracted for every profiled column, independent of its
 * type, from the metrics held by `results`.
 *
 * @param columns names of the columns to report schema-derived ("known") types for
 * @param schema schema whose fields supply the declared type of each non-string column
 * @param results analyzer context whose `metricMap` holds the computed metrics
 * @param predefinedTypes column name -> type fixed by the caller; these columns are left out of
 *                        the inferred types, the type-detection histograms and the known types,
 *                        and the map itself is passed through to the result unchanged
 * @return the record count, inferred types, known types, type-detection histograms, approximate
 *         distinct counts and completeness values bundled in a [[GenericColumnStatistics]]
 * @throws NoSuchElementException if `results` contains no `Size` metric
 */
private[this] def extractGenericStatistics(
    columns: Seq[String],
    schema: StructType,
    results: AnalyzerContext,
    predefinedTypes: Map[String, DataTypeInstances.Value] = Map.empty)
  : GenericColumnStatistics = {

  val metrics = results.metricMap

  // NOTE: `metric.value.get` is deliberately left unguarded throughout, as before: a metric
  // that holds no value makes this method throw instead of yielding a partial profile.

  // Only the first Size metric is needed, so only that one is unwrapped. Its absence is
  // reported with an explicit message rather than the bare error of `.head` on an empty result.
  val numRecords = metrics
    .collectFirst { case (_: Size, metric: DoubleMetric) => metric.value.get.toLong }
    .getOrElse {
      throw new NoSuchElementException(
        "Cannot extract generic statistics: no Size metric found in the analysis results")
    }

  // Type-detection histograms of all columns whose type was not predefined, collected once and
  // reused for both the inferred types and the raw per-type counts below.
  val typeHistograms = metrics.collect {
    case (analyzer: DataType, metric: HistogramMetric)
      if !predefinedTypes.contains(analyzer.column) =>
      analyzer.column -> metric.value.get
  }

  val inferredTypes = typeHistograms.map { case (column, histogram) =>
    column -> DataTypeHistogram.determineType(histogram)
  }

  val typeDetectionHistograms = typeHistograms.map { case (column, histogram) =>
    val typeCounts = histogram.values
      .map { case (key, distValue) => key -> distValue.absolute }
    column -> typeCounts
  }

  val approximateNumDistincts = metrics
    .collect { case (analyzer: ApproxCountDistinct, metric: DoubleMetric) =>
      analyzer.column -> metric.value.get.toLong
    }

  val completenesses = metrics
    .collect { case (analyzer: Completeness, metric: DoubleMetric) =>
      analyzer.column -> metric.value.get
    }

  // Set lookup instead of a linear Seq scan for every schema field.
  val requestedColumns = columns.toSet

  // String columns are skipped here; columns with a predefined type are skipped as well.
  val knownTypes = schema.fields
    .filter { field =>
      requestedColumns.contains(field.name) &&
        !predefinedTypes.contains(field.name) &&
        field.dataType != StringType
    }
    .map { field =>
      val knownType = field.dataType match {
        case ShortType | LongType | IntegerType => Integral
        case DecimalType() | FloatType | DoubleType => Fractional
        case BooleanType => Boolean
        case TimestampType => String // TODO We should have support for dates in deequ...
        case _ =>
          println(s"Unable to map type ${field.dataType}")
          Unknown
      }
      field.name -> knownType
    }
    .toMap

  GenericColumnStatistics(numRecords, inferredTypes, knownTypes, typeDetectionHistograms,
    approximateNumDistincts, completenesses, predefinedTypes)
}