private[this] def extractGenericStatistics()

in src/main/scala/com/amazon/deequ/profiles/ColumnProfiler.scala [374:441]


  /**
   * Assembles the generic (type-independent) statistics of a profiling run from the
   * metrics found in `results` and from the column types declared in `schema`.
   *
   * @param columns         names of the columns for which a known type is derived from `schema`
   * @param schema          schema whose fields supply the declared type of each column
   * @param results         analyzer context whose `metricMap` is scanned for `Size`, `DataType`,
   *                        `ApproxCountDistinct` and `Completeness` metrics
   * @param predefinedTypes columns listed here are left out of both the inferred and the known
   *                        types; the map itself is passed through unchanged to the result
   * @return the collected statistics as a `GenericColumnStatistics`
   * @throws NoSuchElementException if `results` contains no `Size` metric
   */
  private[this] def extractGenericStatistics(
      columns: Seq[String],
      schema: StructType,
      results: AnalyzerContext,
      predefinedTypes: Map[String, DataTypeInstances.Value] = Map.empty)
    : GenericColumnStatistics = {

    // NOTE(review): `metric.value.get` is used throughout; presumably it rethrows the
    // underlying error when a metric could not be computed -- confirm against the metric type.

    // Stop at the first Size metric instead of materialising every match, and fail with a
    // message that names the missing metric rather than a bare "head of empty" error.
    val numRecords = results.metricMap
      .collectFirst { case (_: Size, metric: DoubleMetric) => metric.value.get.toLong }
      .getOrElse {
        throw new NoSuchElementException(
          "No Size metric found in the analysis results; cannot determine the number of records")
      }

    // Single pass over the metrics: type-detection histograms for every column whose type
    // was not predefined. Both derived maps below are built from this one result.
    val typeHistograms = results.metricMap
      .collect {
        case (analyzer: DataType, metric: HistogramMetric)
            if !predefinedTypes.contains(analyzer.column) =>
          analyzer.column -> metric.value.get
      }

    val inferredTypes = typeHistograms
      .map { case (column, histogram) =>
        column -> DataTypeHistogram.determineType(histogram)
      }

    val typeDetectionHistograms = typeHistograms
      .map { case (column, histogram) =>
        val typeCounts = histogram.values
          .map { case (key, distValue) => key -> distValue.absolute }
        column -> typeCounts
      }

    val approximateNumDistincts = results.metricMap
      .collect { case (analyzer: ApproxCountDistinct, metric: DoubleMetric) =>
        analyzer.column -> metric.value.get.toLong
      }

    val completenesses = results.metricMap
      .collect { case (analyzer: Completeness, metric: DoubleMetric) =>
        analyzer.column -> metric.value.get
      }

    // Set lookup instead of a linear scan of `columns` for every schema field.
    val requestedColumns = columns.toSet

    // String columns are skipped here; their types come from the type-detection histograms.
    val knownTypes = schema.fields
      .filter { field =>
        requestedColumns.contains(field.name) &&
          !predefinedTypes.contains(field.name) &&
          field.dataType != StringType
      }
      .map { field =>
        val knownType = field.dataType match {
          case ShortType | LongType | IntegerType => Integral
          case DecimalType() | FloatType | DoubleType => Fractional
          case BooleanType => Boolean
          case TimestampType => String // TODO We should have support for dates in deequ...
          case _ =>
            // Unmapped Spark types are reported on stdout and recorded as Unknown.
            println(s"Unable to map type ${field.dataType}")
            Unknown
        }

        field.name -> knownType
      }
      .toMap

    GenericColumnStatistics(numRecords, inferredTypes, knownTypes, typeDetectionHistograms,
      approximateNumDistincts, completenesses, predefinedTypes)
  }