in src/main/scala/com/amazon/deequ/suggestions/ConstraintSuggestionRunner.scala [161:209]
/**
 * Profiles `trainingData` and derives constraint suggestions from the resulting profiles.
 *
 * A column profiler run is configured from the arguments and executed; the profiles it
 * returns are handed to `applyRules` together with the relevant columns.
 *
 * @param trainingData data given to the profiler; its schema is also passed to
 *                     `getRelevantColumns`
 * @param constraintRules rules forwarded to `applyRules`
 * @param restrictToColumns when defined, forwarded to the profiler's `restrictToColumns`;
 *                          always forwarded to `getRelevantColumns`
 * @param lowCardinalityHistogramThreshold forwarded to
 *                                         `withLowCardinalityHistogramThreshold`
 * @param printStatusUpdates forwarded to the profiler's `printStatusUpdates`
 * @param metricsRepositoryOptions when its `metricsRepository` is defined, the profiler is
 *                                 attached to it via `useRepository`, and the optional
 *                                 reuse / save-or-append keys are applied; otherwise no
 *                                 repository is used
 * @param kllParameters forwarded to `setKLLParameters`
 * @param predefinedTypes forwarded to `setPredefinedTypes`
 * @return the profiles produced by the run, paired with the suggestions from `applyRules`
 */
private[suggestions] def profileAndSuggest(
    trainingData: DataFrame,
    constraintRules: Seq[ConstraintRule[ColumnProfile]],
    restrictToColumns: Option[Seq[String]],
    lowCardinalityHistogramThreshold: Int,
    printStatusUpdates: Boolean,
    metricsRepositoryOptions: ConstraintSuggestionMetricsRepositoryOptions,
    kllParameters: Option[KLLParameters],
    predefinedTypes: Map[String, DataTypeInstances.Value])
  : (ColumnProfiles, Seq[ConstraintSuggestion]) = {

  // Settings that are applied unconditionally at the start of the configuration.
  val baseRunner = ColumnProfilerRunner()
    .onData(trainingData)
    .printStatusUpdates(printStatusUpdates)
    .withLowCardinalityHistogramThreshold(lowCardinalityHistogramThreshold)

  // Narrow the profiled columns only if the caller supplied a restriction.
  val columnScopedRunner = restrictToColumns match {
    case Some(columns) => baseRunner.restrictToColumns(columns)
    case None => baseRunner
  }

  val parameterizedRunner = columnScopedRunner
    .setKLLParameters(kllParameters)
    .setPredefinedTypes(predefinedTypes)

  // Repository-specific settings are applied only when a repository is present;
  // the reuse key is applied before the save-or-append key.
  val configuredRunner = metricsRepositoryOptions.metricsRepository match {
    case Some(repository) =>
      val attachedRunner = parameterizedRunner.useRepository(repository)

      val reusingRunner = metricsRepositoryOptions.reuseExistingResultsKey match {
        case Some(reuseKey) =>
          attachedRunner.reuseExistingResultsForKey(
            reuseKey,
            metricsRepositoryOptions.failIfResultsForReusingMissing)
        case None => attachedRunner
      }

      metricsRepositoryOptions.saveOrAppendResultsKey match {
        case Some(saveKey) => reusingRunner.saveOrAppendResult(saveKey)
        case None => reusingRunner
      }

    case None => parameterizedRunner
  }

  val columnProfiles = configuredRunner.run()

  val columnsToConsider = getRelevantColumns(trainingData.schema, restrictToColumns)
  val constraintSuggestions = applyRules(constraintRules, columnProfiles, columnsToConsider)

  (columnProfiles, constraintSuggestions)
}