in src/main/scala/com/amazon/deequ/profiles/ColumnProfilerRunner.scala [43:86]
/**
 * Profiles `data` with `ColumnProfiler.profile`, passes the resulting profiles to
 * `saveColumnProfilesJsonToFileSystemIfNecessary`, and returns them.
 *
 * When `cacheInputs` is true, `data` is cached before profiling and unpersisted again
 * afterwards. The unpersist runs in a `finally` block, so the cache is released even when
 * profiling or writing the output throws; the exception still propagates to the caller.
 *
 * @param data                             DataFrame to profile
 * @param restrictToColumns                forwarded to `ColumnProfiler.profile`
 * @param lowCardinalityHistogramThreshold forwarded to `ColumnProfiler.profile`
 * @param printStatusUpdates               forwarded to both the profiler and the file-output step
 * @param cacheInputs                      whether to cache `data` for the duration of this call
 * @param fileOutputOptions                forwarded to `saveColumnProfilesJsonToFileSystemIfNecessary`
 * @param metricsRepositoryOptions         source of the repository and result keys handed to the
 *                                         profiler
 * @param kllProfiling                     forwarded to `ColumnProfiler.profile`
 * @param kllParameters                    forwarded to `ColumnProfiler.profile`
 * @param predefinedTypes                  forwarded to `ColumnProfiler.profile`
 * @return the profiles produced by `ColumnProfiler.profile`
 */
private[profiles] def run(
    data: DataFrame,
    restrictToColumns: Option[Seq[String]],
    lowCardinalityHistogramThreshold: Int,
    printStatusUpdates: Boolean,
    cacheInputs: Boolean,
    fileOutputOptions: ColumnProfilerRunBuilderFileOutputOptions,
    metricsRepositoryOptions: ColumnProfilerRunBuilderMetricsRepositoryOptions,
    kllProfiling: Boolean,
    kllParameters: Option[KLLParameters],
    predefinedTypes: Map[String, DataTypeInstances.Value])
  : ColumnProfiles = {

  if (cacheInputs) {
    // NOTE(review): if the caller had already cached `data`, the unpersist below also drops
    // the caller's cache -- this matches the previous behavior; confirm it is intended.
    data.cache()
  }

  try {
    // Argument order follows ColumnProfiler.profile: printStatusUpdates comes before the
    // histogram threshold, unlike in this method's own parameter list.
    val columnProfiles = ColumnProfiler
      .profile(
        data,
        restrictToColumns,
        printStatusUpdates,
        lowCardinalityHistogramThreshold,
        metricsRepositoryOptions.metricsRepository,
        metricsRepositoryOptions.reuseExistingResultsKey,
        metricsRepositoryOptions.failIfResultsForReusingMissing,
        metricsRepositoryOptions.saveOrAppendResultsKey,
        kllProfiling,
        kllParameters,
        predefinedTypes
      )

    saveColumnProfilesJsonToFileSystemIfNecessary(
      fileOutputOptions,
      printStatusUpdates,
      columnProfiles
    )

    columnProfiles
  } finally {
    // Release the cache on both the success and the failure path.
    if (cacheInputs) {
      data.unpersist()
    }
  }
}