in tensorflow_data_validation/anomalies/feature_statistics_validator.cc [101:172]
// Validates `feature_statistics` against `schema_proto` and reports the
// outcome through `*result`.
//
// When the statistics report zero examples, the schema is copied into the
// baseline of `*result`, `data_missing` is set, and no anomaly detection is
// run. Otherwise the optional previous-span, serving and previous-version
// statistics are each wrapped in a DatasetStatsView (when present) and attached
// to the view of the current statistics; SchemaAnomalies::FindChanges is run
// on that view and `*result` is overwritten with the resulting schema diff.
//
// Whether weighted statistics are used is decided once, from
// `feature_statistics`, and that choice is applied to every view built here.
//
// Returns the error from FindChanges if it fails, OK otherwise.
tensorflow::Status ValidateFeatureStatistics(
    const tensorflow::metadata::v0::DatasetFeatureStatistics&
        feature_statistics,
    const tensorflow::metadata::v0::Schema& schema_proto,
    const absl::optional<string>& environment,
    const absl::optional<tensorflow::metadata::v0::DatasetFeatureStatistics>&
        prev_span_feature_statistics,
    const absl::optional<tensorflow::metadata::v0::DatasetFeatureStatistics>&
        serving_feature_statistics,
    const absl::optional<metadata::v0::DatasetFeatureStatistics>&
        prev_version_feature_statistics,
    const absl::optional<FeaturesNeeded>& features_needed,
    const ValidationConfig& validation_config, bool enable_diff_regions,
    tensorflow::metadata::v0::Anomalies* result) {
  // TODO(b/113295423): Clean up the optional conversions.
  const absl::optional<string> env_copy = environment;

  // Translate the caller's validation settings into the config consumed by
  // FindChanges.
  FeatureStatisticsToProtoConfig proto_config;
  proto_config.set_enum_threshold(kDefaultEnumThreshold);
  proto_config.set_new_features_are_warnings(
      validation_config.new_features_are_warnings());
  *proto_config.mutable_severity_overrides() =
      validation_config.severity_overrides();

  const bool use_weights =
      DatasetStatsView(feature_statistics).WeightedStatisticsExist();

  // Nothing to validate: hand back the schema as the baseline and flag the
  // data as missing.
  if (feature_statistics.num_examples() == 0) {
    *result->mutable_baseline() = schema_proto;
    result->set_data_missing(true);
    return tensorflow::Status::OK();
  }

  SchemaAnomalies anomalies(schema_proto);

  // Wraps an optional comparison dataset in a stand-alone view (one that has
  // no linked datasets of its own), or yields nullptr when it is absent.
  const auto make_comparison_view =
      [&use_weights, &env_copy](
          const absl::optional<
              tensorflow::metadata::v0::DatasetFeatureStatistics>& stats)
      -> std::shared_ptr<DatasetStatsView> {
    if (!stats) {
      return nullptr;
    }
    return std::make_shared<DatasetStatsView>(stats.value(), use_weights,
                                              env_copy,
                                              /* previous_span= */ nullptr,
                                              /* serving= */ nullptr,
                                              /* previous_version= */ nullptr);
  };

  std::shared_ptr<DatasetStatsView> prev_span_view =
      make_comparison_view(prev_span_feature_statistics);
  std::shared_ptr<DatasetStatsView> serving_view =
      make_comparison_view(serving_feature_statistics);
  std::shared_ptr<DatasetStatsView> prev_version_view =
      make_comparison_view(prev_version_feature_statistics);

  // View of the statistics under validation, linked to the comparison views.
  const DatasetStatsView training_view(feature_statistics, use_weights,
                                       env_copy, prev_span_view, serving_view,
                                       prev_version_view);

  TF_RETURN_IF_ERROR(
      anomalies.FindChanges(training_view, features_needed, proto_config));
  *result = anomalies.GetSchemaDiff(enable_diff_regions);
  return tensorflow::Status::OK();
}