tensorflow::Status ValidateFeatureStatistics()

in tensorflow_data_validation/anomalies/feature_statistics_validator.cc [101:172]


// Compares `feature_statistics` against `schema_proto` and stores the outcome
// in `result`.
//
// If `feature_statistics` reports zero examples, `result` receives
// `schema_proto` as its baseline and is flagged with data_missing; nothing
// else is checked. Otherwise a training DatasetStatsView is built (linked to
// views over whichever of the previous-span, serving and previous-version
// statistics were supplied), SchemaAnomalies::FindChanges is run on it, and
// `result` is overwritten with the resulting schema diff.
//
// Returns the error from FindChanges if it fails, OK otherwise.
tensorflow::Status ValidateFeatureStatistics(
    const tensorflow::metadata::v0::DatasetFeatureStatistics&
        feature_statistics,
    const tensorflow::metadata::v0::Schema& schema_proto,
    const absl::optional<string>& environment,
    const absl::optional<tensorflow::metadata::v0::DatasetFeatureStatistics>&
        prev_span_feature_statistics,
    const absl::optional<tensorflow::metadata::v0::DatasetFeatureStatistics>&
        serving_feature_statistics,
    const absl::optional<metadata::v0::DatasetFeatureStatistics>&
        prev_version_feature_statistics,
    const absl::optional<FeaturesNeeded>& features_needed,
    const ValidationConfig& validation_config, bool enable_diff_regions,
    tensorflow::metadata::v0::Anomalies* result) {
  // TODO(b/113295423): Clean up the optional conversions.
  const absl::optional<string> environment_copy = environment;

  // Settings handed to FindChanges: the default enum threshold plus the
  // warning/severity choices carried by `validation_config`.
  FeatureStatisticsToProtoConfig to_proto_config;
  to_proto_config.set_enum_threshold(kDefaultEnumThreshold);
  to_proto_config.set_new_features_are_warnings(
      validation_config.new_features_are_warnings());
  *to_proto_config.mutable_severity_overrides() =
      validation_config.severity_overrides();

  // Every view created below uses the same weighting choice, taken from the
  // statistics being validated.
  const bool by_weight =
      DatasetStatsView(feature_statistics).WeightedStatisticsExist();

  // Nothing to validate: hand back the schema untouched and flag the gap.
  if (feature_statistics.num_examples() == 0) {
    *result->mutable_baseline() = schema_proto;
    result->set_data_missing(true);
    return tensorflow::Status::OK();
  }

  // Wraps an optional comparison dataset in a standalone view (one that has
  // no comparison datasets of its own); yields nullptr when it is absent.
  const auto make_comparison_view =
      [&by_weight, &environment_copy](
          const absl::optional<
              tensorflow::metadata::v0::DatasetFeatureStatistics>& stats)
      -> std::shared_ptr<DatasetStatsView> {
    if (!stats) {
      return nullptr;
    }
    return std::make_shared<DatasetStatsView>(
        *stats, by_weight, environment_copy,
        /* previous_span= */ nullptr,
        /* serving= */ nullptr,
        /* previous_version= */ nullptr);
  };

  std::shared_ptr<DatasetStatsView> previous_span =
      make_comparison_view(prev_span_feature_statistics);
  std::shared_ptr<DatasetStatsView> serving =
      make_comparison_view(serving_feature_statistics);
  std::shared_ptr<DatasetStatsView> previous_version =
      make_comparison_view(prev_version_feature_statistics);

  const DatasetStatsView training(feature_statistics, by_weight,
                                  environment_copy, previous_span, serving,
                                  previous_version);

  SchemaAnomalies schema_anomalies(schema_proto);
  TF_RETURN_IF_ERROR(schema_anomalies.FindChanges(training, features_needed,
                                                  to_proto_config));
  *result = schema_anomalies.GetSchemaDiff(enable_diff_regions);
  return tensorflow::Status::OK();
}