std::vector<Description> UpdateBoolDomain()

in tensorflow_data_validation/anomalies/bool_domain_util.cc [180:310]


std::vector<Description> UpdateBoolDomain(const FeatureStatsView& feature_stats,
                                          Feature* feature) {
  std::vector<Description> descriptions;
  switch (feature_stats.type()) {
    case FeatureNameStatistics::BYTES:
      LOG(ERROR) << "Should not call UpdateBoolDomain with BYTES";
      DCHECK(false);
      return {};
    case FeatureNameStatistics::INT: {
      const NumericStatistics& numeric_statistics = feature_stats.num_stats();
      if (numeric_statistics.min() < 0.0) {
        IntDomain* int_domain = feature->mutable_int_domain();
        int_domain->set_max(numeric_statistics.max());
        int_domain->set_min(numeric_statistics.min());
        return {{tensorflow::metadata::v0::AnomalyInfo::BOOL_TYPE_SMALL_INT,
                 kNonBooleanValues,
                 absl::StrCat("Integers (such as ",
                              absl::SixDigits(numeric_statistics.min()),
                              ") not in {0, 1}: converting to an integer.")}};
      }
      if (numeric_statistics.max() > 1.0) {
        IntDomain* int_domain = feature->mutable_int_domain();
        int_domain->set_max(numeric_statistics.max());
        int_domain->set_min(numeric_statistics.min());
        return {{tensorflow::metadata::v0::AnomalyInfo::BOOL_TYPE_BIG_INT,
                 kNonBooleanValues,
                 absl::StrCat("Integers (such as ",
                              absl::SixDigits(numeric_statistics.max()),
                              ") not in {0, 1}: converting to an integer.")}};
      }
      return {};
    }
    case FeatureNameStatistics::FLOAT: {
      const NumericStatistics& numeric_statistics = feature_stats.num_stats();
      auto set_float_domain = [](const NumericStatistics& numeric_statistics,
                                 Feature* feature) -> void {
        FloatDomain* float_domain = feature->mutable_float_domain();
        float_domain->set_max(numeric_statistics.max());
        float_domain->set_min(numeric_statistics.min());
      };
      if (numeric_statistics.min() != 0.0 && numeric_statistics.min() != 1.0) {
        set_float_domain(numeric_statistics, feature);
        return {
            {tensorflow::metadata::v0::AnomalyInfo::BOOL_TYPE_UNEXPECTED_FLOAT,
             kNonBooleanValues,
             absl::StrCat("Floats (such as ",
                          absl::SixDigits(numeric_statistics.min()),
                          ") not in {0, 1}: converting to float_domain.")}};
      }
      if (numeric_statistics.max() != 0.0 && numeric_statistics.max() != 1.0) {
        set_float_domain(numeric_statistics, feature);
        return {
            {tensorflow::metadata::v0::AnomalyInfo::BOOL_TYPE_UNEXPECTED_FLOAT,
             kNonBooleanValues,
             absl::StrCat("Floats (such as ",
                          absl::SixDigits(numeric_statistics.max()),
                          ") not in {0, 1}: converting to float_domain.")}};
      }
      for (const auto& histogram : numeric_statistics.histograms()) {
        // Any non-empty boundary should include 0 or 1, otherwise the feature
        // must not be boolean. Note: if histograms are not computed, or there
        // are values inside the range 0 to 1, invalid bool_domain values will
        // not be detected.
        if (histogram.num_nan() > 0) {
          set_float_domain(numeric_statistics, feature);
          return {{tensorflow::metadata::v0::AnomalyInfo::
                       BOOL_TYPE_UNEXPECTED_FLOAT,
                   kNonBooleanValues,
                   absl::StrCat("Floats (such as NaN) not in {0, 1}: "
                                "converting to float_domain.")}};
        }
        for (const auto& bucket : histogram.buckets()) {
          if (bucket.sample_count() <= 0) {
            continue;
          }
          if (bucket.high_value() < 0) {
            set_float_domain(numeric_statistics, feature);
            return {{tensorflow::metadata::v0::AnomalyInfo::
                         BOOL_TYPE_UNEXPECTED_FLOAT,
                     kNonBooleanValues,
                     absl::StrCat("Float values < 0 not in {0, 1}: converting "
                                  "to float_domain.")}};
          } else if (bucket.low_value() > 1) {
            set_float_domain(numeric_statistics, feature);
            return {{tensorflow::metadata::v0::AnomalyInfo::
                         BOOL_TYPE_UNEXPECTED_FLOAT,
                     kNonBooleanValues,
                     absl::StrCat("Float values > 1 not in {0, 1}: converting "
                                  "to float_domain.")}};
          } else if (histogram.type() == metadata::v0::Histogram::QUANTILES &&
                     bucket.high_value() < 1 && bucket.low_value() > 0) {
            set_float_domain(numeric_statistics, feature);
            return {{tensorflow::metadata::v0::AnomalyInfo::
                         BOOL_TYPE_UNEXPECTED_FLOAT,
                     kNonBooleanValues,
                     absl::StrCat("Float values falling between 0 and 1: "
                                  "converting to float_domain.")}};
          }
        }
      }
      return {};
    }
    case FeatureNameStatistics::STRING: {
      const BoolDomain& bool_domain = feature->bool_domain();
      const std::set<string> valid_strings =
          BoolDomainValidStrings(bool_domain);
      const std::vector<string> string_values = feature_stats.GetStringValues();
      for (const string& str : string_values) {
        if (!ContainsKey(valid_strings, str)) {
          // We might be able to replace this with an enum, but since it is
          // in all likelihood an error, let's just wipe the bool_domain.
          const string valid_strings_desc =
              BoolDomainValidStringsDescription(bool_domain);
          // Note that this clears the oneof field domain_info.
          feature->clear_bool_domain();
          return {{tensorflow::metadata::v0::AnomalyInfo::
                       BOOL_TYPE_UNEXPECTED_STRING,
                   kNonBooleanValues,
                   absl::StrCat("Saw unexpected value \"", str,
                                "\" instead of ", valid_strings_desc, ".")}};
        }
      }
      return {};
    }
    default:
      LOG(ERROR) << "Should not be here with unknown type: "
                 << feature_stats.type();
      DCHECK(false);
      return {};
  }
}