UpdateSummary UpdateStringDomain()

in tensorflow_data_validation/anomalies/string_domain_util.cc [155:215]


// Checks the string values observed in `stats` against `string_domain` and
// reports anomalies in the returned UpdateSummary.
//
// Behavior, in order:
//   1. If `stats` reports invalid UTF8 strings, a single
//      ENUM_TYPE_INVALID_UTF8 description is recorded, `clear_field` is set
//      and the function returns immediately without touching `string_domain`.
//   2. Otherwise the values returned by StringDomainGetMissing() are summed.
//      If their fraction of the total value count exceeds `max_off_domain`
//      (or `max_off_domain` is exactly 0 and anything at all is missing), an
//      ENUM_TYPE_UNEXPECTED_STRING_VALUES description listing each value and
//      its percentage is recorded and StringDomainAddMissing() is invoked on
//      `string_domain`.
//   3. Finally, if `updater` considers the resulting domain size too big, an
//      INVALID_DOMAIN_SPECIFICATION description is recorded and `clear_field`
//      is set.
//
// Args:
//   updater: consulted via string_domain_too_big() for the size limit.
//   stats: statistics of the feature being validated.
//   max_off_domain: maximum tolerated fraction of values outside the domain.
//   string_domain: the domain to check; may be modified in step 2.
//
// Returns the accumulated descriptions and the `clear_field` flag.
UpdateSummary UpdateStringDomain(const Schema::Updater& updater,
                                 const FeatureStatsView& stats,
                                 double max_off_domain,
                                 StringDomain* string_domain) {
  UpdateSummary summary;
  if (stats.HasInvalidUTF8Strings()) {
    summary.descriptions.push_back(
        {tensorflow::metadata::v0::AnomalyInfo::ENUM_TYPE_INVALID_UTF8,
         "Invalid UTF8 strings",
         "Found strings that were not valid UTF8 strings."
         " If this enum previously included __BYTE_VALUES__, it was"
         " created in error, and this is an alert to fix it."});
    summary.clear_field = true;
    return summary;
  }
  const std::map<string, double> missing =
      StringDomainGetMissing(stats, *string_domain);
  // Total number of values in the dataset that do not appear in the schema.
  const double missing_count = absl::c_accumulate(
      missing, /*init=*/0.0,
      [](double count, const std::pair<const string, double>& p) -> double {
        return count + p.second;
      });
  const double total_value_count = stats.GetTotalValueCountInExamples();
  // The second clause makes a zero tolerance strict: any missing value is
  // reported even if its fraction does not compare greater than 0.
  // NOTE(review): total_value_count is not checked for zero before dividing;
  // confirm callers never reach here with an empty feature.
  if ((missing_count / total_value_count) > max_off_domain ||
      (max_off_domain == 0 && !missing.empty())) {
    const Description description = {
        tensorflow::metadata::v0::AnomalyInfo::
            ENUM_TYPE_UNEXPECTED_STRING_VALUES,
        "Unexpected string values",
        absl::StrCat(
            "Examples contain values missing from the schema: ",
            absl::StrJoin(
                missing, ", ",
                // Bind to the map's real element type. A mismatched pair type
                // here would force a converting copy of every entry and
                // truncate the double count to an integer.
                [total_value_count](
                    string* out,
                    const std::pair<const string, double>& value_and_freq) {
                  absl::StrAppend(
                      out,
                      Printf(
                          "%s (%s)",
                          absl::Utf8SafeCEscape(value_and_freq.first).c_str(),
                          PercentageAsString(value_and_freq.second,
                                             total_value_count)
                              .c_str()));
                }),
            ". ")};
    summary.descriptions.push_back(description);
    StringDomainAddMissing(missing, string_domain);
  }
  // Measured after the possible StringDomainAddMissing() call above, so the
  // size check sees the domain as it stands at the end of this update.
  const int domain_size = string_domain->value().size();
  if (updater.string_domain_too_big(domain_size)) {
    summary.clear_field = true;

    summary.descriptions.push_back(
        {tensorflow::metadata::v0::AnomalyInfo::INVALID_DOMAIN_SPECIFICATION,
         "String domain has too many values",
         Printf("String domain has too many values (%d).", domain_size)});
  }
  return summary;
}