in tensorflow_data_validation/anomalies/string_domain_util.cc [155:215]
// Checks the string statistics in `stats` against `string_domain` and, where
// needed, extends the domain in place.
//
// Steps, in order:
//   1. If `stats` reports invalid UTF8 strings, returns immediately with an
//      ENUM_TYPE_INVALID_UTF8 description and `clear_field` set; the domain is
//      not modified.
//   2. Otherwise collects the values reported by StringDomainGetMissing. If
//      their summed frequency divided by the total value count exceeds
//      `max_off_domain`, or `max_off_domain` is exactly zero and at least one
//      value is missing, records an ENUM_TYPE_UNEXPECTED_STRING_VALUES
//      description and adds the missing values to `string_domain`.
//   3. If `updater` reports that the (possibly extended) domain is too big,
//      sets `clear_field` and records an INVALID_DOMAIN_SPECIFICATION
//      description.
//
// Args:
//   updater: consulted for the maximum acceptable domain size.
//   stats: statistics of the feature being validated.
//   max_off_domain: largest tolerated fraction of values outside the domain.
//   string_domain: domain to validate against; may be extended in place.
//
// Returns the accumulated descriptions and whether the field should be
// cleared.
UpdateSummary UpdateStringDomain(const Schema::Updater& updater,
                                 const FeatureStatsView& stats,
                                 double max_off_domain,
                                 StringDomain* string_domain) {
  UpdateSummary summary;
  if (stats.HasInvalidUTF8Strings()) {
    summary.descriptions.push_back(
        {tensorflow::metadata::v0::AnomalyInfo::ENUM_TYPE_INVALID_UTF8,
         "Invalid UTF8 strings",
         "Found strings that were not valid UTF8 strings."
         " If this enum previously included __BYTE_VALUES__, it was"
         " created in error, and this is an alert to fix it."});
    summary.clear_field = true;
    return summary;
  }

  const std::map<string, double> missing =
      StringDomainGetMissing(stats, *string_domain);
  // Total number of values in the dataset that do not appear in the schema.
  const double missing_count = absl::c_accumulate(
      missing, /*init=*/0.0,
      [](double count, const std::pair<const string, double>& p) -> double {
        return count + p.second;
      });
  const double total_value_count = stats.GetTotalValueCountInExamples();
  // The explicit `max_off_domain == 0` clause makes a zero threshold mean
  // "no missing value is tolerated", independent of the computed ratio.
  // NOTE(review): the ratio is computed without checking that
  // total_value_count is non-zero -- confirm callers never hit that case.
  if ((missing_count / total_value_count) > max_off_domain ||
      (max_off_domain == 0 && !missing.empty())) {
    const Description description = {
        tensorflow::metadata::v0::AnomalyInfo::
            ENUM_TYPE_UNEXPECTED_STRING_VALUES,
        "Unexpected string values",
        absl::StrCat(
            "Examples contain values missing from the schema: ",
            absl::StrJoin(
                missing, ", ",
                // The parameter type must be the map's actual element type.
                // Declaring it as pair<string, int64> would create a
                // converted temporary per element (copying the string) and
                // truncate the double frequency to an integer.
                [total_value_count](
                    string* out,
                    const std::pair<const string, double>& value_and_freq) {
                  absl::StrAppend(
                      out,
                      Printf(
                          "%s (%s)",
                          absl::Utf8SafeCEscape(value_and_freq.first).c_str(),
                          PercentageAsString(value_and_freq.second,
                                             total_value_count)
                              .c_str()));
                }),
            ". ")};
    summary.descriptions.push_back(description);
    StringDomainAddMissing(missing, string_domain);
  }

  // Checked after any additions above, so the size includes new values.
  const int domain_size = string_domain->value().size();
  if (updater.string_domain_too_big(domain_size)) {
    summary.clear_field = true;
    summary.descriptions.push_back(
        {tensorflow::metadata::v0::AnomalyInfo::INVALID_DOMAIN_SPECIFICATION,
         "String domain has too many values",
         Printf("String domain has too many values (%d).", domain_size)});
  }
  return summary;
}