in tensorflow_data_validation/anomalies/bool_domain_util.cc [180:310]
std::vector<Description> UpdateBoolDomain(const FeatureStatsView& feature_stats,
Feature* feature) {
std::vector<Description> descriptions;
switch (feature_stats.type()) {
case FeatureNameStatistics::BYTES:
LOG(ERROR) << "Should not call UpdateBoolDomain with BYTES";
DCHECK(false);
return {};
case FeatureNameStatistics::INT: {
const NumericStatistics& numeric_statistics = feature_stats.num_stats();
if (numeric_statistics.min() < 0.0) {
IntDomain* int_domain = feature->mutable_int_domain();
int_domain->set_max(numeric_statistics.max());
int_domain->set_min(numeric_statistics.min());
return {{tensorflow::metadata::v0::AnomalyInfo::BOOL_TYPE_SMALL_INT,
kNonBooleanValues,
absl::StrCat("Integers (such as ",
absl::SixDigits(numeric_statistics.min()),
") not in {0, 1}: converting to an integer.")}};
}
if (numeric_statistics.max() > 1.0) {
IntDomain* int_domain = feature->mutable_int_domain();
int_domain->set_max(numeric_statistics.max());
int_domain->set_min(numeric_statistics.min());
return {{tensorflow::metadata::v0::AnomalyInfo::BOOL_TYPE_BIG_INT,
kNonBooleanValues,
absl::StrCat("Integers (such as ",
absl::SixDigits(numeric_statistics.max()),
") not in {0, 1}: converting to an integer.")}};
}
return {};
}
case FeatureNameStatistics::FLOAT: {
const NumericStatistics& numeric_statistics = feature_stats.num_stats();
auto set_float_domain = [](const NumericStatistics& numeric_statistics,
Feature* feature) -> void {
FloatDomain* float_domain = feature->mutable_float_domain();
float_domain->set_max(numeric_statistics.max());
float_domain->set_min(numeric_statistics.min());
};
if (numeric_statistics.min() != 0.0 && numeric_statistics.min() != 1.0) {
set_float_domain(numeric_statistics, feature);
return {
{tensorflow::metadata::v0::AnomalyInfo::BOOL_TYPE_UNEXPECTED_FLOAT,
kNonBooleanValues,
absl::StrCat("Floats (such as ",
absl::SixDigits(numeric_statistics.min()),
") not in {0, 1}: converting to float_domain.")}};
}
if (numeric_statistics.max() != 0.0 && numeric_statistics.max() != 1.0) {
set_float_domain(numeric_statistics, feature);
return {
{tensorflow::metadata::v0::AnomalyInfo::BOOL_TYPE_UNEXPECTED_FLOAT,
kNonBooleanValues,
absl::StrCat("Floats (such as ",
absl::SixDigits(numeric_statistics.max()),
") not in {0, 1}: converting to float_domain.")}};
}
for (const auto& histogram : numeric_statistics.histograms()) {
// Any non-empty boundary should include 0 or 1, otherwise the feature
// must not be boolean. Note: if histograms are not computed, or there
// are values inside the range 0 to 1, invalid bool_domain values will
// not be detected.
if (histogram.num_nan() > 0) {
set_float_domain(numeric_statistics, feature);
return {{tensorflow::metadata::v0::AnomalyInfo::
BOOL_TYPE_UNEXPECTED_FLOAT,
kNonBooleanValues,
absl::StrCat("Floats (such as NaN) not in {0, 1}: "
"converting to float_domain.")}};
}
for (const auto& bucket : histogram.buckets()) {
if (bucket.sample_count() <= 0) {
continue;
}
if (bucket.high_value() < 0) {
set_float_domain(numeric_statistics, feature);
return {{tensorflow::metadata::v0::AnomalyInfo::
BOOL_TYPE_UNEXPECTED_FLOAT,
kNonBooleanValues,
absl::StrCat("Float values < 0 not in {0, 1}: converting "
"to float_domain.")}};
} else if (bucket.low_value() > 1) {
set_float_domain(numeric_statistics, feature);
return {{tensorflow::metadata::v0::AnomalyInfo::
BOOL_TYPE_UNEXPECTED_FLOAT,
kNonBooleanValues,
absl::StrCat("Float values > 1 not in {0, 1}: converting "
"to float_domain.")}};
} else if (histogram.type() == metadata::v0::Histogram::QUANTILES &&
bucket.high_value() < 1 && bucket.low_value() > 0) {
set_float_domain(numeric_statistics, feature);
return {{tensorflow::metadata::v0::AnomalyInfo::
BOOL_TYPE_UNEXPECTED_FLOAT,
kNonBooleanValues,
absl::StrCat("Float values falling between 0 and 1: "
"converting to float_domain.")}};
}
}
}
return {};
}
case FeatureNameStatistics::STRING: {
const BoolDomain& bool_domain = feature->bool_domain();
const std::set<string> valid_strings =
BoolDomainValidStrings(bool_domain);
const std::vector<string> string_values = feature_stats.GetStringValues();
for (const string& str : string_values) {
if (!ContainsKey(valid_strings, str)) {
// We might be able to replace this with an enum, but since it is
// in all likelihood an error, let's just wipe the bool_domain.
const string valid_strings_desc =
BoolDomainValidStringsDescription(bool_domain);
// Note that this clears the oneof field domain_info.
feature->clear_bool_domain();
return {{tensorflow::metadata::v0::AnomalyInfo::
BOOL_TYPE_UNEXPECTED_STRING,
kNonBooleanValues,
absl::StrCat("Saw unexpected value \"", str,
"\" instead of ", valid_strings_desc, ".")}};
}
}
return {};
}
default:
LOG(ERROR) << "Should not be here with unknown type: "
<< feature_stats.type();
DCHECK(false);
return {};
}
}