in tensorflow_data_validation/anomalies/dataset_constraints_util.cc [46:126]
std::vector<Description> UpdateNumExamplesComparatorDirect(
const DatasetStatsView& stats, const DatasetComparatorType comparator_type,
tensorflow::metadata::v0::NumericValueComparator* comparator) {
if (!comparator->has_min_fraction_threshold() &&
!comparator->has_max_fraction_threshold()) {
return {};
}
double num_examples = stats.GetNumExamples();
// ValidateFeatureStatistics does not attempt to detect anomalies in
// datasets that have num_examples == 0. Check that here.
CHECK(num_examples > 0.0)
<< "Invalid input. Num examples must be greater than "
"0.";
const absl::optional<DatasetStatsView> control_stats =
((comparator_type == DatasetComparatorType::DRIFT)
? stats.GetPreviousSpan()
: stats.GetPreviousVersion());
if (!control_stats) {
return {};
}
const string control_name =
(comparator_type == DatasetComparatorType::DRIFT ? "previous span"
: "previous version");
std::vector<Description> descriptions = {};
double control_num_examples = control_stats->GetNumExamples();
CHECK(control_num_examples >= 0.0) << "Invalid input. Control num examples "
"must not be negative";
double num_examples_ratio;
if (control_num_examples != 0.0) {
num_examples_ratio = num_examples / control_num_examples;
}
// TODO(b/138589350): Check for possible case of ratio == 1.0 but num_examples
// != control_num_examples.
if (comparator->has_max_fraction_threshold()) {
double max_threshold = comparator->max_fraction_threshold();
if (control_num_examples == 0.0) {
comparator->clear_max_fraction_threshold();
descriptions.push_back(
{tensorflow::metadata::v0::AnomalyInfo::COMPARATOR_HIGH_NUM_EXAMPLES,
absl::StrCat("High num examples in current dataset versus the ",
control_name, ", which has 0."),
absl::StrCat("The ", control_name,
" has 0 examples, so there is a high number of "
"examples in the current dataset versus the ",
control_name, ".")});
} else if (num_examples_ratio > max_threshold) {
comparator->set_max_fraction_threshold(num_examples_ratio);
descriptions.push_back(
{tensorflow::metadata::v0::AnomalyInfo::COMPARATOR_HIGH_NUM_EXAMPLES,
absl::StrCat("High num examples in current dataset versus the ",
control_name, "."),
absl::StrCat(
"The ratio of num examples in the current dataset versus the ",
control_name, " is ", absl::SixDigits(num_examples_ratio),
" (up to six significant digits), which is above the "
"threshold ",
absl::SixDigits(max_threshold), ".")});
}
}
if (comparator->has_min_fraction_threshold()) {
double min_threshold = comparator->min_fraction_threshold();
if (control_num_examples != 0.0 &&
num_examples_ratio < comparator->min_fraction_threshold()) {
comparator->set_min_fraction_threshold(num_examples_ratio);
descriptions.push_back(
{tensorflow::metadata::v0::AnomalyInfo::COMPARATOR_LOW_NUM_EXAMPLES,
absl::StrCat("Low num examples in current dataset versus the ",
control_name, "."),
absl::StrCat(
"The ratio of num examples in the current dataset versus the ",
control_name, " is ", absl::SixDigits(num_examples_ratio),
" (up to six significant digits), which is below the threshold ",
absl::SixDigits(min_threshold), ".")});
}
}
return descriptions;
}