in tensorflow_data_validation/anomalies/natural_language_domain_util.cc [76:141]
// Checks the statistics observed for a single token against the bounds that
// are set on `constraint`.
//
// Four bounds are examined independently, and only when the corresponding
// field is present on `constraint`:
//   * min_fraction_of_sequences / max_fraction_of_sequences, compared with
//     token_stats.fraction_of_sequences();
//   * min_per_sequence, compared with token_stats.per_sequence_min_frequency();
//   * max_per_sequence, compared with token_stats.per_sequence_max_frequency().
//
// For every bound that is violated, one Description is appended to `result`
// and the bound on `constraint` is then overwritten with the observed value.
// `token_string` is only used to identify the token in the generated messages.
void VerifyTokenConstraints(
    const NaturalLanguageStatistics::TokenStatistics& token_stats,
    const string& token_string, SequenceValueConstraints* constraint,
    std::vector<Description>* result) {
  using Anomaly = tensorflow::metadata::v0::AnomalyInfo;

  // The statistics are read-only here, so fetch each observed value once.
  const auto observed_fraction = token_stats.fraction_of_sequences();
  const auto observed_min_count = token_stats.per_sequence_min_frequency();
  const auto observed_max_count = token_stats.per_sequence_max_frequency();

  // Lower bound on the fraction of sequences that contain the token.
  const bool fraction_below_min =
      constraint->has_min_fraction_of_sequences() &&
      observed_fraction < constraint->min_fraction_of_sequences();
  if (fraction_below_min) {
    // The message is built before the setter runs, so it reports the
    // threshold that was actually violated.
    result->push_back(
        {Anomaly::SEQUENCE_VALUE_TOO_SMALL_FRACTION,
         "Value occurs in too small a fraction of sequences.",
         absl::StrCat("Fraction of sequences with value: ", token_string,
                      " is: ", observed_fraction,
                      " which is lower than the threshold set in the "
                      "Schema: ",
                      constraint->min_fraction_of_sequences(), ".")});
    constraint->set_min_fraction_of_sequences(observed_fraction);
  }

  // Upper bound on the fraction of sequences that contain the token.
  const bool fraction_above_max =
      constraint->has_max_fraction_of_sequences() &&
      observed_fraction > constraint->max_fraction_of_sequences();
  if (fraction_above_max) {
    result->push_back(
        {Anomaly::SEQUENCE_VALUE_TOO_LARGE_FRACTION,
         "Value occurs in too large a fraction of sequences.",
         absl::StrCat("Fraction of sequences with value: ", token_string,
                      " is: ", observed_fraction,
                      " which is higher than the threshold set in the "
                      "Schema: ",
                      constraint->max_fraction_of_sequences(), ".")});
    constraint->set_max_fraction_of_sequences(observed_fraction);
  }

  // Lower bound on the per-sequence occurrence count.
  const bool count_below_min =
      constraint->has_min_per_sequence() &&
      observed_min_count < constraint->min_per_sequence();
  if (count_below_min) {
    result->push_back(
        {Anomaly::SEQUENCE_VALUE_TOO_FEW_OCCURRENCES,
         "Value has too few per-sequence occurrences.",
         absl::StrCat("Value: ", token_string, " occurs at least: ",
                      observed_min_count,
                      " times within a sequence, which is lower than the "
                      "threshold set in the Schema: ",
                      constraint->min_per_sequence(), ".")});
    constraint->set_min_per_sequence(observed_min_count);
  }

  // Upper bound on the per-sequence occurrence count.
  const bool count_above_max =
      constraint->has_max_per_sequence() &&
      observed_max_count > constraint->max_per_sequence();
  if (count_above_max) {
    result->push_back(
        {Anomaly::SEQUENCE_VALUE_TOO_MANY_OCCURRENCES,
         "Value has too many per-sequence occurrences.",
         absl::StrCat("Value: ", token_string, " occurs at most: ",
                      observed_max_count,
                      " times within a sequence, which is higher than the "
                      "threshold set in the Schema: ",
                      constraint->max_per_sequence(), ".")});
    constraint->set_max_per_sequence(observed_max_count);
  }
}