void VerifyTokenConstraints()

in tensorflow_data_validation/anomalies/natural_language_domain_util.cc [76:141]


// Compares the statistics of one token against the schema constraint for that
// token. Four limits are checked independently, in this order:
//   1. minimum fraction of sequences containing the token,
//   2. maximum fraction of sequences containing the token,
//   3. minimum number of occurrences per sequence,
//   4. maximum number of occurrences per sequence.
// A limit is only checked when it is explicitly set on `constraint`. For each
// violated limit, one Description is appended to `result` and the limit in
// `constraint` is overwritten with the observed value.
//
// `token_stats`:  observed statistics for the token.
// `token_string`: textual form of the token, used only in the messages.
// `constraint`:   schema constraint for the token; mutated on violation.
// `result`:       output list that receives the anomaly descriptions.
void VerifyTokenConstraints(
    const NaturalLanguageStatistics::TokenStatistics& token_stats,
    const string& token_string, SequenceValueConstraints* constraint,
    std::vector<Description>* result) {
  // Observed values. `auto` keeps the exact accessor return types so the
  // comparisons and the formatted messages are unaffected.
  const auto observed_fraction = token_stats.fraction_of_sequences();
  const auto observed_min_count = token_stats.per_sequence_min_frequency();
  const auto observed_max_count = token_stats.per_sequence_max_frequency();

  // 1. Token appears in too small a fraction of sequences.
  if (constraint->has_min_fraction_of_sequences()) {
    const auto schema_limit = constraint->min_fraction_of_sequences();
    if (schema_limit > observed_fraction) {
      result->push_back(
          {tensorflow::metadata::v0::AnomalyInfo::
               SEQUENCE_VALUE_TOO_SMALL_FRACTION,
           "Value occurs in too small a fraction of sequences.",
           absl::StrCat("Fraction of sequences with value: ", token_string,
                        " is: ", observed_fraction,
                        " which is lower than the threshold set in the "
                        "Schema: ",
                        schema_limit, ".")});
      // Lower the schema threshold to the observed value.
      constraint->set_min_fraction_of_sequences(observed_fraction);
    }
  }

  // 2. Token appears in too large a fraction of sequences.
  if (constraint->has_max_fraction_of_sequences()) {
    const auto schema_limit = constraint->max_fraction_of_sequences();
    if (schema_limit < observed_fraction) {
      result->push_back(
          {tensorflow::metadata::v0::AnomalyInfo::
               SEQUENCE_VALUE_TOO_LARGE_FRACTION,
           "Value occurs in too large a fraction of sequences.",
           absl::StrCat("Fraction of sequences with value: ", token_string,
                        " is: ", observed_fraction,
                        " which is higher than the threshold set in the "
                        "Schema: ",
                        schema_limit, ".")});
      // Raise the schema threshold to the observed value.
      constraint->set_max_fraction_of_sequences(observed_fraction);
    }
  }

  // 3. Token occurs too few times within some sequence.
  if (constraint->has_min_per_sequence()) {
    const auto schema_limit = constraint->min_per_sequence();
    if (schema_limit > observed_min_count) {
      result->push_back(
          {tensorflow::metadata::v0::AnomalyInfo::
               SEQUENCE_VALUE_TOO_FEW_OCCURRENCES,
           "Value has too few per-sequence occurrences.",
           absl::StrCat("Value: ", token_string, " occurs at least: ",
                        observed_min_count,
                        " times within a sequence, which is lower than the "
                        "threshold set in the Schema: ",
                        schema_limit, ".")});
      // Lower the schema threshold to the observed per-sequence minimum.
      constraint->set_min_per_sequence(observed_min_count);
    }
  }

  // 4. Token occurs too many times within some sequence.
  if (constraint->has_max_per_sequence()) {
    const auto schema_limit = constraint->max_per_sequence();
    if (schema_limit < observed_max_count) {
      result->push_back(
          {tensorflow::metadata::v0::AnomalyInfo::
               SEQUENCE_VALUE_TOO_MANY_OCCURRENCES,
           "Value has too many per-sequence occurrences.",
           absl::StrCat("Value: ", token_string, " occurs at most: ",
                        observed_max_count,
                        " times within a sequence, which is higher than the "
                        "threshold set in the Schema: ",
                        schema_limit, ".")});
      // Raise the schema threshold to the observed per-sequence maximum.
      constraint->set_max_per_sequence(observed_max_count);
    }
  }
}