std::vector<Description> UpdateNaturalLanguageDomain()

in tensorflow_data_validation/anomalies/natural_language_domain_util.cc [145:220]


std::vector<Description> UpdateNaturalLanguageDomain(
    const FeatureStatsView& feature_stats, Feature* feature) {
  std::vector<Description> result;

  const CustomStatistic* nl_custom_stats =
      feature_stats.GetCustomStatByName("nl_statistics");

  NaturalLanguageStatistics nl_stats;
  bool found_nl_stats = false;
  if (nl_custom_stats) {
    if (!nl_custom_stats->any().UnpackTo(&nl_stats)) {
      LOG(WARNING) << "nl_statistics for feature " << feature->name()
                   << "do not have the expected "
                   << "NaturalLanguageStatistics message format.";
      return result;
    }
    found_nl_stats = true;
  }

  static const auto& kMissingStatsDescription = *new Description{
      tensorflow::metadata::v0::AnomalyInfo::STATS_NOT_AVAILABLE,
      "Natural language stats are not computed.",
      "Constraints specified in natural language domain cannot be "
      "verified because natural language stats have not been computed."};

  NaturalLanguageDomain* nl_domain = feature->mutable_natural_language_domain();
  if ((nl_domain->coverage().has_min_coverage() ||
       nl_domain->coverage().has_min_avg_token_length() ||
       nl_domain->token_constraints_size() > 0) &&
      !found_nl_stats) {
    result.push_back(kMissingStatsDescription);
    feature->clear_natural_language_domain();
    return result;
  }

  VerifyCoverageConstraints(nl_stats, nl_domain, &result);

  std::map<absl::variant<std::string, int>,
           const NaturalLanguageStatistics::TokenStatistics&>
      token_stats_map;

  for (auto& token_stats : nl_stats.token_statistics()) {
    if (token_stats.token_case() ==
        NaturalLanguageStatistics::TokenStatistics::TokenCase::kIntToken) {
      token_stats_map.emplace(token_stats.int_token(), token_stats);
    } else if (token_stats.token_case() ==
               NaturalLanguageStatistics::TokenStatistics::TokenCase::
                   kStringToken) {
      token_stats_map.emplace(token_stats.string_token(), token_stats);
    }
  }

  for (auto& constraint : *nl_domain->mutable_token_constraints()) {
    absl::variant<string, int> constraint_name;
    std::string token_string;
    if (constraint.has_int_value()) {
      constraint_name = constraint.int_value();
      token_string = absl::StrCat(constraint.int_value());

    } else if (constraint.has_string_value()) {
      constraint_name = constraint.string_value();
      token_string = constraint.string_value();
    } else {
      continue;
    }

    auto iter = token_stats_map.find(constraint_name);
    if (iter == token_stats_map.end()) {
      result.push_back(kMissingStatsDescription);
      feature->clear_natural_language_domain();
      return result;
    }
    VerifyTokenConstraints(iter->second, token_string, &constraint, &result);
  }
  return result;
}