in tensorflow_data_validation/anomalies/natural_language_domain_util.cc [145:220]
std::vector<Description> UpdateNaturalLanguageDomain(
const FeatureStatsView& feature_stats, Feature* feature) {
std::vector<Description> result;
const CustomStatistic* nl_custom_stats =
feature_stats.GetCustomStatByName("nl_statistics");
NaturalLanguageStatistics nl_stats;
bool found_nl_stats = false;
if (nl_custom_stats) {
if (!nl_custom_stats->any().UnpackTo(&nl_stats)) {
LOG(WARNING) << "nl_statistics for feature " << feature->name()
<< "do not have the expected "
<< "NaturalLanguageStatistics message format.";
return result;
}
found_nl_stats = true;
}
static const auto& kMissingStatsDescription = *new Description{
tensorflow::metadata::v0::AnomalyInfo::STATS_NOT_AVAILABLE,
"Natural language stats are not computed.",
"Constraints specified in natural language domain cannot be "
"verified because natural language stats have not been computed."};
NaturalLanguageDomain* nl_domain = feature->mutable_natural_language_domain();
if ((nl_domain->coverage().has_min_coverage() ||
nl_domain->coverage().has_min_avg_token_length() ||
nl_domain->token_constraints_size() > 0) &&
!found_nl_stats) {
result.push_back(kMissingStatsDescription);
feature->clear_natural_language_domain();
return result;
}
VerifyCoverageConstraints(nl_stats, nl_domain, &result);
std::map<absl::variant<std::string, int>,
const NaturalLanguageStatistics::TokenStatistics&>
token_stats_map;
for (auto& token_stats : nl_stats.token_statistics()) {
if (token_stats.token_case() ==
NaturalLanguageStatistics::TokenStatistics::TokenCase::kIntToken) {
token_stats_map.emplace(token_stats.int_token(), token_stats);
} else if (token_stats.token_case() ==
NaturalLanguageStatistics::TokenStatistics::TokenCase::
kStringToken) {
token_stats_map.emplace(token_stats.string_token(), token_stats);
}
}
for (auto& constraint : *nl_domain->mutable_token_constraints()) {
absl::variant<string, int> constraint_name;
std::string token_string;
if (constraint.has_int_value()) {
constraint_name = constraint.int_value();
token_string = absl::StrCat(constraint.int_value());
} else if (constraint.has_string_value()) {
constraint_name = constraint.string_value();
token_string = constraint.string_value();
} else {
continue;
}
auto iter = token_stats_map.find(constraint_name);
if (iter == token_stats_map.end()) {
result.push_back(kMissingStatsDescription);
feature->clear_natural_language_domain();
return result;
}
VerifyTokenConstraints(iter->second, token_string, &constraint, &result);
}
return result;
}