in tensorflow_data_validation/anomalies/schema.cc [932:1164]
// Checks the schema entry `feature` against the statistics in `view`, mutating
// `feature` where the checks below require it and recording one Description
// per reported anomaly.
//
// Parameters:
//   updater          Forwarded unchanged to UpdateStringDomain.
//   view             Statistics of the feature being validated.
//   feature          Schema feature; mutated in place (it may be deprecated,
//                    have environments added/removed, or have its domain_info
//                    cleared).
//   descriptions     Output. Overwritten with the result of UpdateFeatureSelf
//                    and then appended to. Dereferenced unconditionally, so it
//                    must not be null.
//   drift_skew_info  Output. Emplaced (with the path of `view`) the first time
//                    a drift or skew comparator yields measurements, which are
//                    then appended to it. Dereferenced without a null check in
//                    that case.
//
// The order of the steps matters: each step sees the mutations made to
// `feature` by the steps before it.
void Schema::UpdateFeatureInternal(
    const Updater& updater, const FeatureStatsView& view, Feature* feature,
    std::vector<Description>* descriptions,
    absl::optional<tensorflow::metadata::v0::DriftSkewInfo>* drift_skew_info) {
  *descriptions = UpdateFeatureSelf(feature);
  // feature can be deprecated inside of UpdateFeatureSelf.
  if (::tensorflow::data_validation::FeatureIsDeprecated(*feature)) {
    return;
  }

  // This is to cover the rare case where there are actually no examples with
  // this feature, but there is still a dataset_stats object.
  // If the feature is missing but should be present, create an anomaly.
  // Either way nothing else is checked for a missing feature.
  if (view.GetNumPresent() == 0) {
    if (IsExistenceRequired(*feature, view.environment())) {
      descriptions->push_back(
          {tensorflow::metadata::v0::AnomalyInfo::FEATURE_TYPE_NOT_PRESENT,
           "Column dropped", "The feature was not present in any examples."});
      ::tensorflow::data_validation::DeprecateFeature(feature);
    }
    return;
  }

  // From here on the feature is known to be present in the statistics.
  // If it is present in the dataset_stats and the schema, but is excluded from
  // the environment of the dataset_stats, then add it to that environment.
  if (!IsFeatureInEnvironment(*feature, view.environment())) {
    // environment must be specified here, otherwise all features would be
    // present.
    CHECK(view.environment());
    const string view_environment = *view.environment();
    if (ContainsValue(feature->not_in_environment(), view_environment)) {
      RemoveIf(feature->mutable_not_in_environment(),
               [view_environment](const string* other) {
                 return *other == view_environment;
               });
    }
    // Even if we remove the feature from not_in_environment, we may need to
    // add it to in_environment.
    if (!IsFeatureInEnvironment(*feature, view.environment())) {
      feature->add_in_environment(view_environment);
    }
    descriptions->push_back(
        {tensorflow::metadata::v0::AnomalyInfo::SCHEMA_NEW_COLUMN,
         "Column missing in environment",
         absl::StrCat("New column ", view.GetPath().Serialize(),
                      " found in data but not in the "
                      "environment ",
                      view_environment, " in the schema.")});
  }

  // Appends a batch of descriptions to the output vector.
  auto add_to_descriptions =
      [descriptions](const std::vector<Description>& other_descriptions) {
        descriptions->insert(descriptions->end(), other_descriptions.begin(),
                             other_descriptions.end());
      };

  // Clear domain_info if clear_field is set.
  // Either way, append descriptions.
  auto handle_update_summary = [feature, &add_to_descriptions](
                                   const UpdateSummary& update_summary) {
    add_to_descriptions(update_summary.descriptions);
    if (update_summary.clear_field) {
      // Note that this clears the oneof field domain_info.
      ::tensorflow::data_validation::ClearDomain(feature);
    }
  };

  if (feature->has_value_count() || feature->has_value_counts()) {
    add_to_descriptions(UpdateFeatureValueCounts(view, feature));
  }

  if (feature->has_shape()) {
    add_to_descriptions(
        UpdateFeatureShape(view, generate_legacy_feature_spec(), feature));
  }

  if (feature->has_presence()) {
    add_to_descriptions(::tensorflow::data_validation::UpdatePresence(
        view, feature->mutable_presence()));
  }

  if (view.GetFeatureType() != feature->type()) {
    // Basically, deprecate the feature. The rest is just getting a meaningful
    // message out.
    ::tensorflow::data_validation::DeprecateFeature(feature);
    const ::tensorflow::protobuf::EnumValueDescriptor* descriptor =
        tensorflow::metadata::v0::FeatureNameStatistics_Type_descriptor()
            ->FindValueByNumber(view.type());
    // Fall back to the raw enum number when no descriptor is found for it.
    string data_type_name = (descriptor == nullptr)
                                ? absl::StrCat("unknown(", view.type(), ")")
                                : descriptor->name();

    const ::tensorflow::protobuf::EnumValueDescriptor* schema_descriptor =
        tensorflow::metadata::v0::FeatureType_descriptor()->FindValueByNumber(
            feature->type());
    string schema_type_name =
        (schema_descriptor == nullptr)
            ? absl::StrCat("unknown(", feature->type(), ")")
            : schema_descriptor->name();
    descriptions->push_back(
        {tensorflow::metadata::v0::AnomalyInfo::UNEXPECTED_DATA_TYPE,
         absl::StrCat("Expected data of type: ", schema_type_name, " but got ",
                      data_type_name)});
  }

  // BYTES data may only carry no domain, or a natural language, image or URL
  // domain; any other domain_info is dropped and reported.
  if (view.type() == FeatureNameStatistics::BYTES) {
    const Feature::DomainInfoCase domain_case = feature->domain_info_case();
    const bool domain_allowed_for_bytes =
        domain_case == Feature::DOMAIN_INFO_NOT_SET ||
        domain_case == Feature::kNaturalLanguageDomain ||
        domain_case == Feature::kImageDomain ||
        domain_case == Feature::kUrlDomain;
    if (!domain_allowed_for_bytes) {
      // Snapshot the feature before clearing, so that the message actually
      // shows the incompatible domain_info it refers to.
      const string incompatible_feature = feature->DebugString();
      // Note that this clears the oneof field domain_info.
      ::tensorflow::data_validation::ClearDomain(feature);
      descriptions->push_back(
          {tensorflow::metadata::v0::AnomalyInfo::DOMAIN_INVALID_FOR_TYPE,
           absl::StrCat("Data is marked as BYTES with incompatible "
                        "domain_info: ",
                        incompatible_feature)});
    }
  }

  switch (feature->domain_info_case()) {
    case Feature::kDomain: {
      // The feature refers to a string domain by name; update that domain.
      UpdateSummary update_summary =
          ::tensorflow::data_validation::UpdateStringDomain(
              updater, view,
              ::tensorflow::data_validation::GetMaxOffDomain(
                  feature->distribution_constraints()),
              CHECK_NOTNULL(GetExistingStringDomain(feature->domain())));

      add_to_descriptions(update_summary.descriptions);
      if (update_summary.clear_field) {
        // Note that this clears the oneof field domain_info.
        // NOTE(review): the name is copied first, presumably because
        // ClearStringDomain modifies this feature's domain field while still
        // using the name -- confirm before removing the copy.
        const string domain = feature->domain();
        ClearStringDomain(domain);
      }
    }

    break;
    case Feature::kBoolDomain:
      add_to_descriptions(
          ::tensorflow::data_validation::UpdateBoolDomain(view, feature));
      break;
    case Feature::kIntDomain:
      handle_update_summary(::tensorflow::data_validation::UpdateIntDomain(
          view, feature->mutable_int_domain()));
      break;
    case tensorflow::metadata::v0::Feature::kFloatDomain:
      handle_update_summary(::tensorflow::data_validation::UpdateFloatDomain(
          view, feature->mutable_float_domain()));
      break;
    case tensorflow::metadata::v0::Feature::kStringDomain:
      handle_update_summary(::tensorflow::data_validation::UpdateStringDomain(
          updater, view,
          ::tensorflow::data_validation::GetMaxOffDomain(
              feature->distribution_constraints()),
          feature->mutable_string_domain()));
      break;
    case Feature::kImageDomain:
      add_to_descriptions(
          ::tensorflow::data_validation::UpdateImageDomain(view, feature));
      break;
    case Feature::kNaturalLanguageDomain:
      add_to_descriptions(
          ::tensorflow::data_validation::UpdateNaturalLanguageDomain(view,
                                                                     feature));
      break;
    case Feature::kMidDomain:
    case Feature::kUrlDomain:
    case Feature::kTimeDomain:
      // Updating existing semantic domains is not supported currently.
      break;
    case Feature::kStructDomain:
      // struct_domain is handled recursively.
      break;
    case Feature::DOMAIN_INFO_NOT_SET:
      // If the domain_info is not set, it is safe to try best-effort
      // semantic type update.
      if (BestEffortUpdateCustomDomain(view.custom_stats(), feature)) {
        descriptions->push_back(
            {tensorflow::metadata::v0::AnomalyInfo::SEMANTIC_DOMAIN_UPDATE,
             "Updated semantic domain",
             absl::StrCat("Updated semantic domain for feature: ",
                          feature->name())});
      }
      break;
    default:
      // In theory, default should have already been handled inside
      // UpdateFeatureSelf().
      LOG(ERROR) << "Internal error: unknown domains should be cleared inside "
                    "UpdateFeatureSelf.";
      DCHECK(false);
  }

  if (feature->has_unique_constraints()) {
    add_to_descriptions(UpdateUniqueConstraints(view, feature));
  }

  // Handle comparators here. DRIFT is processed before SKEW; a fixed-size
  // local array avoids allocating a container on every call.
  const FeatureComparatorType comparator_types[] = {
      FeatureComparatorType::DRIFT, FeatureComparatorType::SKEW};
  for (const FeatureComparatorType comparator_type : comparator_types) {
    if (!FeatureHasComparator(*feature, comparator_type)) {
      continue;
    }
    const auto feature_comparison_result = UpdateFeatureComparatorDirect(
        view, comparator_type, GetFeatureComparator(feature, comparator_type));
    add_to_descriptions(feature_comparison_result.descriptions);
    if (feature_comparison_result.measurements.empty()) {
      continue;
    }
    // Create the DriftSkewInfo lazily, the first time there is something to
    // record for this feature.
    if (!drift_skew_info->has_value()) {
      drift_skew_info->emplace();
      *(*drift_skew_info)->mutable_path() = view.GetPath().AsProto();
    }
    if (comparator_type == FeatureComparatorType::DRIFT) {
      for (const auto& measurement : feature_comparison_result.measurements) {
        *(*drift_skew_info)->add_drift_measurements() = measurement;
      }
    } else if (comparator_type == FeatureComparatorType::SKEW) {
      for (const auto& measurement : feature_comparison_result.measurements) {
        *(*drift_skew_info)->add_skew_measurements() = measurement;
      }
    }
  }
}