void Schema::UpdateFeatureInternal()

in tensorflow_data_validation/anomalies/schema.cc [932:1164]


void Schema::UpdateFeatureInternal(
    const Updater& updater, const FeatureStatsView& view, Feature* feature,
    std::vector<Description>* descriptions,
    absl::optional<tensorflow::metadata::v0::DriftSkewInfo>* drift_skew_info) {
  *descriptions = UpdateFeatureSelf(feature);

  // feature can be deprecated inside of UpdateFeatureSelf.
  if (::tensorflow::data_validation::FeatureIsDeprecated(*feature)) {
    return;
  }

  // This is to cover the rare case where there is actually no examples with
  // this feature, but there is still a dataset_stats object.
  const bool feature_missing = view.GetNumPresent() == 0;

  // If the feature is missing, but should be present, create an anomaly.
  // Otherwise, return without checking anything else.
  if (feature_missing) {
    if (IsExistenceRequired(*feature, view.environment())) {
      descriptions->push_back(
          {tensorflow::metadata::v0::AnomalyInfo::FEATURE_TYPE_NOT_PRESENT,
           "Column dropped", "The feature was not present in any examples."});
      ::tensorflow::data_validation::DeprecateFeature(feature);
      return;
    } else {
      return;
    }
  }

  // If the feature is present in the dataset_stats and the schema, but is
  // excluded from the environment of the dataset_stats, then add it to that
  // environment.
  if (!feature_missing &&
      !IsFeatureInEnvironment(*feature, view.environment())) {
    // environment must be specified here, otherwise all features would be
    // present.
    CHECK(view.environment());
    const string view_environment = *view.environment();
    if (ContainsValue(feature->not_in_environment(), view_environment)) {
      RemoveIf(feature->mutable_not_in_environment(),
               [view_environment](const string* other) {
                 return *other == view_environment;
               });
    }
    // Even if we remove the feature from not in environment, we may need to
    // add it to in_environment.
    if (!IsFeatureInEnvironment(*feature, view.environment())) {
      feature->add_in_environment(view_environment);
    }
    descriptions->push_back(
        {tensorflow::metadata::v0::AnomalyInfo::SCHEMA_NEW_COLUMN,
         "Column missing in environment",
         absl::StrCat("New column ", view.GetPath().Serialize(),
                      " found in data but not in the "
                      "environment ",
                      view_environment, " in the schema.")});
  }

  auto add_to_descriptions =
      [descriptions](const std::vector<Description>& other_descriptions) {
        descriptions->insert(descriptions->end(), other_descriptions.begin(),
                             other_descriptions.end());
      };

  // Clear domain_info if clear_field is set.
  // Either way, append descriptions.
  auto handle_update_summary = [feature, &add_to_descriptions](
                                   const UpdateSummary& update_summary) {
    add_to_descriptions(update_summary.descriptions);
    if (update_summary.clear_field) {
      // Note that this clears the oneof field domain_info.
      ::tensorflow::data_validation::ClearDomain(feature);
    }
  };

  if (feature->has_value_count() || feature->has_value_counts()) {
    add_to_descriptions(UpdateFeatureValueCounts(view, feature));
  }

  if (feature->has_shape()) {
    add_to_descriptions(
        UpdateFeatureShape(view, generate_legacy_feature_spec(), feature));
  }

  if (feature->has_presence()) {
    add_to_descriptions(::tensorflow::data_validation::UpdatePresence(
        view, feature->mutable_presence()));
  }

  if (view.GetFeatureType() != feature->type()) {
    // Basically, deprecate the feature. The rest is just getting a meaningful
    // message out.
    ::tensorflow::data_validation::DeprecateFeature(feature);
    const ::tensorflow::protobuf::EnumValueDescriptor* descriptor =
        tensorflow::metadata::v0::FeatureNameStatistics_Type_descriptor()
            ->FindValueByNumber(view.type());
    string data_type_name = (descriptor == nullptr)
                                ? absl::StrCat("unknown(", view.type(), ")")
                                : descriptor->name();

    const ::tensorflow::protobuf::EnumValueDescriptor* schema_descriptor =
        tensorflow::metadata::v0::FeatureType_descriptor()->FindValueByNumber(
            feature->type());
    string schema_type_name =
        (schema_descriptor == nullptr)
            ? absl::StrCat("unknown(", feature->type(), ")")
            : schema_descriptor->name();
    descriptions->push_back(
        {tensorflow::metadata::v0::AnomalyInfo::UNEXPECTED_DATA_TYPE,
         absl::StrCat("Expected data of type: ", schema_type_name, " but got ",
                      data_type_name)});
  }

  if (view.type() == FeatureNameStatistics::BYTES &&
      !ContainsKey(
          std::set<Feature::DomainInfoCase>(
              {Feature::DOMAIN_INFO_NOT_SET, Feature::kNaturalLanguageDomain,
               Feature::kImageDomain, Feature::kUrlDomain}),
          feature->domain_info_case())) {
    // Note that this clears the oneof field domain_info.
    ::tensorflow::data_validation::ClearDomain(feature);
    descriptions->push_back(
        {tensorflow::metadata::v0::AnomalyInfo::DOMAIN_INVALID_FOR_TYPE,
         absl::StrCat("Data is marked as BYTES with incompatible "
                      "domain_info: ",
                      feature->DebugString())});
  }
  switch (feature->domain_info_case()) {
    case Feature::kDomain: {
      UpdateSummary update_summary =
          ::tensorflow::data_validation::UpdateStringDomain(
              updater, view,
              ::tensorflow::data_validation::GetMaxOffDomain(
                  feature->distribution_constraints()),
              CHECK_NOTNULL(GetExistingStringDomain(feature->domain())));

      add_to_descriptions(update_summary.descriptions);
      if (update_summary.clear_field) {
        // Note that this clears the oneof field domain_info.
        const string domain = feature->domain();
        ClearStringDomain(domain);
      }
    }

    break;
    case Feature::kBoolDomain:
      add_to_descriptions(
          ::tensorflow::data_validation::UpdateBoolDomain(view, feature));
      break;
    case Feature::kIntDomain:
      handle_update_summary(::tensorflow::data_validation::UpdateIntDomain(
          view, feature->mutable_int_domain()));
      break;
    case tensorflow::metadata::v0::Feature::kFloatDomain:
      handle_update_summary(::tensorflow::data_validation::UpdateFloatDomain(
          view, feature->mutable_float_domain()));
      break;
    case tensorflow::metadata::v0::Feature::kStringDomain:
      handle_update_summary(::tensorflow::data_validation::UpdateStringDomain(
          updater, view,
          ::tensorflow::data_validation::GetMaxOffDomain(
              feature->distribution_constraints()),
          feature->mutable_string_domain()));
      break;
    case Feature::kImageDomain:
      add_to_descriptions(
          ::tensorflow::data_validation::UpdateImageDomain(view, feature));
      break;
    case Feature::kNaturalLanguageDomain:
      add_to_descriptions(
          ::tensorflow::data_validation::UpdateNaturalLanguageDomain(view,
                                                                     feature));
      break;
    case Feature::kMidDomain:
    case Feature::kUrlDomain:
    case Feature::kTimeDomain:
      // Updating existing semantic domains is not supported currently.
      break;
    case Feature::kStructDomain:
      // struct_domain is handled recursively.
      break;
    case Feature::DOMAIN_INFO_NOT_SET:
      // If the domain_info is not set, it is safe to try best-effort
      // semantic type update.
      if (BestEffortUpdateCustomDomain(view.custom_stats(), feature)) {
        descriptions->push_back(
            {tensorflow::metadata::v0::AnomalyInfo::SEMANTIC_DOMAIN_UPDATE,
             "Updated semantic domain",
             absl::StrCat("Updated semantic domain for feature: ",
                          feature->name())});
      }
      break;
    default:
      // In theory, default should have already been handled inside
      // UpdateFeatureSelf().
      LOG(ERROR) << "Internal error: unknown domains should be cleared inside "
                    "UpdateFeatureSelf.";
      DCHECK(false);
  }

  if (feature->has_unique_constraints()) {
    add_to_descriptions(UpdateUniqueConstraints(view, feature));
  }

  const std::vector<FeatureComparatorType> all_comparator_types = {
      FeatureComparatorType::DRIFT, FeatureComparatorType::SKEW};
  // Handle comparators here.
  for (const auto& comparator_type : all_comparator_types) {
    if (FeatureHasComparator(*feature, comparator_type)) {
      auto feature_comparison_result = UpdateFeatureComparatorDirect(
          view, comparator_type,
          GetFeatureComparator(feature, comparator_type));
      add_to_descriptions(feature_comparison_result.descriptions);
      if (!feature_comparison_result.measurements.empty()) {
        if (!drift_skew_info->has_value()) {
          drift_skew_info->emplace();
          *(*drift_skew_info)->mutable_path() = view.GetPath().AsProto();
        }
        if (comparator_type == FeatureComparatorType::DRIFT) {
          for (const auto& measurement :
               feature_comparison_result.measurements) {
            *(*drift_skew_info)->add_drift_measurements() = measurement;
          }
        } else if (comparator_type == FeatureComparatorType::SKEW) {
          for (const auto& measurement :
               feature_comparison_result.measurements) {
            *(*drift_skew_info)->add_skew_measurements() = measurement;
          }
        }
      }
    }
  }
}