operator/pkg/metrics/monitoring.go (120 lines of code) (raw):

// Copyright Istio Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package metrics defines metrics and monitoring functionality // used throughout operator. package metrics import ( "istio.io/pkg/monitoring" ) var ( // OperatorVersionLabel describes version of running binary. OperatorVersionLabel = monitoring.MustCreateLabel("version") // MergeErrorLabel describes the type of merge error. MergeErrorLabel = monitoring.MustCreateLabel("error_type") // RenderErrorLabel describes the type of the error while rendering. RenderErrorLabel = monitoring.MustCreateLabel("render_error") // CRFetchErrorReasonLabel describes the reason/HTTP code // for failing to fetch CR. CRFetchErrorReasonLabel = monitoring.MustCreateLabel("reason") // ComponentNameLabel represents istio component name - like // core, pilot, istio-cni etc. ComponentNameLabel = monitoring.MustCreateLabel("component") // ResourceKindLabel indicates the kind of resource owned // or created or updated or deleted or pruned by operator. ResourceKindLabel = monitoring.MustCreateLabel("kind") ) // MergeErrorType describes the class of errors that could // occur while merging profile, user supplied YAML, values // overridden by --set and so on. type MergeErrorType string const ( // CannotFetchProfileError occurs when profile cannot be found. CannotFetchProfileError MergeErrorType = "cannot_fetch_profile" // OverlayError overlaying YAMLs to combine profile, user // defined settings in CR, Hub-tag etc fails. OverlayError MergeErrorType = "overlay" // IOPFormatError occurs when supplied CR cannot be marshaled // or unmarshaled to/from YAML. IOPFormatError MergeErrorType = "iop_format" // TranslateValuesError occurs when translating from legacy API fails. TranslateValuesError MergeErrorType = "translate_values" // InternalYAMLParseError occurs when spec section in merged CR // cannot be accessed for some reason (either missing or multiple). InternalYAMLParseError MergeErrorType = "internal_yaml_parse" ) // RenderErrorType describes the class of errors that could // occur while rendering Kubernetes manifest from given CR. type RenderErrorType string const ( RenderNotStartedError RenderErrorType = "render_not_started" // HelmTranslateIOPToValuesError describes render error where renderer for // a component cannot create values.yaml tree from given CR. HelmTranslateIOPToValuesError RenderErrorType = "helm_translate_iop_to_values" // HelmChartRenderError describes error where Helm charts cannot be rendered // for the generated values.yaml tree. HelmChartRenderError RenderErrorType = "helm_chart_render" // K8SSettingsOverlayError describes the K8s overlay error after // rendering Helm charts successfully. K8SSettingsOverlayError RenderErrorType = "k8s_settings_overlay" // K8SManifestPatchError describes errors while patching generated manifest. K8SManifestPatchError RenderErrorType = "k8s_manifest_patch" ) var ( // Version is the version of the operator binary running currently. // This is required for fleet level metrics although it is available from // ControlZ (more precisely versionz endpoint). Version = monitoring.NewGauge( "version", "Version of operator binary", monitoring.WithLabels(OperatorVersionLabel), ) // GetCRErrorTotal counts the number of times fetching // CR fails from API server. GetCRErrorTotal = monitoring.NewSum( "get_cr_error_total", "Number of times fetching CR from apiserver failed", monitoring.WithLabels(CRFetchErrorReasonLabel), ) // CRMergeFailureTotal counts number of CR merge failures. CRMergeFailureTotal = monitoring.NewSum( "cr_merge_failure_total", "Number of IstioOperator CR merge failures", monitoring.WithLabels(MergeErrorLabel), ) // CRDeletionTotal counts the number of times // IstioOperator CR was deleted. CRDeletionTotal = monitoring.NewSum( "cr_deletion_total", "Number of IstioOperator CR deleted", ) // CRValidationErrorTotal counts the number of CR // validation failures. CRValidationErrorTotal = monitoring.NewSum( "cr_validation_error_total", "Number of IstioOperator CR validation failures", ) // RenderManifestTotal counts the number of manifest // renders at each component level. RenderManifestTotal = monitoring.NewSum( "render_manifest_total", "Number of component manifests rendered", monitoring.WithLabels(ComponentNameLabel), ) // OwnedResourceTotal indicates the number of resources // currently owned by the CR with given name and revision. OwnedResourceTotal = monitoring.NewGauge( "owned_resource_total", "Number of resources currently owned by the operator", monitoring.WithLabels(ResourceKindLabel), ) // ResourceCreationTotal indicates the number of resources // created by the operator for a CR and revision. ResourceCreationTotal = monitoring.NewSum( "resource_creation_total", "Number of resources created by the operator", monitoring.WithLabels(ResourceKindLabel), ) // ResourceUpdateTotal indicates the number of resources updated by // the operator in response to CR updates for a revision. ResourceUpdateTotal = monitoring.NewSum( "resource_update_total", "Number of resources updated by the operator", monitoring.WithLabels(ResourceKindLabel), ) // ResourceDeletionTotal indicates the number of resources deleted // by the operator in response to CR update or delete operation (like // ingress-gateway which was enabled could be disabled and this requires // deleting ingress-gateway deployment). ResourceDeletionTotal = monitoring.NewSum( "resource_deletion_total", "Number of resources deleted by the operator", monitoring.WithLabels(ResourceKindLabel), ) // ResourcePruneTotal indicates the resources pruned as a result of update. ResourcePruneTotal = monitoring.NewSum( "resource_prune_total", "Number of resources pruned by the operator", monitoring.WithLabels(ResourceKindLabel), ) // ManifestPatchErrorTotal counts the total number of K8S patch errors. ManifestPatchErrorTotal = monitoring.NewSum( "manifest_patch_error_total", "Number of times K8S patch overlays failed", ) // ManifestRenderErrorTotal counts errors occurred while rendering manifest. ManifestRenderErrorTotal = monitoring.NewSum( "manifest_render_error_total", "Number of times error occurred during rendering output manifest", monitoring.WithLabels(ComponentNameLabel, RenderErrorLabel), ) // LegacyPathTranslationTotal counts the translations from legacy API to new one. LegacyPathTranslationTotal = monitoring.NewSum( "legacy_path_translation_total", "Number of times a legacy API path is translated", ) // CacheFlushTotal counts number of cache flushes. CacheFlushTotal = monitoring.NewSum( "cache_flush_total", "number of times operator cache was flushed", ) ) func init() { monitoring.MustRegister( Version, GetCRErrorTotal, CRMergeFailureTotal, CRValidationErrorTotal, CRDeletionTotal, RenderManifestTotal, OwnedResourceTotal, ResourceCreationTotal, ResourceUpdateTotal, ResourceDeletionTotal, ResourcePruneTotal, ManifestPatchErrorTotal, ManifestRenderErrorTotal, LegacyPathTranslationTotal, CacheFlushTotal, ) initOperatorCrdResourceMetrics() }