metrics/metrics.go (492 lines of code) (raw):

package main import ( "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "gitlab.com/gitlab-com/gl-infra/ringctl/lib/patch" "gitlab.com/gitlab-org/release-tools/metrics/internal/experiments" "gitlab.com/gitlab-org/release-tools/metrics/internal/handlers" "gitlab.com/gitlab-org/release-tools/metrics/internal/metrics" "gitlab.com/gitlab-org/release-tools/metrics/internal/metrics/labels" ) // initMetrics initializes and resets all the metrics. // Every metrics that requires an API handler must be returned in []handlers.Pluggable // // NOTE: the labels order is important! Pay attention to never change metric.WithLabel // order when initializing a metric. func initMetrics(logger *logrus.Logger, consulHost string, consulPort int) ([]handlers.Pluggable, error) { _, err := metrics.NewCounterVec( metrics.WithName("info"), metrics.WithSubsystem("version"), metrics.WithHelp("Version info metadata"), metrics.WithLabel(labels.Nil("build_date")), metrics.WithLabel(labels.Nil("revision")), metrics.WithLabelReset(BuildDate, Revision)) if err != nil { return nil, err } pluggables := make([]handlers.Pluggable, 0) err = initDeploymentMetrics(&pluggables) if err != nil { return pluggables, err } err = initPackagesMetrics(&pluggables) if err != nil { return pluggables, err } err = initExperimentsMetrics(&pluggables) if err != nil { return pluggables, err } err = initAutoDeployMetrics(&pluggables, logger, consulHost, consulPort) if err != nil { return pluggables, err } err = initAutoBuildMetrics(&pluggables) if err != nil { return pluggables, err } err = initReleaseMetrics(&pluggables, logger, consulHost, consulPort) if err != nil { return pluggables, err } err = initTissueMetrics(&pluggables) if err != nil { return pluggables, err } return pluggables, nil } func initDeploymentMetrics(pluggables *[]handlers.Pluggable) error { subsystem := "deployment" deploymentLabelValues := map[string][]string{ "deployment_type": {"coordinator_pipeline"}, "target_env": {"gstg-ref", "gstg-cny", "gstg", "gprd-cny", "gprd"}, } durationHistogram, err := metrics.NewHistogramVec( metrics.WithName("duration_seconds"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Duration of the coordinated deployment pipeline, from staging to production"), metrics.WithBuckets(prometheus.LinearBuckets(12_600, 30*60, 14)), // 14 buckets of 30 minutes ranging from 3.5hrs to 10h. metrics.WithLabel(labels.FromValues("deployment_type", deploymentLabelValues["deployment_type"])), metrics.WithLabel(labels.SuccessOrFailed("status")), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewHistogram(durationHistogram)) durationGauge, err := metrics.NewGaugeVec( metrics.WithName("duration_last_seconds"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Duration of the last coordinated deployment pipeline, from staging to production"), metrics.WithLabel(labels.FromValues("deployment_type", deploymentLabelValues["deployment_type"])), metrics.WithLabel(labels.SuccessOrFailed("status")), metrics.WithLabel(labels.FullDeployVersion("deploy_version")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(durationGauge)) leadtimeGauge, err := metrics.NewGaugeVec( metrics.WithName("merge_request_lead_time_seconds"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Time it takes from MR merge to production"), metrics.WithLabel(labels.Environment("target_env")), metrics.WithLabel(labels.Stage("target_stage")), metrics.WithLabel(labels.Integer("deployment_id")), metrics.WithLabel(labels.Integer("mr_id")), metrics.WithLabel(labels.FullDeployVersion("deploy_version")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(leadtimeGauge)) adjustedLeadtimeGauge, err := metrics.NewGaugeVec( metrics.WithName("merge_request_adjusted_lead_time_seconds"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Time it takes from MR merge to production, adjusted to ignore weekends"), metrics.WithLabel(labels.Environment("target_env")), metrics.WithLabel(labels.Stage("target_stage")), metrics.WithLabel(labels.Integer("deployment_id")), metrics.WithLabel(labels.Integer("mr_id")), metrics.WithLabel(labels.FullDeployVersion("deploy_version")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(adjustedLeadtimeGauge)) envLabelValues := labels.Environment("").Values() stageLabelValues := labels.Stage("").Values() envLabelValuesWithEmpty := append(envLabelValues, "") stageLabelValuesWithEmpty := append(stageLabelValues, "") pipelineDurationGauge, err := metrics.NewGaugeVec( metrics.WithName("pipeline_duration_seconds"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Wall clock duration of pipelines"), metrics.WithLabel(labels.AutoDeployPipelineProjects("project_name")), metrics.WithLabel(labels.FullDeployVersion("deploy_version")), metrics.WithLabel(labels.PipelineStatus("pipeline_status")), metrics.WithLabel(labels.AnyString("pipeline_name")), metrics.WithLabel(labels.Integer("pipeline_id")), metrics.WithLabel(labels.WebURL("web_url")), // Used for the dashboard links metrics.WithLabel(labels.FromValues("target_env", envLabelValuesWithEmpty)), metrics.WithLabel(labels.FromValues("target_stage", stageLabelValuesWithEmpty)), metrics.WithLabel(labels.AnyString("upstream_pipeline_name")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(pipelineDurationGauge)) deployerPipelineDurationHistogram, err := metrics.NewHistogramVec( metrics.WithName("coordinator_pipeline_duration_seconds"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Wall clock duration of deployment coordinator pipelines"), // Deployment pipelines usually take about 4-6 hours metrics.WithBuckets(prometheus.LinearBuckets(12_600, 30*60, 14)), // 14 buckets of 30 minutes ranging from 3.5hrs to 10h. metrics.WithLabel(labels.PipelineStatus("pipeline_status")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewHistogram(deployerPipelineDurationHistogram)) packagerOmnibusPipelineDurationHistogram, err := metrics.NewHistogramVec( metrics.WithName("packager_omnibus_pipeline_duration_seconds"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Duration of Omnibus packaging pipeline"), // Omnibus packaging pipelines typically take about 50 minutes - 1.5 hours metrics.WithBuckets(prometheus.LinearBuckets(1800, 10*60, 10)), // 10 buckets of 10 minutes ranging from 30 minutes to 2 hours 10 minutes metrics.WithLabel(labels.PipelineStatus("pipeline_status")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewHistogram(packagerOmnibusPipelineDurationHistogram)) packagerCngPipelineDurationHistogram, err := metrics.NewHistogramVec( metrics.WithName("packager_cng_pipeline_duration_seconds"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Duration of CNG packaging pipeline"), // CNG packaging pipelines typically take about 40 minutes - 1 hour metrics.WithBuckets(prometheus.LinearBuckets(1800, 6*60, 10)), // 10 buckets of 6 minutes ranging from 30 minutes to 1.5 hours metrics.WithLabel(labels.PipelineStatus("pipeline_status")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewHistogram(packagerCngPipelineDurationHistogram)) jobDurationGauge, err := metrics.NewGaugeVec( metrics.WithName("job_duration_seconds"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Wall clock duration of jobs"), metrics.WithLabel(labels.AnyString("job_name")), metrics.WithLabel(labels.AnyString("job_stage")), metrics.WithLabel(labels.SuccessOrFailed("job_status")), metrics.WithLabel(labels.AutoDeployPipelineProjects("project_name")), metrics.WithLabel(labels.FullDeployVersion("deploy_version")), metrics.WithLabel(labels.FromValues("target_env", envLabelValuesWithEmpty)), metrics.WithLabel(labels.FromValues("target_stage", stageLabelValuesWithEmpty)), metrics.WithLabel(labels.AnyString("short_job_name")), metrics.WithLabel(labels.WebURL("web_url")), // Used for the dashboard links metrics.WithLabel(labels.Integer("job_id")), metrics.WithLabel(labels.Integer("pipeline_id")), metrics.WithLabel(labels.AnyString("pipeline_name")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(jobDurationGauge)) deploymentStartedCounter, err := metrics.NewCounterVec( metrics.WithName("started_total"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of deployments started for each environment"), metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewCounter(deploymentStartedCounter)) deploymentCanRollbackCounter, err := metrics.NewCounterVec( metrics.WithName("can_rollback_total"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of deployments suitable for rollback for each environment"), metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewCounter(deploymentCanRollbackCounter)) rollbacksCounter, err := metrics.NewCounterVec( metrics.WithName("rollbacks_started_total"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of rollbacks started for each environment"), metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewCounter(rollbacksCounter)) deploymentCompletedCounter, err := metrics.NewCounterVec( metrics.WithName("completed_total"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of deployments completed for each environment"), metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewCounter(deploymentCompletedCounter)) deploymentStartedGauge, err := metrics.NewGaugeVec( metrics.WithName("started"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of deployments started for each environment as gauge"), metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])), metrics.WithLabel(labels.FullDeployVersion("version")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(deploymentStartedGauge)) deploymentCompletedGauge, err := metrics.NewGaugeVec( metrics.WithName("completed"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of deployments completed for each environment as gauge"), metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])), metrics.WithLabel(labels.FullDeployVersion("version")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(deploymentCompletedGauge)) deploymentFailedAtLeastOnceGauge, err := metrics.NewGaugeVec( metrics.WithName("failed_atleast_once"), metrics.WithSubsystem(subsystem), metrics.WithHelp("A deployment with at least one failed job"), metrics.WithLabel(labels.FullDeployVersion("deploy_version")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(deploymentFailedAtLeastOnceGauge)) deploymentBlockerCountGauge, err := metrics.NewGaugeVec( metrics.WithName("blocker_count"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Deployment Blocker Count Per Category"), metrics.WithLabel(labels.RootCause("root_cause")), metrics.WithLabel(labels.Date("week")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(deploymentBlockerCountGauge)) deploymentHoursBlockedGauge, err := metrics.NewGaugeVec( metrics.WithName("hours_blocked"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Deployment Hours Blocked"), metrics.WithLabel(labels.RootCause("root_cause")), metrics.WithLabel(labels.Environment("target_env")), metrics.WithLabel(labels.Date("week")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(deploymentHoursBlockedGauge)) return nil } func initPackagesMetrics(pluggables *[]handlers.Pluggable) error { subsystem := "packages" //NOTE (nolith): we should consider tracking also RCs and public packages pkgTypes := []string{"auto_deploy", "monthly", "patch", "rc", "security"} securityTypes := []string{"no", "regular", "critical"} taggingCounter, err := metrics.NewCounterVec( metrics.WithName("tagging_total"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of tagged packages by package type"), metrics.WithLabel(labels.FromValues("pkg_type", pkgTypes)), metrics.WithLabel(labels.FromValues("security", securityTypes)), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewCounter(taggingCounter)) return nil } func initExperimentsMetrics(pluggables *[]handlers.Pluggable) error { experimentHandlers, err := experiments.GetHandlers() if err != nil { return err } *pluggables = append(*pluggables, experimentHandlers...) return nil } func initAutoBuildMetrics(pluggables *[]handlers.Pluggable) error { subsystem := "auto_build" pressureGauge, err := metrics.NewGaugeVec( metrics.WithName("pressure"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of commits not yet included in a package"), metrics.WithLabel(labels.AutoDeployProjects("project_name")), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(pressureGauge)) return nil } func initAutoDeployMetrics(pluggables *[]handlers.Pluggable, logger *logrus.Logger, consulHost string, consulPort int) error { subsystem := "auto_deploy" roles := []string{"gstg-cny", "gprd-cny", "gstg", "gprd"} pressureGauge, err := metrics.NewGaugeVec( metrics.WithName("pressure"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of commits not yet deployed to an environment"), metrics.WithLabel(labels.FromValues("role", roles)), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(pressureGauge)) pickCounter, err := metrics.NewCounterVec( metrics.WithName("picks_total"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of merge requests picked into auto-deploy branch"), metrics.WithLabel(labels.SuccessOrFailed("status")), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewCounter(pickCounter)) pipelineStatusCounter, err := metrics.NewCounterVec( metrics.WithName("gitlab_pipeline_total"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of gitlab pipelines in a given status at the time of rollout"), metrics.WithLabel(labels.PipelineStatus("status")), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewCounter(pipelineStatusCounter)) packageStatusGauge, err := metrics.NewGaugeVec( metrics.WithName("package_state"), metrics.WithSubsystem(subsystem), metrics.WithHelp( "Tracks states of auto deploy packages'"), metrics.WithLabel(labels.PackagerProjects("project_path")), metrics.WithLabel(labels.FromValues("pkg_state", []string{"missing", "pending", "building", "ready", "failed"})), metrics.WithLabel(labels.FullDeployVersion("version")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(packageStatusGauge)) newPackageGauge, err := metrics.NewGaugeVec( metrics.WithName("building_package_state"), metrics.WithSubsystem(subsystem), metrics.WithHelp( "Tracks new auto deploy packages (newer than what's on gstg-cny) being built"), metrics.WithLabel(labels.FromValues("pkg_state", []string{"running", "success", "failed"})), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(newPackageGauge)) envStatuses := []string{"ready", "locked", "baking_time", "awaiting_promotion"} envDeployState, err := metrics.NewGaugeVec( metrics.WithName("environment_state"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Tracks deployment-related state of each environment"), metrics.WithLabel(labels.FromValues("env_state", envStatuses)), metrics.WithLabel(labels.Environment("target_env")), metrics.WithLabel(labels.Stage("target_stage")), metrics.WithConsulMemory(consulHost, consulPort, logger), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(envDeployState)) lockReasons := []string{"locked_deployment", "locked_deployment_failed", "locked_qa", "locked_qa_failed", "locked_post_deploy_migration", "locked_post_deploy_migration_failed"} envLockState, err := metrics.NewGaugeVec( metrics.WithName("lock_state"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Tracks deployment lock states and lock reasons of each environment"), metrics.WithLabel(labels.FromValues("lock_reason", lockReasons)), metrics.WithLabel(labels.Environment("target_env")), metrics.WithLabel(labels.Stage("target_stage")), metrics.WithCartesianProductLabelReset(), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(envLockState)) return nil } func initReleaseMetrics(pluggables *[]handlers.Pluggable, logger *logrus.Logger, consulHost string, consulPort int) error { subsystem := "release" severities := []string{"severity::1", "severity::2", "severity::3", "severity::4", "none"} pressureGauge, err := metrics.NewGaugeVec( metrics.WithName("pressure"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of unreleased merge requests"), metrics.WithLabel(labels.FromValues("severity", severities)), metrics.WithLabel(labels.MinorVersion("version")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(pressureGauge)) // Status (value) // Open (1): Any commit that reached production is expected to be released with the next monthly release // Announced (2): Signal the RC tagging date is getting closer // Tagged_RC (3): RC has been tagged; any commit that has not reached production prior to the tagging time is // not included in the monthly release. monthlyStatusGauge, err := metrics.NewGaugeVec( metrics.WithName("monthly_status"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Status of monthly release"), metrics.WithLabel(labels.ReleaseDate("release_date")), metrics.WithLabel(labels.MinorVersion("version")), metrics.WithConsulMemory(consulHost, consulPort, logger), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(monthlyStatusGauge)) // Status (value) // Open (1): Any 'security-target' security issue is expected to be processed in the patch release // Warning (2): Signal the Merging date is getting closer (Friday before release) // Merged (3): Default branch MRs have been merged; no more security issue will be processed for the patch release patchStatusGauge, err := metrics.NewGaugeVec( metrics.WithName("patch_status"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Status of patch release"), metrics.WithLabel(labels.ReleaseDate("release_date")), metrics.WithLabel(labels.AnyString("versions")), metrics.WithConsulMemory(consulHost, consulPort, logger), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(patchStatusGauge)) return nil } func initTissueMetrics(pluggables *[]handlers.Pluggable) error { subsystem := "tissue" ampClusters := []string{"cellsdev", "cellsprod"} patchStatuses := make([]string, 0, len(patch.Statuses)) for _, patchStatus := range patch.Statuses { patchStatuses = append(patchStatuses, string(patchStatus)) } patchesGauge, err := metrics.NewGaugeVec( metrics.WithName("patches_queued_current"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of patches that are queued up waiting to be processed"), metrics.WithLabel(labels.FromValues("amp", ampClusters)), metrics.WithLabel(labels.Integer("ring")), metrics.WithLabel(labels.FromValues("patch_status", patchStatuses)), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewGauge(patchesGauge)) patchesCounter, err := metrics.NewCounterVec( metrics.WithName("patches_processed_total"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of patches processed"), metrics.WithLabel(labels.FromValues("amp", ampClusters)), metrics.WithLabel(labels.Integer("ring")), ) if err != nil { return err } *pluggables = append(*pluggables, handlers.NewCounter(patchesCounter)) return nil }