metrics/metrics.go (492 lines of code) (raw):
package main
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
"gitlab.com/gitlab-com/gl-infra/ringctl/lib/patch"
"gitlab.com/gitlab-org/release-tools/metrics/internal/experiments"
"gitlab.com/gitlab-org/release-tools/metrics/internal/handlers"
"gitlab.com/gitlab-org/release-tools/metrics/internal/metrics"
"gitlab.com/gitlab-org/release-tools/metrics/internal/metrics/labels"
)
// initMetrics initializes and resets all the metrics.
// Every metric that requires an API handler must be returned in []handlers.Pluggable.
//
// The version info counter is created first; its value is discarded, so it is
// not exposed through a handler. The subsystem initializers then run in a
// fixed order, each one appending its handlers to the returned slice. The
// first initializer that fails stops the sequence: its error is returned
// together with the handlers collected up to that point.
//
// NOTE: the labels order is important! Pay attention to never change the
// metrics.WithLabel order when initializing a metric.
func initMetrics(logger *logrus.Logger, consulHost string, consulPort int) ([]handlers.Pluggable, error) {
	if _, err := metrics.NewCounterVec(
		metrics.WithName("info"),
		metrics.WithSubsystem("version"),
		metrics.WithHelp("Version info metadata"),
		metrics.WithLabel(labels.Nil("build_date")),
		metrics.WithLabel(labels.Nil("revision")),
		metrics.WithLabelReset(BuildDate, Revision),
	); err != nil {
		return nil, err
	}

	pluggables := make([]handlers.Pluggable, 0)

	// Initializers in execution order. The ones that need the logger and the
	// Consul coordinates are adapted to the common signature with a closure.
	steps := []func(*[]handlers.Pluggable) error{
		initDeploymentMetrics,
		initPackagesMetrics,
		initExperimentsMetrics,
		func(p *[]handlers.Pluggable) error {
			return initAutoDeployMetrics(p, logger, consulHost, consulPort)
		},
		initAutoBuildMetrics,
		func(p *[]handlers.Pluggable) error {
			return initReleaseMetrics(p, logger, consulHost, consulPort)
		},
		initTissueMetrics,
	}

	for _, step := range steps {
		if err := step(&pluggables); err != nil {
			return pluggables, err
		}
	}

	return pluggables, nil
}
// initDeploymentMetrics creates the metrics of the "deployment" subsystem and
// appends one handler per metric to pluggables.
//
// It returns the first error reported by a metric constructor. Handlers that
// were appended before the failing constructor remain in pluggables.
//
// NOTE: the labels order is important, never reorder the metrics.WithLabel
// options of an existing metric.
func initDeploymentMetrics(pluggables *[]handlers.Pluggable) error {
	subsystem := "deployment"

	deploymentLabelValues := map[string][]string{
		"deployment_type": {"coordinator_pipeline"},
		"target_env":      {"gstg-ref", "gstg-cny", "gstg", "gprd-cny", "gprd"},
	}

	durationHistogram, err := metrics.NewHistogramVec(
		metrics.WithName("duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Duration of the coordinated deployment pipeline, from staging to production"),
		metrics.WithBuckets(prometheus.LinearBuckets(12_600, 30*60, 14)), // 14 buckets of 30 minutes ranging from 3.5hrs to 10h.
		metrics.WithLabel(labels.FromValues("deployment_type", deploymentLabelValues["deployment_type"])),
		metrics.WithLabel(labels.SuccessOrFailed("status")),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewHistogram(durationHistogram))

	durationGauge, err := metrics.NewGaugeVec(
		metrics.WithName("duration_last_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Duration of the last coordinated deployment pipeline, from staging to production"),
		metrics.WithLabel(labels.FromValues("deployment_type", deploymentLabelValues["deployment_type"])),
		metrics.WithLabel(labels.SuccessOrFailed("status")),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(durationGauge))

	leadtimeGauge, err := metrics.NewGaugeVec(
		metrics.WithName("merge_request_lead_time_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Time it takes from MR merge to production"),
		metrics.WithLabel(labels.Environment("target_env")),
		metrics.WithLabel(labels.Stage("target_stage")),
		metrics.WithLabel(labels.Integer("deployment_id")),
		metrics.WithLabel(labels.Integer("mr_id")),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(leadtimeGauge))

	adjustedLeadtimeGauge, err := metrics.NewGaugeVec(
		metrics.WithName("merge_request_adjusted_lead_time_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Time it takes from MR merge to production, adjusted to ignore weekends"),
		metrics.WithLabel(labels.Environment("target_env")),
		metrics.WithLabel(labels.Stage("target_stage")),
		metrics.WithLabel(labels.Integer("deployment_id")),
		metrics.WithLabel(labels.Integer("mr_id")),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(adjustedLeadtimeGauge))

	// Pipeline and job metrics also accept an empty environment/stage value.
	// The full slice expressions cap the capacity at the current length, so
	// each append below is forced to allocate a new backing array instead of
	// writing the extra "" element into storage owned by the labels package.
	envLabelValues := labels.Environment("").Values()
	stageLabelValues := labels.Stage("").Values()
	envLabelValuesWithEmpty := append(envLabelValues[:len(envLabelValues):len(envLabelValues)], "")
	stageLabelValuesWithEmpty := append(stageLabelValues[:len(stageLabelValues):len(stageLabelValues)], "")

	pipelineDurationGauge, err := metrics.NewGaugeVec(
		metrics.WithName("pipeline_duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Wall clock duration of pipelines"),
		metrics.WithLabel(labels.AutoDeployPipelineProjects("project_name")),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
		metrics.WithLabel(labels.PipelineStatus("pipeline_status")),
		metrics.WithLabel(labels.AnyString("pipeline_name")),
		metrics.WithLabel(labels.Integer("pipeline_id")),
		metrics.WithLabel(labels.WebURL("web_url")), // Used for the dashboard links
		metrics.WithLabel(labels.FromValues("target_env", envLabelValuesWithEmpty)),
		metrics.WithLabel(labels.FromValues("target_stage", stageLabelValuesWithEmpty)),
		metrics.WithLabel(labels.AnyString("upstream_pipeline_name")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(pipelineDurationGauge))

	deployerPipelineDurationHistogram, err := metrics.NewHistogramVec(
		metrics.WithName("coordinator_pipeline_duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Wall clock duration of deployment coordinator pipelines"),
		// Deployment pipelines usually take about 4-6 hours
		metrics.WithBuckets(prometheus.LinearBuckets(12_600, 30*60, 14)), // 14 buckets of 30 minutes ranging from 3.5hrs to 10h.
		metrics.WithLabel(labels.PipelineStatus("pipeline_status")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewHistogram(deployerPipelineDurationHistogram))

	packagerOmnibusPipelineDurationHistogram, err := metrics.NewHistogramVec(
		metrics.WithName("packager_omnibus_pipeline_duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Duration of Omnibus packaging pipeline"),
		// Omnibus packaging pipelines typically take about 50 minutes - 1.5 hours
		metrics.WithBuckets(prometheus.LinearBuckets(1800, 10*60, 10)), // 10 buckets of 10 minutes ranging from 30 minutes to 2 hours (1800s + 9*600s = 7200s).
		metrics.WithLabel(labels.PipelineStatus("pipeline_status")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewHistogram(packagerOmnibusPipelineDurationHistogram))

	packagerCngPipelineDurationHistogram, err := metrics.NewHistogramVec(
		metrics.WithName("packager_cng_pipeline_duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Duration of CNG packaging pipeline"),
		// CNG packaging pipelines typically take about 40 minutes - 1 hour
		metrics.WithBuckets(prometheus.LinearBuckets(1800, 6*60, 10)), // 10 buckets of 6 minutes ranging from 30 minutes to 1 hour 24 minutes (1800s + 9*360s = 5040s).
		metrics.WithLabel(labels.PipelineStatus("pipeline_status")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewHistogram(packagerCngPipelineDurationHistogram))

	jobDurationGauge, err := metrics.NewGaugeVec(
		metrics.WithName("job_duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Wall clock duration of jobs"),
		metrics.WithLabel(labels.AnyString("job_name")),
		metrics.WithLabel(labels.AnyString("job_stage")),
		metrics.WithLabel(labels.SuccessOrFailed("job_status")),
		metrics.WithLabel(labels.AutoDeployPipelineProjects("project_name")),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
		metrics.WithLabel(labels.FromValues("target_env", envLabelValuesWithEmpty)),
		metrics.WithLabel(labels.FromValues("target_stage", stageLabelValuesWithEmpty)),
		metrics.WithLabel(labels.AnyString("short_job_name")),
		metrics.WithLabel(labels.WebURL("web_url")), // Used for the dashboard links
		metrics.WithLabel(labels.Integer("job_id")),
		metrics.WithLabel(labels.Integer("pipeline_id")),
		metrics.WithLabel(labels.AnyString("pipeline_name")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(jobDurationGauge))

	deploymentStartedCounter, err := metrics.NewCounterVec(
		metrics.WithName("started_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of deployments started for each environment"),
		metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewCounter(deploymentStartedCounter))

	deploymentCanRollbackCounter, err := metrics.NewCounterVec(
		metrics.WithName("can_rollback_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of deployments suitable for rollback for each environment"),
		metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewCounter(deploymentCanRollbackCounter))

	rollbacksCounter, err := metrics.NewCounterVec(
		metrics.WithName("rollbacks_started_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of rollbacks started for each environment"),
		metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewCounter(rollbacksCounter))

	deploymentCompletedCounter, err := metrics.NewCounterVec(
		metrics.WithName("completed_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of deployments completed for each environment"),
		metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewCounter(deploymentCompletedCounter))

	deploymentStartedGauge, err := metrics.NewGaugeVec(
		metrics.WithName("started"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of deployments started for each environment as gauge"),
		metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])),
		metrics.WithLabel(labels.FullDeployVersion("version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(deploymentStartedGauge))

	deploymentCompletedGauge, err := metrics.NewGaugeVec(
		metrics.WithName("completed"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of deployments completed for each environment as gauge"),
		metrics.WithLabel(labels.FromValues("target_env", deploymentLabelValues["target_env"])),
		metrics.WithLabel(labels.FullDeployVersion("version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(deploymentCompletedGauge))

	deploymentFailedAtLeastOnceGauge, err := metrics.NewGaugeVec(
		metrics.WithName("failed_atleast_once"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("A deployment with at least one failed job"),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(deploymentFailedAtLeastOnceGauge))

	deploymentBlockerCountGauge, err := metrics.NewGaugeVec(
		metrics.WithName("blocker_count"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Deployment Blocker Count Per Category"),
		metrics.WithLabel(labels.RootCause("root_cause")),
		metrics.WithLabel(labels.Date("week")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(deploymentBlockerCountGauge))

	deploymentHoursBlockedGauge, err := metrics.NewGaugeVec(
		metrics.WithName("hours_blocked"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Deployment Hours Blocked"),
		metrics.WithLabel(labels.RootCause("root_cause")),
		metrics.WithLabel(labels.Environment("target_env")),
		metrics.WithLabel(labels.Date("week")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(deploymentHoursBlockedGauge))

	return nil
}
// initPackagesMetrics creates the metrics of the "packages" subsystem and
// appends their handlers to pluggables.
//
// A single metric is defined here: the tagging_total counter, labelled by
// package type and by security kind. A constructor error is returned
// unchanged and nothing is appended in that case.
func initPackagesMetrics(pluggables *[]handlers.Pluggable) error {
	const subsystem = "packages"

	// NOTE (nolith): we should consider tracking also RCs and public packages
	// NOTE(review): "rc" is already part of the list below; public packages are not.
	var (
		packageTypes  = []string{"auto_deploy", "monthly", "patch", "rc", "security"}
		securityKinds = []string{"no", "regular", "critical"}
	)

	tagged, err := metrics.NewCounterVec(
		metrics.WithName("tagging_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of tagged packages by package type"),
		metrics.WithLabel(labels.FromValues("pkg_type", packageTypes)),
		metrics.WithLabel(labels.FromValues("security", securityKinds)),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}

	*pluggables = append(*pluggables, handlers.NewCounter(tagged))

	return nil
}
// initExperimentsMetrics appends to pluggables every handler returned by
// experiments.GetHandlers.
//
// If GetHandlers fails, its error is returned and pluggables is left
// untouched.
func initExperimentsMetrics(pluggables *[]handlers.Pluggable) error {
	found, err := experiments.GetHandlers()
	if err != nil {
		return err
	}

	collected := append(*pluggables, found...)
	*pluggables = collected

	return nil
}
// initAutoBuildMetrics creates the metrics of the "auto_build" subsystem and
// appends their handlers to pluggables.
//
// A single metric is defined here: the pressure gauge, labelled by project
// name. A constructor error is returned unchanged and nothing is appended in
// that case.
func initAutoBuildMetrics(pluggables *[]handlers.Pluggable) error {
	const subsystem = "auto_build"

	pressure, err := metrics.NewGaugeVec(
		metrics.WithName("pressure"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of commits not yet included in a package"),
		metrics.WithLabel(labels.AutoDeployProjects("project_name")),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}

	*pluggables = append(*pluggables, handlers.NewGauge(pressure))

	return nil
}
// initAutoDeployMetrics creates the metrics of the "auto_deploy" subsystem and
// appends one handler per metric to pluggables.
//
// logger, consulHost and consulPort are only forwarded to
// metrics.WithConsulMemory for the environment_state gauge.
//
// It returns the first error reported by a metric constructor. Handlers that
// were appended before the failing constructor remain in pluggables.
//
// NOTE: the labels order is important, never reorder the metrics.WithLabel
// options of an existing metric.
func initAutoDeployMetrics(pluggables *[]handlers.Pluggable, logger *logrus.Logger, consulHost string, consulPort int) error {
	const subsystem = "auto_deploy"

	// register appends a single handler to the caller's slice.
	register := func(p handlers.Pluggable) {
		*pluggables = append(*pluggables, p)
	}

	// Allowed label values, grouped here so the metric definitions below
	// stay compact.
	var (
		roles          = []string{"gstg-cny", "gprd-cny", "gstg", "gprd"}
		packageStates  = []string{"missing", "pending", "building", "ready", "failed"}
		buildingStates = []string{"running", "success", "failed"}
		envStatuses    = []string{"ready", "locked", "baking_time", "awaiting_promotion"}
		lockReasons    = []string{
			"locked_deployment",
			"locked_deployment_failed",
			"locked_qa",
			"locked_qa_failed",
			"locked_post_deploy_migration",
			"locked_post_deploy_migration_failed",
		}
	)

	pressure, err := metrics.NewGaugeVec(
		metrics.WithName("pressure"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of commits not yet deployed to an environment"),
		metrics.WithLabel(labels.FromValues("role", roles)),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	register(handlers.NewGauge(pressure))

	picks, err := metrics.NewCounterVec(
		metrics.WithName("picks_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of merge requests picked into auto-deploy branch"),
		metrics.WithLabel(labels.SuccessOrFailed("status")),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	register(handlers.NewCounter(picks))

	pipelineStatuses, err := metrics.NewCounterVec(
		metrics.WithName("gitlab_pipeline_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of gitlab pipelines in a given status at the time of rollout"),
		metrics.WithLabel(labels.PipelineStatus("status")),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	register(handlers.NewCounter(pipelineStatuses))

	// NOTE(review): the help text below ends with a stray apostrophe. It is
	// kept as-is because it is part of the exposed metric metadata.
	packageState, err := metrics.NewGaugeVec(
		metrics.WithName("package_state"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Tracks states of auto deploy packages'"),
		metrics.WithLabel(labels.PackagerProjects("project_path")),
		metrics.WithLabel(labels.FromValues("pkg_state", packageStates)),
		metrics.WithLabel(labels.FullDeployVersion("version")),
	)
	if err != nil {
		return err
	}
	register(handlers.NewGauge(packageState))

	buildingPackageState, err := metrics.NewGaugeVec(
		metrics.WithName("building_package_state"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Tracks new auto deploy packages (newer than what's on gstg-cny) being built"),
		metrics.WithLabel(labels.FromValues("pkg_state", buildingStates)),
	)
	if err != nil {
		return err
	}
	register(handlers.NewGauge(buildingPackageState))

	environmentState, err := metrics.NewGaugeVec(
		metrics.WithName("environment_state"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Tracks deployment-related state of each environment"),
		metrics.WithLabel(labels.FromValues("env_state", envStatuses)),
		metrics.WithLabel(labels.Environment("target_env")),
		metrics.WithLabel(labels.Stage("target_stage")),
		metrics.WithConsulMemory(consulHost, consulPort, logger),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	register(handlers.NewGauge(environmentState))

	lockState, err := metrics.NewGaugeVec(
		metrics.WithName("lock_state"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Tracks deployment lock states and lock reasons of each environment"),
		metrics.WithLabel(labels.FromValues("lock_reason", lockReasons)),
		metrics.WithLabel(labels.Environment("target_env")),
		metrics.WithLabel(labels.Stage("target_stage")),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	register(handlers.NewGauge(lockState))

	return nil
}
// initReleaseMetrics creates the metrics of the "release" subsystem and
// appends one handler per metric to pluggables.
//
// logger, consulHost and consulPort are only forwarded to
// metrics.WithConsulMemory for the monthly_status and patch_status gauges.
//
// It returns the first error reported by a metric constructor. Handlers that
// were appended before the failing constructor remain in pluggables.
//
// NOTE: the labels order is important, never reorder the metrics.WithLabel
// options of an existing metric.
func initReleaseMetrics(pluggables *[]handlers.Pluggable, logger *logrus.Logger, consulHost string, consulPort int) error {
	const subsystem = "release"

	// register appends a single handler to the caller's slice.
	register := func(p handlers.Pluggable) {
		*pluggables = append(*pluggables, p)
	}

	pressure, err := metrics.NewGaugeVec(
		metrics.WithName("pressure"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of unreleased merge requests"),
		metrics.WithLabel(labels.FromValues("severity", []string{
			"severity::1",
			"severity::2",
			"severity::3",
			"severity::4",
			"none",
		})),
		metrics.WithLabel(labels.MinorVersion("version")),
	)
	if err != nil {
		return err
	}
	register(handlers.NewGauge(pressure))

	// Status (value)
	//   Open (1):      Any commit that reached production is expected to be
	//                  released with the next monthly release
	//   Announced (2): Signal the RC tagging date is getting closer
	//   Tagged_RC (3): RC has been tagged; any commit that has not reached
	//                  production prior to the tagging time is not included
	//                  in the monthly release.
	monthlyStatus, err := metrics.NewGaugeVec(
		metrics.WithName("monthly_status"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Status of monthly release"),
		metrics.WithLabel(labels.ReleaseDate("release_date")),
		metrics.WithLabel(labels.MinorVersion("version")),
		metrics.WithConsulMemory(consulHost, consulPort, logger),
	)
	if err != nil {
		return err
	}
	register(handlers.NewGauge(monthlyStatus))

	// Status (value)
	//   Open (1):    Any 'security-target' security issue is expected to be
	//                processed in the patch release
	//   Warning (2): Signal the Merging date is getting closer (Friday before
	//                release)
	//   Merged (3):  Default branch MRs have been merged; no more security
	//                issue will be processed for the patch release
	patchStatus, err := metrics.NewGaugeVec(
		metrics.WithName("patch_status"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Status of patch release"),
		metrics.WithLabel(labels.ReleaseDate("release_date")),
		metrics.WithLabel(labels.AnyString("versions")),
		metrics.WithConsulMemory(consulHost, consulPort, logger),
	)
	if err != nil {
		return err
	}
	register(handlers.NewGauge(patchStatus))

	return nil
}
// initTissueMetrics creates the metrics of the "tissue" subsystem and appends
// one handler per metric to pluggables.
//
// The allowed values of the patch_status label are the entries of
// patch.Statuses converted to strings.
//
// It returns the first error reported by a metric constructor. Handlers that
// were appended before the failing constructor remain in pluggables.
//
// NOTE: the labels order is important, never reorder the metrics.WithLabel
// options of an existing metric.
func initTissueMetrics(pluggables *[]handlers.Pluggable) error {
	const subsystem = "tissue"

	// register appends a single handler to the caller's slice.
	register := func(p handlers.Pluggable) {
		*pluggables = append(*pluggables, p)
	}

	clusters := []string{"cellsdev", "cellsprod"}

	// The label helper is fed plain strings, so convert every status first.
	statusNames := make([]string, 0, len(patch.Statuses))
	for _, status := range patch.Statuses {
		statusNames = append(statusNames, string(status))
	}

	queued, err := metrics.NewGaugeVec(
		metrics.WithName("patches_queued_current"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of patches that are queued up waiting to be processed"),
		metrics.WithLabel(labels.FromValues("amp", clusters)),
		metrics.WithLabel(labels.Integer("ring")),
		metrics.WithLabel(labels.FromValues("patch_status", statusNames)),
	)
	if err != nil {
		return err
	}
	register(handlers.NewGauge(queued))

	processed, err := metrics.NewCounterVec(
		metrics.WithName("patches_processed_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of patches processed"),
		metrics.WithLabel(labels.FromValues("amp", clusters)),
		metrics.WithLabel(labels.Integer("ring")),
	)
	if err != nil {
		return err
	}
	register(handlers.NewCounter(processed))

	return nil
}