in metrics/metrics.go [71:341]
// initDeploymentMetrics builds every metric of the "deployment" subsystem and
// appends one handler per metric to *pluggables, in declaration order.
//
// It returns the first error reported by a metric constructor. Handlers that
// were appended before the failing constructor remain in *pluggables.
func initDeploymentMetrics(pluggables *[]handlers.Pluggable) error {
	subsystem := "deployment"

	// Fixed label value sets shared by several metrics below.
	deploymentTypes := []string{"coordinator_pipeline"}
	targetEnvs := []string{"gstg-ref", "gstg-cny", "gstg", "gprd-cny", "gprd"}

	// --- Coordinated deployment duration ---

	durationHistogram, err := metrics.NewHistogramVec(
		metrics.WithName("duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Duration of the coordinated deployment pipeline, from staging to production"),
		metrics.WithBuckets(prometheus.LinearBuckets(12_600, 30*60, 14)), // 14 buckets of 30 minutes ranging from 3.5h to 10h.
		metrics.WithLabel(labels.FromValues("deployment_type", deploymentTypes)),
		metrics.WithLabel(labels.SuccessOrFailed("status")),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewHistogram(durationHistogram))

	durationGauge, err := metrics.NewGaugeVec(
		metrics.WithName("duration_last_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Duration of the last coordinated deployment pipeline, from staging to production"),
		metrics.WithLabel(labels.FromValues("deployment_type", deploymentTypes)),
		metrics.WithLabel(labels.SuccessOrFailed("status")),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(durationGauge))

	// --- Merge request lead time ---

	leadtimeGauge, err := metrics.NewGaugeVec(
		metrics.WithName("merge_request_lead_time_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Time it takes from MR merge to production"),
		metrics.WithLabel(labels.Environment("target_env")),
		metrics.WithLabel(labels.Stage("target_stage")),
		metrics.WithLabel(labels.Integer("deployment_id")),
		metrics.WithLabel(labels.Integer("mr_id")),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(leadtimeGauge))

	adjustedLeadtimeGauge, err := metrics.NewGaugeVec(
		metrics.WithName("merge_request_adjusted_lead_time_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Time it takes from MR merge to production, adjusted to ignore weekends"),
		metrics.WithLabel(labels.Environment("target_env")),
		metrics.WithLabel(labels.Stage("target_stage")),
		metrics.WithLabel(labels.Integer("deployment_id")),
		metrics.WithLabel(labels.Integer("mr_id")),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(adjustedLeadtimeGauge))

	// --- Pipeline and job durations ---

	// The pipeline and job gauges accept the regular environment/stage values
	// plus the empty string.
	// NOTE(review): presumably "" covers pipelines/jobs that are not tied to
	// an environment or stage — confirm against the producers of these metrics.
	//
	// The full slice expressions cap capacity at length so that append always
	// allocates a new backing array instead of writing into whatever array
	// Values() handed out.
	envLabelValues := labels.Environment("").Values()
	stageLabelValues := labels.Stage("").Values()
	envLabelValuesWithEmpty := append(envLabelValues[:len(envLabelValues):len(envLabelValues)], "")
	stageLabelValuesWithEmpty := append(stageLabelValues[:len(stageLabelValues):len(stageLabelValues)], "")

	pipelineDurationGauge, err := metrics.NewGaugeVec(
		metrics.WithName("pipeline_duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Wall clock duration of pipelines"),
		metrics.WithLabel(labels.AutoDeployPipelineProjects("project_name")),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
		metrics.WithLabel(labels.PipelineStatus("pipeline_status")),
		metrics.WithLabel(labels.AnyString("pipeline_name")),
		metrics.WithLabel(labels.Integer("pipeline_id")),
		metrics.WithLabel(labels.WebURL("web_url")), // Used for the dashboard links
		metrics.WithLabel(labels.FromValues("target_env", envLabelValuesWithEmpty)),
		metrics.WithLabel(labels.FromValues("target_stage", stageLabelValuesWithEmpty)),
		metrics.WithLabel(labels.AnyString("upstream_pipeline_name")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(pipelineDurationGauge))

	deployerPipelineDurationHistogram, err := metrics.NewHistogramVec(
		metrics.WithName("coordinator_pipeline_duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Wall clock duration of deployment coordinator pipelines"),
		// Deployment pipelines usually take about 4-6 hours
		metrics.WithBuckets(prometheus.LinearBuckets(12_600, 30*60, 14)), // 14 buckets of 30 minutes ranging from 3.5h to 10h.
		metrics.WithLabel(labels.PipelineStatus("pipeline_status")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewHistogram(deployerPipelineDurationHistogram))

	packagerOmnibusPipelineDurationHistogram, err := metrics.NewHistogramVec(
		metrics.WithName("packager_omnibus_pipeline_duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Duration of Omnibus packaging pipeline"),
		// Omnibus packaging pipelines typically take about 50 minutes - 1.5 hours
		metrics.WithBuckets(prometheus.LinearBuckets(1800, 10*60, 10)), // 10 buckets of 10 minutes ranging from 30 minutes to 2 hours.
		metrics.WithLabel(labels.PipelineStatus("pipeline_status")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewHistogram(packagerOmnibusPipelineDurationHistogram))

	packagerCngPipelineDurationHistogram, err := metrics.NewHistogramVec(
		metrics.WithName("packager_cng_pipeline_duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Duration of CNG packaging pipeline"),
		// CNG packaging pipelines typically take about 40 minutes - 1 hour
		metrics.WithBuckets(prometheus.LinearBuckets(1800, 6*60, 10)), // 10 buckets of 6 minutes ranging from 30 minutes to 1 hour 24 minutes.
		metrics.WithLabel(labels.PipelineStatus("pipeline_status")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewHistogram(packagerCngPipelineDurationHistogram))

	jobDurationGauge, err := metrics.NewGaugeVec(
		metrics.WithName("job_duration_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Wall clock duration of jobs"),
		metrics.WithLabel(labels.AnyString("job_name")),
		metrics.WithLabel(labels.AnyString("job_stage")),
		metrics.WithLabel(labels.SuccessOrFailed("job_status")),
		metrics.WithLabel(labels.AutoDeployPipelineProjects("project_name")),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
		metrics.WithLabel(labels.FromValues("target_env", envLabelValuesWithEmpty)),
		metrics.WithLabel(labels.FromValues("target_stage", stageLabelValuesWithEmpty)),
		metrics.WithLabel(labels.AnyString("short_job_name")),
		metrics.WithLabel(labels.WebURL("web_url")), // Used for the dashboard links
		metrics.WithLabel(labels.Integer("job_id")),
		metrics.WithLabel(labels.Integer("pipeline_id")),
		metrics.WithLabel(labels.AnyString("pipeline_name")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(jobDurationGauge))

	// --- Per-environment deployment counters ---

	deploymentStartedCounter, err := metrics.NewCounterVec(
		metrics.WithName("started_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of deployments started for each environment"),
		metrics.WithLabel(labels.FromValues("target_env", targetEnvs)),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewCounter(deploymentStartedCounter))

	deploymentCanRollbackCounter, err := metrics.NewCounterVec(
		metrics.WithName("can_rollback_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of deployments suitable for rollback for each environment"),
		metrics.WithLabel(labels.FromValues("target_env", targetEnvs)),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewCounter(deploymentCanRollbackCounter))

	rollbacksCounter, err := metrics.NewCounterVec(
		metrics.WithName("rollbacks_started_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of rollbacks started for each environment"),
		metrics.WithLabel(labels.FromValues("target_env", targetEnvs)),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewCounter(rollbacksCounter))

	deploymentCompletedCounter, err := metrics.NewCounterVec(
		metrics.WithName("completed_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of deployments completed for each environment"),
		metrics.WithLabel(labels.FromValues("target_env", targetEnvs)),
		metrics.WithCartesianProductLabelReset(),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewCounter(deploymentCompletedCounter))

	// --- Per-environment, per-version deployment gauges ---

	deploymentStartedGauge, err := metrics.NewGaugeVec(
		metrics.WithName("started"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of deployments started for each environment as gauge"),
		metrics.WithLabel(labels.FromValues("target_env", targetEnvs)),
		metrics.WithLabel(labels.FullDeployVersion("version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(deploymentStartedGauge))

	deploymentCompletedGauge, err := metrics.NewGaugeVec(
		metrics.WithName("completed"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of deployments completed for each environment as gauge"),
		metrics.WithLabel(labels.FromValues("target_env", targetEnvs)),
		metrics.WithLabel(labels.FullDeployVersion("version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(deploymentCompletedGauge))

	deploymentFailedAtLeastOnceGauge, err := metrics.NewGaugeVec(
		metrics.WithName("failed_atleast_once"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("A deployment with at least one failed job"),
		metrics.WithLabel(labels.FullDeployVersion("deploy_version")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(deploymentFailedAtLeastOnceGauge))

	// --- Deployment blockers ---

	deploymentBlockerCountGauge, err := metrics.NewGaugeVec(
		metrics.WithName("blocker_count"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Deployment Blocker Count Per Category"),
		metrics.WithLabel(labels.RootCause("root_cause")),
		metrics.WithLabel(labels.Date("week")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(deploymentBlockerCountGauge))

	deploymentHoursBlockedGauge, err := metrics.NewGaugeVec(
		metrics.WithName("hours_blocked"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Deployment Hours Blocked"),
		metrics.WithLabel(labels.RootCause("root_cause")),
		metrics.WithLabel(labels.Environment("target_env")),
		metrics.WithLabel(labels.Date("week")),
	)
	if err != nil {
		return err
	}
	*pluggables = append(*pluggables, handlers.NewGauge(deploymentHoursBlockedGauge))

	return nil
}