metrics/internal/handlers/job_webhook.go (221 lines of code) (raw):

package handlers import ( "crypto/subtle" "io" "log" "net/http" "os" "regexp" "time" "github.com/gorilla/mux" gitlab "gitlab.com/gitlab-org/api/client-go" "gitlab.com/gitlab-org/release-tools/metrics/internal/metrics" "gitlab.com/gitlab-org/release-tools/metrics/internal/metrics/labels" ) const ( subsystem string = "webhooks" buildStatusCreated = "created" buildStatusFailed = "failed" buildStatusRunning = "running" ProjectDeployer = "ops-deployer" ProjectReleaseToolsOps = "ops-release-tools" ProjectQualityStagingCanaryOps = "ops-quality-staging-canary" ProjectQualityStagingOps = "ops-quality-staging" ProjectK8sWorkloadsGitlabComOps = "ops-k8s-workloads-gitlab-com" ProjectCNGDev = "dev-charts-components-images" ProjectOmnibusDev = "dev-omnibus" ) var jobEventMetric metrics.Counter var jobRetriesMetric metrics.Counter var jobFailureLostSecondsMetric metrics.Counter var autoDeployPipelineNameRegexes = map[string][]*regexp.Regexp{ ProjectReleaseToolsOps: {regexp.MustCompile("^Coordinator pipeline$")}, ProjectDeployer: {regexp.MustCompile("^auto-deploy: .*")}, ProjectQualityStagingCanaryOps: {regexp.MustCompile("^Deployment QA pipeline - ")}, ProjectQualityStagingOps: {regexp.MustCompile("^Deployment QA pipeline - ")}, ProjectK8sWorkloadsGitlabComOps: {regexp.MustCompile("^auto-deploy: ")}, ProjectCNGDev: {regexp.MustCompile("^AUTO_DEPLOY_BUILD_PIPELINE$")}, ProjectOmnibusDev: {regexp.MustCompile("^AUTO_DEPLOY_BUILD_PIPELINE$")}, } var gitlabOpsJobsClient jobsClient func init() { jobEventsTotalCounter, err := initJobEventsTotalMetric() if err != nil { panic(err) } jobEventMetric = jobEventsTotalCounter jobRetriesCounter, err := initJobRetriesMetric() if err != nil { panic(err) } jobRetriesMetric = jobRetriesCounter jobFailureLostSecondsCounter, err := initJobFailureLostSecondsMetric() if err != nil { panic(err) } jobFailureLostSecondsMetric = jobFailureLostSecondsCounter gitlabOpsJobsClient, err = newClient(os.Getenv("DELIVERY_METRICS_OPS_TOKEN")) if err != nil { panic(err) } } type jobWebhook struct { tokenMap map[string]string } func NewJobWebhook(tokenMap map[string]string) Pluggable { return &jobWebhook{ tokenMap: tokenMap, } } func initJobEventsTotalMetric() (metrics.Counter, error) { return metrics.NewCounterVec( metrics.WithName("job_events_total"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Total number of received job events"), metrics.WithLabel(labels.FromValues("project", autoDeployProjects())), metrics.WithLabel(labels.FromValues("pipeline_type", []string{"auto_deploy", "others"})), metrics.WithCartesianProductLabelReset(), ) } func initJobRetriesMetric() (metrics.Counter, error) { return metrics.NewCounterVec( metrics.WithName("auto_deploy_job_retries"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Total number of job retries"), metrics.WithLabel(labels.FromValues("project", autoDeployProjects())), metrics.WithLabel(labels.AnyString("job_name")), ) } func initJobFailureLostSecondsMetric() (metrics.Counter, error) { return metrics.NewCounterVec( metrics.WithName("auto_deploy_job_failure_lost_seconds"), metrics.WithSubsystem(subsystem), metrics.WithHelp("Number of seconds lost due to flaky job failures that succeed on retry"), metrics.WithLabel(labels.FromValues("project", autoDeployProjects())), metrics.WithLabel(labels.AnyString("job_name")), ) } func (j *jobWebhook) PlugRoutes(r *mux.Router) { subRouter := r.PathPrefix("/job").Subrouter() subRouter.HandleFunc("/ops/gitlab-org/release/tools", j.releaseToolsJobWebhook) subRouter.HandleFunc("/ops/gitlab-com/gl-infra/deployer", j.deployerJobWebhook) subRouter.HandleFunc("/ops/gitlab-org/quality/staging-canary", j.qualityStagingCanaryJobWebhook) subRouter.HandleFunc("/ops/gitlab-org/quality/staging", j.qualityStagingJobWebhook) subRouter.HandleFunc("/ops/gitlab-com/gl-infra/k8s-workloads/gitlab-com", j.K8sWorkloadsGitlabComJobWebhook) subRouter.HandleFunc("/dev/gitlab/charts/components/images", j.CNGJobWebhook) subRouter.HandleFunc("/dev/gitlab/omnibus-gitlab", j.OmnibusJobWebhook) } func (j *jobWebhook) releaseToolsJobWebhook(writer http.ResponseWriter, request *http.Request) { j.jobWebhook(ProjectReleaseToolsOps, writer, request) } func (j *jobWebhook) deployerJobWebhook(writer http.ResponseWriter, request *http.Request) { j.jobWebhook(ProjectDeployer, writer, request) } func (j *jobWebhook) qualityStagingCanaryJobWebhook(writer http.ResponseWriter, request *http.Request) { j.jobWebhook(ProjectQualityStagingCanaryOps, writer, request) } func (j *jobWebhook) qualityStagingJobWebhook(writer http.ResponseWriter, request *http.Request) { j.jobWebhook(ProjectQualityStagingOps, writer, request) } func (j *jobWebhook) K8sWorkloadsGitlabComJobWebhook(writer http.ResponseWriter, request *http.Request) { j.jobWebhook(ProjectK8sWorkloadsGitlabComOps, writer, request) } func (j *jobWebhook) CNGJobWebhook(writer http.ResponseWriter, request *http.Request) { j.jobWebhook(ProjectCNGDev, writer, request) } func (j *jobWebhook) OmnibusJobWebhook(writer http.ResponseWriter, request *http.Request) { j.jobWebhook(ProjectOmnibusDev, writer, request) } func (j *jobWebhook) jobWebhook(project string, writer http.ResponseWriter, request *http.Request) { token := request.Header.Get("X-Gitlab-Token") if !j.checkToken(project, token) { http.Error(writer, "Invalid token", http.StatusUnauthorized) return } // Read a max of 1 MB request.Body = http.MaxBytesReader(writer, request.Body, 1048576) body, err := io.ReadAll(request.Body) if err != nil { http.Error(writer, err.Error(), http.StatusBadRequest) return } event, err := gitlab.ParseWebhook(gitlab.HookEventType(request), body) if err != nil { http.Error(writer, err.Error(), http.StatusBadRequest) return } jobEvent, ok := event.(*gitlab.JobEvent) if !ok { http.Error(writer, "Request is expected to be a Job event", http.StatusBadRequest) return } autoDeployPipeline := isAutoDeployPipeline(jobEvent.Commit.Name, project) if autoDeployPipeline { jobEventMetric.Inc(project, "auto_deploy") } else { jobEventMetric.Inc(project, "others") // Make sure we are only populating metrics using data from auto-deploy and downstream pipelines. return } if jobEvent.RetriesCount > 0 && jobEvent.BuildStatus == buildStatusCreated { jobRetriesMetric.Inc(project, jobEvent.BuildName) } // Job failure lost seconds if jobEvent.BuildStatus == buildStatusFailed { jobFailureLostSecondsMetric.Add(jobEvent.BuildDuration, project, jobEvent.BuildName) } if jobEvent.BuildStatus == buildStatusRunning && jobEvent.RetriesCount > 0 { createdAt, _ := time.Parse(time.RFC3339, jobEvent.BuildCreatedAt) finishedAt := jobFailureLastOccurrence(jobEvent.ProjectID, jobEvent.PipelineID, jobEvent.BuildName) if finishedAt == nil { return } secondsLost := createdAt.Sub(*finishedAt).Seconds() jobFailureLostSecondsMetric.Add(secondsLost, project, jobEvent.BuildName) } } func jobFailureLastOccurrence(ProjectID int, pipelineID int, jobName string) *time.Time { retried := true opts := gitlab.ListJobsOptions{ ListOptions: gitlab.ListOptions{ Page: 1, PerPage: 10, }, IncludeRetried: &retried, } // find the prevous run for { jobs, response, err := gitlabOpsJobsClient.ListPipelineJobs(ProjectID, pipelineID, &opts) if err != nil { log.Fatal(err) break } for _, job := range jobs { if job.Status == buildStatusFailed && job.Name == jobName { return job.FinishedAt } } if response.NextPage == 0 { break } opts.Page = response.NextPage } return nil } func (j *jobWebhook) checkToken(project string, token string) bool { expectedToken, ok := j.tokenMap[project] if ok && subtle.ConstantTimeCompare([]byte(token), []byte(expectedToken)) == 1 { return true } return false } // Returns true if the pipeline name is of an auto-deploy pipeline. func isAutoDeployPipeline(name, project string) bool { for _, regex := range autoDeployPipelineNameRegexes[project] { if regex.MatchString(name) { return true } } return false } func autoDeployProjects() []string { return []string{ ProjectDeployer, ProjectReleaseToolsOps, ProjectQualityStagingCanaryOps, ProjectQualityStagingOps, ProjectK8sWorkloadsGitlabComOps, ProjectCNGDev, ProjectOmnibusDev, } }