metrics/internal/handlers/job_webhook.go (221 lines of code) (raw):
package handlers
import (
"crypto/subtle"
"io"
"log"
"net/http"
"os"
"regexp"
"time"
"github.com/gorilla/mux"
gitlab "gitlab.com/gitlab-org/api/client-go"
"gitlab.com/gitlab-org/release-tools/metrics/internal/metrics"
"gitlab.com/gitlab-org/release-tools/metrics/internal/metrics/labels"
)
// subsystem is the metrics subsystem shared by every counter declared in this
// file.
const subsystem string = "webhooks"

// Job statuses that the handlers compare against the status carried by job
// events and by jobs returned from the pipeline jobs listing.
const (
	buildStatusCreated = "created"
	buildStatusFailed  = "failed"
	buildStatusRunning = "running"
)

// Project identifiers. They are used as the "project" metric label value, as
// keys of the webhook token map and as keys of autoDeployPipelineNameRegexes.
const (
	ProjectDeployer                 = "ops-deployer"
	ProjectReleaseToolsOps          = "ops-release-tools"
	ProjectQualityStagingCanaryOps  = "ops-quality-staging-canary"
	ProjectQualityStagingOps        = "ops-quality-staging"
	ProjectK8sWorkloadsGitlabComOps = "ops-k8s-workloads-gitlab-com"
	ProjectCNGDev                   = "dev-charts-components-images"
	ProjectOmnibusDev               = "dev-omnibus"
)
// Counters updated by the job webhook handler. All three are assigned in init.
var (
	// jobEventMetric counts received job events per project and pipeline type.
	jobEventMetric metrics.Counter
	// jobRetriesMetric counts retried jobs per project and job name.
	jobRetriesMetric metrics.Counter
	// jobFailureLostSecondsMetric accumulates seconds attributed to failed
	// jobs per project and job name.
	jobFailureLostSecondsMetric metrics.Counter
)
var autoDeployPipelineNameRegexes = map[string][]*regexp.Regexp{
ProjectReleaseToolsOps: {regexp.MustCompile("^Coordinator pipeline$")},
ProjectDeployer: {regexp.MustCompile("^auto-deploy: .*")},
ProjectQualityStagingCanaryOps: {regexp.MustCompile("^Deployment QA pipeline - ")},
ProjectQualityStagingOps: {regexp.MustCompile("^Deployment QA pipeline - ")},
ProjectK8sWorkloadsGitlabComOps: {regexp.MustCompile("^auto-deploy: ")},
ProjectCNGDev: {regexp.MustCompile("^AUTO_DEPLOY_BUILD_PIPELINE$")},
ProjectOmnibusDev: {regexp.MustCompile("^AUTO_DEPLOY_BUILD_PIPELINE$")},
}
// gitlabOpsJobsClient is the client used by jobFailureLastOccurrence to list
// the jobs of a pipeline. It is assigned in init from newClient, using the
// DELIVERY_METRICS_OPS_TOKEN environment variable.
var gitlabOpsJobsClient jobsClient
// init creates the three job counters and the jobs client used by the webhook
// handlers. It panics if any counter cannot be created or if newClient returns
// an error.
func init() {
	var err error

	if jobEventMetric, err = initJobEventsTotalMetric(); err != nil {
		panic(err)
	}

	if jobRetriesMetric, err = initJobRetriesMetric(); err != nil {
		panic(err)
	}

	if jobFailureLostSecondsMetric, err = initJobFailureLostSecondsMetric(); err != nil {
		panic(err)
	}

	// The token is read from the environment once, at package initialisation.
	if gitlabOpsJobsClient, err = newClient(os.Getenv("DELIVERY_METRICS_OPS_TOKEN")); err != nil {
		panic(err)
	}
}
// jobWebhook serves the job webhook routes registered by PlugRoutes.
type jobWebhook struct {
	// tokenMap holds, per project identifier, the value expected in the
	// X-Gitlab-Token request header; checkToken looks tokens up by project.
	tokenMap map[string]string
}
// NewJobWebhook returns a Pluggable that serves the job webhook routes.
// tokenMap maps each project identifier to the token its webhook requests must
// present; the map is stored as-is, not copied.
func NewJobWebhook(tokenMap map[string]string) Pluggable {
	hook := new(jobWebhook)
	hook.tokenMap = tokenMap
	return hook
}
// initJobEventsTotalMetric builds the "job_events_total" counter in the
// webhooks subsystem, labelled by project and by pipeline_type ("auto_deploy"
// or "others").
func initJobEventsTotalMetric() (metrics.Counter, error) {
	projectLabel := labels.FromValues("project", autoDeployProjects())
	pipelineTypeLabel := labels.FromValues("pipeline_type", []string{"auto_deploy", "others"})

	// NOTE(review): WithCartesianProductLabelReset presumably initialises every
	// project/pipeline_type combination — confirm against the metrics package.
	return metrics.NewCounterVec(
		metrics.WithName("job_events_total"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Total number of received job events"),
		metrics.WithLabel(projectLabel),
		metrics.WithLabel(pipelineTypeLabel),
		metrics.WithCartesianProductLabelReset(),
	)
}
// initJobRetriesMetric builds the "auto_deploy_job_retries" counter in the
// webhooks subsystem, labelled by project and by an unconstrained job_name.
func initJobRetriesMetric() (metrics.Counter, error) {
	projectLabel := labels.FromValues("project", autoDeployProjects())
	jobNameLabel := labels.AnyString("job_name")

	return metrics.NewCounterVec(
		metrics.WithName("auto_deploy_job_retries"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Total number of job retries"),
		metrics.WithLabel(projectLabel),
		metrics.WithLabel(jobNameLabel),
	)
}
// initJobFailureLostSecondsMetric builds the
// "auto_deploy_job_failure_lost_seconds" counter in the webhooks subsystem,
// labelled by project and by an unconstrained job_name.
func initJobFailureLostSecondsMetric() (metrics.Counter, error) {
	projectLabel := labels.FromValues("project", autoDeployProjects())
	jobNameLabel := labels.AnyString("job_name")

	return metrics.NewCounterVec(
		metrics.WithName("auto_deploy_job_failure_lost_seconds"),
		metrics.WithSubsystem(subsystem),
		metrics.WithHelp("Number of seconds lost due to flaky job failures that succeed on retry"),
		metrics.WithLabel(projectLabel),
		metrics.WithLabel(jobNameLabel),
	)
}
// PlugRoutes registers one job webhook endpoint per tracked project on a
// "/job" sub-router of r.
func (j *jobWebhook) PlugRoutes(r *mux.Router) {
	jobRouter := r.PathPrefix("/job").Subrouter()

	// Endpoints are registered in the order listed here.
	endpoints := []struct {
		path    string
		handler func(http.ResponseWriter, *http.Request)
	}{
		{"/ops/gitlab-org/release/tools", j.releaseToolsJobWebhook},
		{"/ops/gitlab-com/gl-infra/deployer", j.deployerJobWebhook},
		{"/ops/gitlab-org/quality/staging-canary", j.qualityStagingCanaryJobWebhook},
		{"/ops/gitlab-org/quality/staging", j.qualityStagingJobWebhook},
		{"/ops/gitlab-com/gl-infra/k8s-workloads/gitlab-com", j.K8sWorkloadsGitlabComJobWebhook},
		{"/dev/gitlab/charts/components/images", j.CNGJobWebhook},
		{"/dev/gitlab/omnibus-gitlab", j.OmnibusJobWebhook},
	}
	for _, endpoint := range endpoints {
		jobRouter.HandleFunc(endpoint.path, endpoint.handler)
	}
}
// releaseToolsJobWebhook handles job events for ProjectReleaseToolsOps by
// delegating to jobWebhook.
func (j *jobWebhook) releaseToolsJobWebhook(w http.ResponseWriter, r *http.Request) {
	j.jobWebhook(ProjectReleaseToolsOps, w, r)
}
// deployerJobWebhook handles job events for ProjectDeployer by delegating to
// jobWebhook.
func (j *jobWebhook) deployerJobWebhook(w http.ResponseWriter, r *http.Request) {
	j.jobWebhook(ProjectDeployer, w, r)
}
// qualityStagingCanaryJobWebhook handles job events for
// ProjectQualityStagingCanaryOps by delegating to jobWebhook.
func (j *jobWebhook) qualityStagingCanaryJobWebhook(w http.ResponseWriter, r *http.Request) {
	j.jobWebhook(ProjectQualityStagingCanaryOps, w, r)
}
// qualityStagingJobWebhook handles job events for ProjectQualityStagingOps by
// delegating to jobWebhook.
func (j *jobWebhook) qualityStagingJobWebhook(w http.ResponseWriter, r *http.Request) {
	j.jobWebhook(ProjectQualityStagingOps, w, r)
}
// K8sWorkloadsGitlabComJobWebhook handles job events for
// ProjectK8sWorkloadsGitlabComOps by delegating to jobWebhook.
func (j *jobWebhook) K8sWorkloadsGitlabComJobWebhook(w http.ResponseWriter, r *http.Request) {
	j.jobWebhook(ProjectK8sWorkloadsGitlabComOps, w, r)
}
// CNGJobWebhook handles job events for ProjectCNGDev by delegating to
// jobWebhook.
func (j *jobWebhook) CNGJobWebhook(w http.ResponseWriter, r *http.Request) {
	j.jobWebhook(ProjectCNGDev, w, r)
}
// OmnibusJobWebhook handles job events for ProjectOmnibusDev by delegating to
// jobWebhook.
func (j *jobWebhook) OmnibusJobWebhook(w http.ResponseWriter, r *http.Request) {
	j.jobWebhook(ProjectOmnibusDev, w, r)
}
// jobWebhook authenticates and processes a GitLab job webhook request for the
// given project and updates the job counters.
//
// The request is rejected with 401 when the X-Gitlab-Token header does not
// match the project's token, and with 400 when the body cannot be read (at
// most 1 MB is accepted), cannot be parsed as a webhook, or is not a job event.
//
// Every accepted event increments jobEventMetric. Only events whose pipeline
// name matches an auto-deploy pattern for the project go on to update the
// retry and lost-seconds counters:
//   - a "created" event with a non-zero retry count increments jobRetriesMetric;
//   - a "failed" event adds the build duration to jobFailureLostSecondsMetric;
//   - a "running" event with a non-zero retry count adds the time between the
//     failed run's finish time and this run's creation time.
func (j *jobWebhook) jobWebhook(project string, writer http.ResponseWriter, request *http.Request) {
	token := request.Header.Get("X-Gitlab-Token")
	if !j.checkToken(project, token) {
		http.Error(writer, "Invalid token", http.StatusUnauthorized)
		return
	}

	// Read a max of 1 MB
	request.Body = http.MaxBytesReader(writer, request.Body, 1048576)
	body, err := io.ReadAll(request.Body)
	if err != nil {
		http.Error(writer, err.Error(), http.StatusBadRequest)
		return
	}

	event, err := gitlab.ParseWebhook(gitlab.HookEventType(request), body)
	if err != nil {
		http.Error(writer, err.Error(), http.StatusBadRequest)
		return
	}

	jobEvent, ok := event.(*gitlab.JobEvent)
	if !ok {
		http.Error(writer, "Request is expected to be a Job event", http.StatusBadRequest)
		return
	}

	// Make sure we are only populating metrics using data from auto-deploy and
	// downstream pipelines.
	if !isAutoDeployPipeline(jobEvent.Commit.Name, project) {
		jobEventMetric.Inc(project, "others")
		return
	}
	jobEventMetric.Inc(project, "auto_deploy")

	if jobEvent.RetriesCount > 0 && jobEvent.BuildStatus == buildStatusCreated {
		jobRetriesMetric.Inc(project, jobEvent.BuildName)
	}

	// Job failure lost seconds
	if jobEvent.BuildStatus == buildStatusFailed {
		jobFailureLostSecondsMetric.Add(jobEvent.BuildDuration, project, jobEvent.BuildName)
	}

	if jobEvent.BuildStatus == buildStatusRunning && jobEvent.RetriesCount > 0 {
		// A creation time that cannot be parsed must not be used: time.Parse
		// returns the zero time on error, which would produce a huge negative
		// duration below.
		createdAt, parseErr := time.Parse(time.RFC3339, jobEvent.BuildCreatedAt)
		if parseErr != nil {
			log.Printf("job webhook: parsing build_created_at %q of job %q in project %s: %v",
				jobEvent.BuildCreatedAt, jobEvent.BuildName, project, parseErr)
			return
		}

		finishedAt := jobFailureLastOccurrence(jobEvent.ProjectID, jobEvent.PipelineID, jobEvent.BuildName)
		if finishedAt == nil {
			return
		}

		secondsLost := createdAt.Sub(*finishedAt).Seconds()
		// NOTE(review): the counter is presumably backed by a Prometheus
		// counter, which rejects negative increments — confirm. A negative gap
		// carries no "lost time" either way, so it is skipped.
		if secondsLost < 0 {
			log.Printf("job webhook: negative lost time (%.0fs) for job %q in project %s, skipping",
				secondsLost, jobEvent.BuildName, project)
			return
		}
		jobFailureLostSecondsMetric.Add(secondsLost, project, jobEvent.BuildName)
	}
}
// jobFailureLastOccurrence returns the finish time of the first failed job
// named jobName found in the listing of the given pipeline, including retried
// jobs. The listing is fetched page by page, 10 jobs per page.
//
// It returns nil when no such job is found or when the listing request fails.
// A failed request is logged rather than treated as fatal: this runs while
// serving a webhook, so an API error must not terminate the process.
func jobFailureLastOccurrence(projectID int, pipelineID int, jobName string) *time.Time {
	includeRetried := true
	opts := gitlab.ListJobsOptions{
		ListOptions: gitlab.ListOptions{
			Page:    1,
			PerPage: 10,
		},
		IncludeRetried: &includeRetried,
	}

	// find the previous run
	for {
		jobs, response, err := gitlabOpsJobsClient.ListPipelineJobs(projectID, pipelineID, &opts)
		if err != nil {
			log.Printf("listing jobs of pipeline %d in project %d: %v", pipelineID, projectID, err)
			return nil
		}

		for _, job := range jobs {
			if job.Status == buildStatusFailed && job.Name == jobName {
				return job.FinishedAt
			}
		}

		// NextPage is 0 once the last page has been read.
		if response == nil || response.NextPage == 0 {
			return nil
		}
		opts.Page = response.NextPage
	}
}
// checkToken reports whether token is the token configured for project.
//
// It returns false when the project has no entry in the token map or when the
// configured token is empty: otherwise a request that omits the token header
// (empty token) would match an empty configured token and be accepted. The
// comparison itself is done in constant time.
func (j *jobWebhook) checkToken(project string, token string) bool {
	expectedToken, ok := j.tokenMap[project]
	if !ok || expectedToken == "" {
		return false
	}
	return subtle.ConstantTimeCompare([]byte(token), []byte(expectedToken)) == 1
}
// isAutoDeployPipeline reports whether name matches at least one of the
// auto-deploy pipeline name patterns registered for project. Projects with no
// registered patterns never match.
func isAutoDeployPipeline(name, project string) bool {
	patterns := autoDeployPipelineNameRegexes[project]
	for i := range patterns {
		if patterns[i].MatchString(name) {
			return true
		}
	}
	return false
}
// autoDeployProjects returns the identifiers of every project tracked by the
// job webhook metrics. A new slice is returned on each call.
func autoDeployProjects() []string {
	projects := [...]string{
		ProjectDeployer,
		ProjectReleaseToolsOps,
		ProjectQualityStagingCanaryOps,
		ProjectQualityStagingOps,
		ProjectK8sWorkloadsGitlabComOps,
		ProjectCNGDev,
		ProjectOmnibusDev,
	}
	return projects[:]
}