pkg/hostmgr/metrics/metrics.go (122 lines of code) (raw):

// Copyright (c) 2019 Uber Technologies, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package metrics import ( "github.com/uber-go/tally" "github.com/uber/peloton/.gen/peloton/private/hostmgr/hostsvc" "github.com/uber/peloton/pkg/common/util" ) // Metrics is a placeholder for all metrics in hostmgr. type Metrics struct { LaunchTasks tally.Counter LaunchTasksFail tally.Counter LaunchTasksInvalid tally.Counter LaunchTasksInvalidOffers tally.Counter AcquireHostOffers tally.Counter AcquireHostOffersFail tally.Counter AcquireHostOffersInvalid tally.Counter AcquireHostOffersCount tally.Counter GetHosts tally.Counter GetHostsInvalid tally.Counter GetHostsCount tally.Counter KillTasks tally.Counter KillTasksFail tally.Counter ShutdownExecutors tally.Counter ShutdownExecutorsInvalid tally.Counter ShutdownExecutorsFail tally.Counter ReleaseHostOffers tally.Counter ReleaseHostOffersFail tally.Counter ReleaseHostsCount tally.Counter GetMesosMasterHostPort tally.Counter GetMesosMasterHostPortFail tally.Counter Elected tally.Gauge MesosConnected tally.Gauge HandlersRunning tally.Gauge ClusterCapacity tally.Counter ClusterCapacityFail tally.Counter RecoverySuccess tally.Counter RecoveryFail tally.Counter GetDrainingHosts tally.Counter GetDrainingHostsFail tally.Counter MarkHostDrained tally.Counter MarkHostDrainedFail tally.Counter WatchEventCancel tally.Counter WatchEventOverflow tally.Counter WatchCancelNotFound tally.Counter // Time takes to acquire lock in watch processor WatchProcessorLockDuration tally.Timer GetCqosAdvisorMetric tally.Counter GetCqosAdvisorMetricFail tally.Counter scope tally.Scope } // NewMetrics returns a new instance of hostmgr.Metrics. func NewMetrics(scope tally.Scope) *Metrics { serverScope := scope.SubScope("server") watchEventScope := scope.SubScope("watch") return &Metrics{ LaunchTasks: scope.Counter("launch_tasks"), LaunchTasksFail: scope.Counter("launch_tasks_fail"), LaunchTasksInvalid: scope.Counter("launch_tasks_invalid"), LaunchTasksInvalidOffers: scope.Counter("launch_tasks_invalid_offers"), AcquireHostOffers: scope.Counter("acquire_host_offers"), AcquireHostOffersFail: scope.Counter("acquire_host_offers_fail"), AcquireHostOffersInvalid: scope.Counter("acquire_host_offers_invalid"), AcquireHostOffersCount: scope.Counter("acquire_host_offers_count"), GetHosts: scope.Counter("get_hosts"), GetHostsInvalid: scope.Counter("get_hosts_invalid"), GetHostsCount: scope.Counter("get_hosts_count"), KillTasks: scope.Counter("kill_tasks"), KillTasksFail: scope.Counter("kill_tasks_fail"), ShutdownExecutors: scope.Counter("shutdown_executors"), ShutdownExecutorsInvalid: scope.Counter("shutdown_executors_invalid"), ShutdownExecutorsFail: scope.Counter("shutdown_executors_fail"), ReleaseHostOffers: scope.Counter("release_host_offers"), ReleaseHostOffersFail: scope.Counter("release_host_offers_fail"), ReleaseHostsCount: scope.Counter("release_hosts_count"), GetMesosMasterHostPort: scope.Counter("get_mesos_master_host_port"), GetMesosMasterHostPortFail: scope.Counter("get_mesos_master_host_port_fail"), Elected: serverScope.Gauge("elected"), MesosConnected: serverScope.Gauge("mesos_connected"), HandlersRunning: serverScope.Gauge("handlers_running"), ClusterCapacity: scope.Counter("cluster_capacity"), ClusterCapacityFail: scope.Counter("cluster_capacity_fail"), RecoverySuccess: scope.Counter("recovery_success"), RecoveryFail: scope.Counter("recovery_fail"), GetDrainingHosts: scope.Counter("get_draining_hosts"), GetDrainingHostsFail: scope.Counter("get_draining_hosts_fail"), MarkHostDrained: scope.Counter("mark_host_drained"), MarkHostDrainedFail: scope.Counter("mark_host_drained_fail"), WatchEventCancel: watchEventScope.Counter("watch_event_cancel"), WatchEventOverflow: watchEventScope.Counter("watch_event_overflow"), WatchCancelNotFound: watchEventScope.Counter("watch_cancel_not_found"), WatchProcessorLockDuration: watchEventScope.Timer("watch_processor_lock_duration"), GetCqosAdvisorMetric: scope.Counter("get_cqos_advisor_metric"), GetCqosAdvisorMetricFail: scope.Counter("get_cqos_advisor_metric_fail"), scope: scope, } } // RefreshClusterCapacityGauges refreshes all the cluster capacity gauges func (m *Metrics) RefreshClusterCapacityGauges(response *hostsvc.ClusterCapacityResponse) { // update metrics for total cluster capacity for _, resource := range response.GetPhysicalResources() { if len(resource.GetKind()) == 0 || resource.GetCapacity() < util.ResourceEpsilon { continue } gauge := m.scope.Gauge("cluster_capacity_" + resource.GetKind()) gauge.Update(resource.GetCapacity()) } for _, resource := range response.GetPhysicalSlackResources() { if len(resource.GetKind()) == 0 || resource.GetCapacity() < util.ResourceEpsilon { continue } gauge := m.scope.Gauge("cluster_capacity_revocable_" + resource.GetKind()) gauge.Update(resource.GetCapacity()) } // update metrics for resources allocated tasks launched by Peloton for _, resource := range response.GetResources() { if len(resource.GetKind()) == 0 || resource.GetCapacity() < util.ResourceEpsilon { continue } gauge := m.scope.Gauge("mesos_task_allocation_" + resource.GetKind()) gauge.Update(resource.GetCapacity()) } for _, resource := range response.GetAllocatedSlackResources() { if len(resource.GetKind()) == 0 || resource.GetCapacity() < util.ResourceEpsilon { continue } gauge := m.scope.Gauge("mesos_task_allocation_revocable_" + resource.GetKind()) gauge.Update(resource.GetCapacity()) } }