internal/processmetrics/pacemaker/pacemaker.go (96 lines of code) (raw):
/*
Copyright 2024 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package pacemaker is responsible for sending Linux cluster related metrics
// to Cloud Monitoring by interacting with the pacemaker command.
// - /sap/pacemaker
package pacemaker
import (
"context"
"strconv"
"github.com/cenkalti/backoff/v4"
"github.com/GoogleCloudPlatform/sapagent/internal/pacemaker"
"github.com/GoogleCloudPlatform/sapagent/internal/utils/protostruct"
"github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/cloudmonitoring"
"github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/log"
"github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/metricevents"
"github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/timeseries"
mrpb "google.golang.org/genproto/googleapis/monitoring/v3"
tspb "google.golang.org/protobuf/types/known/timestamppb"
cnfpb "github.com/GoogleCloudPlatform/sapagent/protos/configuration"
)
// PMCollector abstracts pacemaker metric collection so that the call to
// pacemaker.CollectPacemakerMetrics can be substituted with a fake in tests.
type PMCollector interface {
// CollectPacemakerMetrics returns a metric value together with a set of
// labels. Collect converts the value to int64 and attaches the labels to
// every time series it builds.
CollectPacemakerMetrics(ctx context.Context) (float64, map[string]string)
}
// Params has the necessary context to collect pacemaker metrics. It implements
// PMCollector by delegating to pacemaker.CollectPacemakerMetrics.
type Params struct {
// PCMParams is passed unchanged to pacemaker.CollectPacemakerMetrics.
PCMParams pacemaker.Parameters
}
// InstanceProperties holds the context needed for pacemaker metric collection.
type InstanceProperties struct {
// Config supplies the CloudProperties and BareMetal settings that are copied
// into every time series built by Collect.
Config *cnfpb.Configuration
// Client is not referenced by the methods in this file.
// NOTE(review): presumably used by callers to write the collected time
// series to Cloud Monitoring - confirm against the caller.
Client cloudmonitoring.TimeSeriesCreator
// Sids is the set of SIDs to report for; Collect emits one time series per
// key. A nil map makes Collect return without collecting anything.
Sids map[string]bool
// SkippedMetrics disables collection when it contains the pacemaker metric
// path as a key. Only key presence is checked, not the stored value.
SkippedMetrics map[string]bool
// PMBackoffPolicy is the backoff policy CollectWithRetry passes to
// backoff.Retry.
PMBackoffPolicy backoff.BackOffContext
// PacemakerCollector produces the metric value and labels used by Collect.
PacemakerCollector PMCollector
}
// pacemakerPath is the metric type set on every pacemaker time series and
// metric event. It is also the key looked up in SkippedMetrics to decide
// whether collection is disabled.
// TODO: Document this in public docs post launch.
const pacemakerPath = "workload.googleapis.com/sap/pacemaker"
// CollectPacemakerMetrics implements the PMCollector interface for Params by
// forwarding the call, along with the stored PCMParams, to
// pacemaker.CollectPacemakerMetrics and returning its results unchanged.
func (pm Params) CollectPacemakerMetrics(ctx context.Context) (float64, map[string]string) {
	value, labels := pacemaker.CollectPacemakerMetrics(ctx, pm.PCMParams)
	return value, labels
}
// Collect is a Pacemaker implementation of the Collector interface from
// processmetrics. It returns the value of current linux cluster related pacemaker
// metrics configured per sid as a metric list.
//
// Collection is skipped (a nil slice is returned) when the pacemaker metric
// path is present as a key in SkippedMetrics or when Sids is nil. Otherwise the
// collector is queried once and one time series, plus one metric event, is
// produced for every SID in Sids. The returned error is always nil.
func (p *InstanceProperties) Collect(ctx context.Context) ([]*mrpb.TimeSeries, error) {
	var metrics []*mrpb.TimeSeries
	if _, ok := p.SkippedMetrics[pacemakerPath]; ok {
		return metrics, nil
	}
	if p.Sids == nil {
		log.CtxLogger(ctx).Debug("Sids is nil, skipping pacemaker metric collection.")
		return metrics, nil
	}
	log.CtxLogger(ctx).Debug("Starting pacemaker metric collection.")

	pacemakerVal, labels := p.PacemakerCollector.CollectPacemakerMetrics(ctx)
	// The metric is reported as an integer; the conversion discards any
	// fractional part and is the same for every SID, so do it once.
	value := int64(pacemakerVal)

	for sid := range p.Sids {
		// Each time series gets its own label map: the SID first, then the
		// collector's labels (which win if they also define "sid").
		l := make(map[string]string, len(labels)+1)
		l["sid"] = sid
		for k, v := range labels {
			l[k] = v
		}
		params := timeseries.Params{
			// Generated proto getters are nil-safe, so a nil Config does not
			// cause a nil pointer dereference here.
			CloudProp:    protostruct.ConvertCloudPropertiesToStruct(p.Config.GetCloudProperties()),
			MetricType:   pacemakerPath,
			MetricLabels: l,
			Timestamp:    tspb.Now(),
			Int64Value:   value,
			BareMetal:    p.Config.GetBareMetal(),
		}
		metricevents.AddEvent(ctx, metricevents.Parameters{
			Path:       pacemakerPath,
			Message:    "Pacemaker Metrics",
			Value:      strconv.FormatInt(value, 10),
			Labels:     l,
			Identifier: sid,
		})
		metrics = append(metrics, timeseries.BuildInt(params))
	}
	log.CtxLogger(ctx).Debugw("Finished pacemaker metric collection.", "metrics", metrics)
	return metrics, nil
}
// CollectWithRetry decorates the Collect method with a retry mechanism driven
// by p.PMBackoffPolicy.
//
// Before every attempt the context is checked: if it is already done, the
// retry loop stops and no error is reported for that attempt. Otherwise Collect
// is invoked and a non-nil error from it triggers another attempt according to
// the backoff policy. The result of the last Collect call is returned together
// with the error reported by backoff.Retry, if any.
func (p *InstanceProperties) CollectWithRetry(ctx context.Context) ([]*mrpb.TimeSeries, error) {
	var (
		attempt = 1
		res     []*mrpb.TimeSeries
	)
	err := backoff.Retry(func() error {
		select {
		case <-ctx.Done():
			// Returning nil ends the retry loop without surfacing an error.
			log.CtxLogger(ctx).Debugw("Context cancelled, exiting CollectWithRetry")
			return nil
		default:
			var err error
			res, err = p.Collect(ctx)
			if err != nil {
				log.CtxLogger(ctx).Debugw("Error in Collection", "attempt", attempt, "error", err)
				attempt++
			}
			return err
		}
	}, p.PMBackoffPolicy)
	if err != nil {
		// Debugw (not Debug) so that "error" is logged as a structured field
		// instead of being concatenated into the message.
		log.CtxLogger(ctx).Debugw("Retry limit exceeded", "error", err)
	}
	return res, err
}