internal/processmetrics/sapservice/sapservice.go (112 lines of code) (raw):

/* Copyright 2022 Google LLC Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // Package sapservice is responsible for collecting metrics for SAP service // statuses using systemctl is-* cmd. package sapservice import ( "context" "fmt" "strconv" mrpb "google.golang.org/genproto/googleapis/monitoring/v3" tspb "google.golang.org/protobuf/types/known/timestamppb" "github.com/cenkalti/backoff/v4" "github.com/GoogleCloudPlatform/sapagent/internal/utils/protostruct" cnfpb "github.com/GoogleCloudPlatform/sapagent/protos/configuration" "github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/cloudmonitoring" "github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/commandlineexecutor" "github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/log" "github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/metricevents" "github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/timeseries" ) const ( metricURL = "workload.googleapis.com" failedMPath = "/sap/service/is_failed" disabledMPath = "/sap/service/is_disabled" ) var ( services = []string{"pacemaker", "corosync", "sapinit", "sapconf", "saptune"} mPathMap = map[string]string{"is-failed": failedMPath, "is-enabled": disabledMPath} ) type ( // InstanceProperties has the necessary context for Metrics collection. // InstanceProperties implements the Collector interface for sapservice. InstanceProperties struct { Config *cnfpb.Configuration Client cloudmonitoring.TimeSeriesCreator Execute commandlineexecutor.Execute ExitCode commandlineexecutor.ExitCode SkippedMetrics map[string]bool PMBackoffPolicy backoff.BackOffContext } ) // Collect is an implementation of Collector interface from processmetrics // responsible for collecting sap service statuses metric. func (p *InstanceProperties) Collect(ctx context.Context) ([]*mrpb.TimeSeries, error) { var metrics []*mrpb.TimeSeries if _, ok := mPathMap[failedMPath]; !ok { isFailedMetrics := queryInstanceState(ctx, p, "is-failed") metrics = append(metrics, isFailedMetrics...) } if _, ok := mPathMap[disabledMPath]; !ok { isDisabledMetrics := queryInstanceState(ctx, p, "is-enabled") metrics = append(metrics, isDisabledMetrics...) } return metrics, nil } // CollectWithRetry decorates the Collect method with retry mechanism. func (p *InstanceProperties) CollectWithRetry(ctx context.Context) ([]*mrpb.TimeSeries, error) { var ( attempt = 1 res []*mrpb.TimeSeries ) err := backoff.Retry(func() error { select { case <-ctx.Done(): log.CtxLogger(ctx).Debugw("Context cancelled, exiting CollectWithRetry") return nil default: var err error res, err = p.Collect(ctx) if err != nil { log.CtxLogger(ctx).Debugw("Error in Collection", "attempt", attempt, "error", err) attempt++ } return err } }, p.PMBackoffPolicy) if err != nil { log.CtxLogger(ctx).Debugw("Retry limit exceeded", "error", err) } return res, err } // queryInstanceState is responsible for collecting is_failed / is_enabled state of OS // services related to SAP and cluster services. // In case of `systemctl is_failed service` it returns 0 if there has been an error in starting the // service, metric will be sent only in case of an error. // // In case of `systemctl is-enabled service` it returns 0 if the specified service is enabled, // non-zero otherwise, metric will be sent only in case service is disabled. func queryInstanceState(ctx context.Context, p *InstanceProperties, metric string) []*mrpb.TimeSeries { var metrics []*mrpb.TimeSeries for _, service := range services { command := "systemctl" args := metric + " --quiet " + service result := p.Execute(ctx, commandlineexecutor.Params{ Executable: command, ArgsToSplit: args, }) sendMetric := int64(1) if metric == "is-failed" && result.ExitCode != 0 && result.ExitStatusParsed { log.CtxLogger(ctx).Debugw("No error while executing command, not sending is_failed metric", "command", command, "args", args) sendMetric = 0 } else if metric != "is-failed" && result.Error == nil { log.CtxLogger(ctx).Debugw("No error while executing command, not sending is_disabled metric", "command", command, "args", args) sendMetric = 0 } metricevents.AddEvent(ctx, metricevents.Parameters{ Path: metricURL + mPathMap[metric], Message: fmt.Sprintf("%s metric for service %s", metric, service), Value: strconv.FormatInt(sendMetric, 10), Labels: map[string]string{"service": service}, Identifier: service, }) if sendMetric == 0 { continue } log.CtxLogger(ctx).Debugw("Error while executing command", "command", command, "args", args, "stderr", result.StdErr) params := timeseries.Params{ CloudProp: protostruct.ConvertCloudPropertiesToStruct(p.Config.CloudProperties), MetricType: metricURL + mPathMap[metric], MetricLabels: map[string]string{"service": service}, Timestamp: tspb.Now(), Int64Value: 1, BareMetal: p.Config.BareMetal, } metrics = append(metrics, timeseries.BuildInt(params)) } return metrics }