internal/onetime/status/status.go (662 lines of code) (raw):
/*
Copyright 2024 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package status implements the status subcommand to provide information
// on the agent, configuration, IAM and functional statuses.
package status
import (
"context"
_ "embed"
"fmt"
"io/fs"
"io/ioutil"
"net/http"
"os"
"runtime"
"slices"
"sort"
"strings"
"time"
"flag"
"cloud.google.com/go/artifactregistry/apiv1"
store "cloud.google.com/go/storage"
"github.com/google/subcommands"
backintconfiguration "github.com/GoogleCloudPlatform/sapagent/internal/backint/configuration"
"github.com/GoogleCloudPlatform/sapagent/internal/configuration"
"github.com/GoogleCloudPlatform/sapagent/internal/databaseconnector"
"github.com/GoogleCloudPlatform/sapagent/internal/heartbeat"
"github.com/GoogleCloudPlatform/sapagent/internal/iam"
"github.com/GoogleCloudPlatform/sapagent/internal/onetime"
"github.com/GoogleCloudPlatform/sapagent/internal/onetime/supportbundle"
"github.com/GoogleCloudPlatform/sapagent/internal/usagemetrics"
cpb "github.com/GoogleCloudPlatform/sapagent/protos/configuration"
iipb "github.com/GoogleCloudPlatform/sapagent/protos/instanceinfo"
"github.com/GoogleCloudPlatform/sapagent/shared/iam"
"github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/commandlineexecutor"
"github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/log"
"github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/recovery"
"github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/statushelper"
"github.com/GoogleCloudPlatform/workloadagentplatform/sharedlibraries/storage"
spb "github.com/GoogleCloudPlatform/workloadagentplatform/sharedprotos/status"
)
const (
agentPackageName = "google-cloud-sap-agent"
projectName = "sap-core-eng-products"
repositoryLocation = "us"
repositoryName = "google-cloud-sap-agent-sles15-x86-64"
fetchLatestVersionError = "Error: could not fetch latest version"
// TODO: Implement status OTE check for WIF based authentications access to scopes.
requiredScope = "https://www.googleapis.com/auth/cloud-platform"
hostMetricsEndpoint = "http://localhost:18181"
// Below outline the names of the services as defined in the iam permissions file.
hanaMonitoringLabel = "HANA_MONITORING"
processMetricsLabel = "PROCESS_METRICS"
cloudLoggingLabel = "CLOUD_LOGGING"
hostMetricsLabel = "HOST_METRICS"
backintLabel = "BACKINT"
backintMultipartLabel = "BACKINT_MULTIPART"
diskBackupLabel = "DISKBACKUP"
diskBackupStripedLabel = "DISKBACKUP_STRIPED"
systemDiscoveryLabel = "SAP_SYSTEM_DISCOVERY"
workloadEvaluationMELabel = "WORKLOAD_EVALUATION_METRICS"
secretManagerLabel = "SECRET_MANAGER"
agentMetricsLabel = "AGENT_HEALTH_METRICS"
)
var (
dailyUsageRoutine *recovery.RecoverableRoutine
collectRoutine *recovery.RecoverableRoutine
)
type (
// IAMService is an interface for the IAM service.
IAMService interface {
CheckIAMPermissionsOnBucket(ctx context.Context, bucketName string, permissions []string) ([]string, error)
CheckIAMPermissionsOnDisk(ctx context.Context, project, zone, diskName string, permissions []string) ([]string, error)
CheckIAMPermissionsOnInstance(ctx context.Context, project, zone, instanceName string, permissions []string) ([]string, error)
CheckIAMPermissionsOnProject(ctx context.Context, projectID string, permissions []string) ([]string, error)
CheckIAMPermissionsOnSecret(ctx context.Context, projectID, secretID string, permissions []string) ([]string, error)
}
httpGetter func(url string) (resp *http.Response, err error)
statFunc func(name string) (os.FileInfo, error)
readDirFunc func(dirname string) ([]fs.FileInfo, error)
)
// Status stores the status subcommand parameters.
type Status struct {
ConfigFilePath string
BackintParametersPath string
CloudProps *iipb.CloudProperties
HeartbeatSpec *heartbeat.Spec
compact bool
help bool
logLevel, logPath string
config *cpb.Configuration
gceService onetime.GCEInterface
iamService IAMService
arClient statushelper.ARClientInterface
oteLogger *onetime.OTELogger
readFile configuration.ReadConfigFile
backintReadFile backintconfiguration.ReadConfigFile
exec commandlineexecutor.Execute
backintClient storage.Client
permissionsStatus permissions.FetchStatusFunc
httpGet httpGetter
createDBHandle databaseconnector.DBHandleFunc
stat statFunc
readDir readDirFunc
}
// Name implements the subcommand interface for status.
func (*Status) Name() string { return "status" }
// Synopsis implements the subcommand interface for status.
func (*Status) Synopsis() string { return "get the status of the agent and its services" }
// Usage implements the subcommand interface for status.
func (*Status) Usage() string {
return `status [-config <path-to-agent-config-file>]
[-backint <path-to-backint-parameters-file>] [-compact]
Get the status of the agent and its services.
`
}
// SetFlags implements the subcommand interface for status.
func (s *Status) SetFlags(fs *flag.FlagSet) {
fs.StringVar(&s.ConfigFilePath, "config", "", "Configuration path override")
fs.StringVar(&s.ConfigFilePath, "c", "", "Configuration path override")
fs.StringVar(&s.BackintParametersPath, "backint", "", "Backint parameters path")
fs.StringVar(&s.BackintParametersPath, "b", "", "Backint parameters path")
fs.BoolVar(&s.compact, "compact", false, "Display a compact status (no configuration or IAM)")
fs.BoolVar(&s.help, "h", false, "Display help")
}
// Execute implements the subcommand interface for status.
func (s *Status) Execute(ctx context.Context, f *flag.FlagSet, args ...any) subcommands.ExitStatus {
// Initialize logging and cloud properties.
lp, cp, exitStatus, completed := onetime.Init(ctx, onetime.InitOptions{
Name: s.Name(),
Help: s.help,
LogLevel: s.logLevel,
LogPath: s.logPath,
Fs: f,
}, args...)
if !completed {
return exitStatus
}
s.CloudProps = cp
// Run the status checks.
agentStatus, exitStatus := s.Run(ctx, onetime.CreateRunOptions(cp, false))
if exitStatus == subcommands.ExitFailure {
// Collect support bundle if there's an error.
supportbundle.CollectAgentSupport(ctx, f, lp, cp, s.Name())
}
log.CtxLogger(ctx).Infow("Agent Status", "status", agentStatus)
statushelper.PrintStatus(ctx, agentStatus, s.compact)
log.CtxLogger(ctx).Info("Status finished")
return exitStatus
}
// Init initializes status parameters and creates clients.
func (s *Status) Init(ctx context.Context) error {
var err error
s.readFile = os.ReadFile
s.backintReadFile = os.ReadFile
s.exec = commandlineexecutor.ExecuteCommand
s.backintClient = store.NewClient
s.stat = os.Stat
s.readDir = ioutil.ReadDir
s.permissionsStatus = permissions.GetServicePermissionsStatus
s.httpGet = http.Get
s.createDBHandle = databaseconnector.CreateDBHandle
s.iamService, err = iam.NewIAMClient(ctx)
if err != nil {
log.CtxLogger(ctx).Errorw("Could not create IAM client", "error", err)
return err
}
arClient, err := artifactregistry.NewClient(ctx)
if err != nil {
log.CtxLogger(ctx).Errorw("Could not create artifact registry client", "error", err)
return err
}
s.arClient = &statushelper.ArtifactRegistryClient{Client: arClient}
return nil
}
// Run executes the command and returns the status.
func (s *Status) Run(ctx context.Context, opts *onetime.RunOptions) (*spb.AgentStatus, subcommands.ExitStatus) {
s.oteLogger = onetime.CreateOTELogger(opts.DaemonMode)
if err := s.Init(ctx); err != nil {
return nil, subcommands.ExitFailure
}
status, err := s.statusHandler(ctx)
if err != nil {
log.CtxLogger(ctx).Errorw("Could not get agent status", "error", err)
return nil, subcommands.ExitFailure
}
return status, subcommands.ExitSuccess
}
// StartStatusCollection continuously sends status to data warehouse.
// Returns true if the collection goroutine is started, and false otherwise.
func (s *Status) StartStatusCollection(ctx context.Context) bool {
dailyUsageRoutine = &recovery.RecoverableRoutine{
Routine: func(context.Context, any) { usagemetrics.LogActionDaily(usagemetrics.CollectStatus) },
RoutineArg: nil,
ErrorCode: usagemetrics.UsageMetricsDailyLogError,
UsageLogger: *usagemetrics.Logger,
ExpectedMinDuration: 24 * time.Hour,
}
dailyUsageRoutine.StartRoutine(ctx)
collectRoutine = &recovery.RecoverableRoutine{
Routine: start,
RoutineArg: s,
ErrorCode: usagemetrics.StatusCollectionFailure,
UsageLogger: *usagemetrics.Logger,
ExpectedMinDuration: 60 * time.Second,
}
collectRoutine.StartRoutine(ctx)
return true
}
func start(ctx context.Context, a any) {
var s *Status
if v, ok := a.(*Status); ok {
s = v
} else {
log.CtxLogger(ctx).Error("Cannot collect status, no collection configuration detected")
return
}
statusTicker := time.NewTicker(time.Duration(60) * time.Second)
defer statusTicker.Stop()
heartbeatTicker := s.HeartbeatSpec.CreateTicker()
defer heartbeatTicker.Stop()
// Do not wait for the first tick and start status collection immediately.
select {
case <-ctx.Done():
log.CtxLogger(ctx).Debug("Status cancellation requested")
return
default:
s.collectAndSendStatus(ctx)
}
for {
select {
case <-ctx.Done():
log.CtxLogger(ctx).Debug("Status cancellation requested")
return
case <-heartbeatTicker.C:
s.HeartbeatSpec.Beat()
case <-statusTicker.C:
s.collectAndSendStatus(ctx)
}
}
}
func (s *Status) collectAndSendStatus(ctx context.Context) error {
s.HeartbeatSpec.Beat()
agentStatus, err := s.statusHandler(ctx)
if err != nil {
log.CtxLogger(ctx).Errorw("Could not get agent status", "error", err)
return err
}
// TODO: Send status to data warehouse.
log.CtxLogger(ctx).Debugw("Agent Status", "status", agentStatus)
return nil
}
// statusHandler executes the status checks and returns the results as the AgentStatus proto.
func (s *Status) statusHandler(ctx context.Context) (*spb.AgentStatus, error) {
if s.permissionsStatus == nil || s.httpGet == nil || s.createDBHandle == nil {
return nil, fmt.Errorf("status struct has not been initialized")
}
log.CtxLogger(ctx).Info("Status starting")
agentStatus, config := s.agentStatus(ctx)
agentStatus.Services = append(agentStatus.Services, s.hostMetricsStatus(ctx, config))
agentStatus.Services = append(agentStatus.Services, s.processMetricsStatus(ctx, config))
agentStatus.Services = append(agentStatus.Services, s.hanaMonitoringMetricsStatus(ctx, config))
agentStatus.Services = append(agentStatus.Services, s.systemDiscoveryStatus(ctx, config))
agentStatus.Services = append(agentStatus.Services, s.backintStatus(ctx))
agentStatus.Services = append(agentStatus.Services, s.diskSnapshotStatus(ctx, config))
agentStatus.Services = append(agentStatus.Services, s.workloadManagerStatus(ctx, config))
agentStatus.References = append(agentStatus.References, &spb.Reference{
Name: "Release notes",
Url: "https://cloud.google.com/solutions/sap/docs/agent-for-sap/whats-new",
})
agentStatus.References = append(agentStatus.References, &spb.Reference{
Name: "Guides",
Url: "https://cloud.google.com/solutions/sap/docs/agent-for-sap/latest/all-guides",
})
return agentStatus, nil
}
// agentStatus returns the agent version, enabled/running, config path, and the
// configuration as parsed by the agent.
func (s *Status) agentStatus(ctx context.Context) (*spb.AgentStatus, *cpb.Configuration) {
agentStatus := &spb.AgentStatus{
AgentName: agentPackageName,
InstalledVersion: fmt.Sprintf("%s-%s", configuration.AgentVersion, configuration.AgentBuildChange),
}
var err error
agentStatus.AvailableVersion, err = statushelper.LatestVersionArtifactRegistry(ctx, s.arClient, projectName, repositoryLocation, repositoryName, agentPackageName)
if err != nil {
log.CtxLogger(ctx).Errorw("Could not fetch latest version", "error", err)
agentStatus.AvailableVersion = fetchLatestVersionError
}
switch {
case s.CloudProps == nil:
log.CtxLogger(ctx).Errorw("Could not fetch scopes", "error", err)
agentStatus.CloudApiAccessFullScopesGranted = spb.State_ERROR_STATE
case slices.Contains(s.CloudProps.GetScopes(), requiredScope):
agentStatus.CloudApiAccessFullScopesGranted = spb.State_SUCCESS_STATE
default:
agentStatus.CloudApiAccessFullScopesGranted = spb.State_FAILURE_STATE
}
enabled, running, err := statushelper.CheckAgentEnabledAndRunning(ctx, agentPackageName, runtime.GOOS, s.exec)
agentStatus.SystemdServiceEnabled = spb.State_FAILURE_STATE
agentStatus.SystemdServiceRunning = spb.State_FAILURE_STATE
if err != nil {
log.CtxLogger(ctx).Errorw("Could not check agent enabled and running", "error", err)
agentStatus.SystemdServiceEnabled = spb.State_ERROR_STATE
agentStatus.SystemdServiceRunning = spb.State_ERROR_STATE
} else {
if enabled {
agentStatus.SystemdServiceEnabled = spb.State_SUCCESS_STATE
}
if running {
agentStatus.SystemdServiceRunning = spb.State_SUCCESS_STATE
}
}
path := s.ConfigFilePath
if len(path) == 0 {
switch runtime.GOOS {
case "linux":
path = configuration.LinuxConfigPath
case "windows":
path = configuration.WindowsConfigPath
}
}
agentStatus.ConfigurationFilePath = path
config, err := configuration.Read(path, s.readFile)
agentStatus.ConfigurationValid = spb.State_SUCCESS_STATE
if err != nil {
agentStatus.ConfigurationValid = spb.State_FAILURE_STATE
agentStatus.ConfigurationErrorMessage = err.Error()
}
config = configuration.ApplyDefaults(config, s.CloudProps)
agentStatus.KernelVersion, err = statushelper.KernelVersion(ctx, runtime.GOOS, s.exec)
if err != nil && runtime.GOOS == "linux" {
log.CtxLogger(ctx).Errorw("Could not fetch kernel version", "error", err)
}
return agentStatus, config
}
func (s *Status) hostMetricsStatus(ctx context.Context, config *cpb.Configuration) *spb.ServiceStatus {
status := &spb.ServiceStatus{
Name: "Host Metrics",
State: spb.State_UNSPECIFIED_STATE,
ConfigValues: []*spb.ConfigValue{
configValue("provide_sap_host_agent_metrics", config.GetProvideSapHostAgentMetrics().GetValue(), true),
},
}
if !config.GetProvideSapHostAgentMetrics().GetValue() {
status.State = spb.State_FAILURE_STATE
return status
}
status.State = spb.State_SUCCESS_STATE
permissionsStatus, allGranted, err := s.fetchPermissionsStatus(ctx, hostMetricsLabel, &permissions.ResourceDetails{
ProjectID: s.CloudProps.GetProjectId(),
InstanceName: s.CloudProps.GetInstanceId(),
Zone: s.CloudProps.GetZone(),
})
if err != nil {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Error checking IAM permissions: %v", err.Error()), spb.State_ERROR_STATE)
}
status.IamPermissions = permissionsStatus
if !allGranted {
return logCheckFailureAndReturnStatus(ctx, status, "IAM permissions not granted", spb.State_FAILURE_STATE)
}
resp, err := s.httpGet(hostMetricsEndpoint)
if err != nil {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Error verifying endpoint: %v", err.Error()), spb.State_ERROR_STATE)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Endpoint verification failed with code: %d", resp.StatusCode), spb.State_FAILURE_STATE)
}
status.FullyFunctional = spb.State_SUCCESS_STATE
return status
}
func (s *Status) processMetricsStatus(ctx context.Context, config *cpb.Configuration) *spb.ServiceStatus {
status := &spb.ServiceStatus{
Name: "Process Metrics",
State: spb.State_UNSPECIFIED_STATE,
ConfigValues: []*spb.ConfigValue{
configValue("collect_process_metrics", config.GetCollectionConfiguration().GetCollectProcessMetrics(), false),
configValue("process_metrics_frequency", config.GetCollectionConfiguration().GetProcessMetricsFrequency(), 5),
configValue("process_metrics_to_skip", config.GetCollectionConfiguration().GetProcessMetricsToSkip(), []string{}),
configValue("slow_process_metrics_frequency", config.GetCollectionConfiguration().GetSlowProcessMetricsFrequency(), 30),
},
}
if !config.GetCollectionConfiguration().GetCollectProcessMetrics() {
status.State = spb.State_FAILURE_STATE
return status
}
status.State = spb.State_SUCCESS_STATE
permissionsStatus, allGranted, err := s.fetchPermissionsStatus(ctx, processMetricsLabel, &permissions.ResourceDetails{ProjectID: s.CloudProps.GetProjectId()})
if err != nil {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Error checking IAM permissions: %v", err.Error()), spb.State_ERROR_STATE)
}
status.IamPermissions = permissionsStatus
if !allGranted {
return logCheckFailureAndReturnStatus(ctx, status, "IAM permissions not granted", spb.State_FAILURE_STATE)
}
status.FullyFunctional = spb.State_SUCCESS_STATE
return status
}
func (s *Status) hanaMonitoringMetricsStatus(ctx context.Context, config *cpb.Configuration) *spb.ServiceStatus {
status := &spb.ServiceStatus{
Name: "HANA Monitoring Metrics",
State: spb.State_UNSPECIFIED_STATE,
ConfigValues: []*spb.ConfigValue{
configValue("connection_timeout", config.GetHanaMonitoringConfiguration().GetConnectionTimeout().GetSeconds(), 120),
configValue("enabled", config.GetHanaMonitoringConfiguration().GetEnabled(), false),
configValue("execution_threads", config.GetHanaMonitoringConfiguration().GetExecutionThreads(), 10),
configValue("max_connect_retries", config.GetHanaMonitoringConfiguration().GetMaxConnectRetries().GetValue(), 1),
configValue("query_timeout_sec", config.GetHanaMonitoringConfiguration().GetQueryTimeoutSec(), 300),
configValue("sample_interval_sec", config.GetHanaMonitoringConfiguration().GetSampleIntervalSec(), 300),
configValue("send_query_response_time", config.GetHanaMonitoringConfiguration().GetSendQueryResponseTime(), false),
},
}
if !config.GetHanaMonitoringConfiguration().GetEnabled() {
status.State = spb.State_FAILURE_STATE
return status
}
status.State = spb.State_SUCCESS_STATE
permissionsStatus, allGranted, err := s.fetchPermissionsStatus(ctx, processMetricsLabel, &permissions.ResourceDetails{ProjectID: s.CloudProps.GetProjectId()})
if err != nil {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Error checking IAM permissions: %v", err.Error()), spb.State_ERROR_STATE)
}
status.IamPermissions = permissionsStatus
if !allGranted {
return logCheckFailureAndReturnStatus(ctx, status, "IAM permissions not granted", spb.State_FAILURE_STATE)
}
var failedInstances []string
for _, i := range s.config.GetHanaMonitoringConfiguration().GetHanaInstances() {
// Note: We ignore timeout params here.
dbp := databaseconnector.Params{
Username: i.GetUser(),
Host: i.GetHost(),
Password: i.GetPassword(),
PasswordSecret: i.GetSecretName(),
Port: i.GetPort(),
EnableSSL: i.GetEnableSsl(),
HostNameInCert: i.GetHostNameInCertificate(),
RootCAFile: i.GetTlsRootCaFile(),
HDBUserKey: i.GetHdbuserstoreKey(),
SID: i.GetSid(),
GCEService: s.gceService,
Project: s.config.GetCloudProperties().GetProjectId(),
PingSpec: &databaseconnector.PingSpec{
Timeout: 1 * time.Second,
MaxRetries: 0,
},
}
if _, err := s.createDBHandle(ctx, dbp); err != nil {
log.CtxLogger(ctx).Errorw("Error connecting to database", "name", i.GetName(), "error", err.Error())
failedInstances = append(failedInstances, i.GetName())
continue
}
}
if len(failedInstances) > 0 {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Failed to connect to HANA instances: %s", strings.Join(failedInstances, ", ")), spb.State_FAILURE_STATE)
}
status.FullyFunctional = spb.State_SUCCESS_STATE
return status
}
func (s *Status) systemDiscoveryStatus(ctx context.Context, config *cpb.Configuration) *spb.ServiceStatus {
status := &spb.ServiceStatus{
Name: "System Discovery",
ConfigValues: []*spb.ConfigValue{
configValue("enable_discovery", config.GetDiscoveryConfiguration().GetEnableDiscovery().GetValue(), true),
configValue("enable_workload_discovery", config.GetDiscoveryConfiguration().GetEnableWorkloadDiscovery().GetValue(), true),
configValue("sap_instances_update_frequency", config.GetDiscoveryConfiguration().GetSapInstancesUpdateFrequency().GetSeconds(), 60),
configValue("system_discovery_update_frequency", config.GetDiscoveryConfiguration().GetSystemDiscoveryUpdateFrequency().GetSeconds(), 4*60*60),
},
}
if !config.GetDiscoveryConfiguration().GetEnableDiscovery().GetValue() {
status.State = spb.State_FAILURE_STATE
return status
}
status.State = spb.State_SUCCESS_STATE
permissionsStatus, allGranted, err := s.fetchPermissionsStatus(ctx, systemDiscoveryLabel, &permissions.ResourceDetails{ProjectID: s.CloudProps.GetProjectId()})
if err != nil {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Error checking IAM permissions: %v", err.Error()), spb.State_ERROR_STATE)
}
status.IamPermissions = permissionsStatus
if !allGranted {
return logCheckFailureAndReturnStatus(ctx, status, "IAM permissions not granted", spb.State_FAILURE_STATE)
}
// Ensure sapservices and anything in /hana/shared have readable permissions.
if err := checkFilePermissions("/usr/sap/sapservices", 0400, s.stat); err != nil {
return logCheckFailureAndReturnStatus(ctx, status, err.Error(), spb.State_FAILURE_STATE)
}
files, err := s.readDir("/hana/shared")
if err != nil {
return logCheckFailureAndReturnStatus(ctx, status, err.Error(), spb.State_FAILURE_STATE)
}
for _, f := range files {
if err := checkFilePermissions("/hana/shared/"+f.Name(), 0400, s.stat); err != nil {
return logCheckFailureAndReturnStatus(ctx, status, err.Error(), spb.State_FAILURE_STATE)
}
}
status.FullyFunctional = spb.State_SUCCESS_STATE
return status
}
func (s *Status) backintStatus(ctx context.Context) *spb.ServiceStatus {
status := &spb.ServiceStatus{
Name: "Backint",
State: spb.State_UNSPECIFIED_STATE,
}
if s.BackintParametersPath == "" {
status.UnspecifiedStateMessage = "Backint parameters file not specified / Disabled"
return status
}
p := backintconfiguration.Parameters{
User: "Status OTE",
Function: "diagnose",
ParamFile: s.BackintParametersPath,
}
config, err := p.ParseArgsAndValidateConfig(s.backintReadFile, s.backintReadFile)
if err != nil {
log.CtxLogger(ctx).Errorw("Could not parse backint parameters", "error", err)
status.State = spb.State_ERROR_STATE
status.ErrorMessage = err.Error()
return status
}
printConfig := backintconfiguration.ConfigToPrint(config)
status.State = spb.State_SUCCESS_STATE
log.CtxLogger(ctx).Infof("Backint parameters: %v", printConfig)
// Due to the large number of config values, print the important ones always
// and the others only if the user has overridden the default.
status.ConfigValues = []*spb.ConfigValue{
configValue("bucket", printConfig.Bucket, ""),
configValue("log_to_cloud", printConfig.LogToCloud.GetValue(), true),
configValue("param_file", printConfig.ParamFile, ""),
}
overrideOnlyConfigValues := []*spb.ConfigValue{
configValue("buffer_size_mb", printConfig.BufferSizeMb, 100),
configValue("client_endpoint", printConfig.ClientEndpoint, ""),
configValue("compress", printConfig.Compress, false),
configValue("custom_time", printConfig.CustomTime, ""),
configValue("encryption_key", printConfig.EncryptionKey, ""),
configValue("folder_prefix", printConfig.FolderPrefix, ""),
configValue("file_read_timeout_ms", printConfig.FileReadTimeoutMs, 60000),
configValue("kms_key", printConfig.KmsKey, ""),
configValue("metadata", printConfig.Metadata, map[string]string{}),
configValue("parallel_streams", printConfig.ParallelStreams, 1),
configValue("parallel_recovery_streams", printConfig.ParallelRecoveryStreams, 0),
configValue("rate_limit_mb", printConfig.RateLimitMb, 0),
configValue("recovery_bucket", printConfig.RecoveryBucket, ""),
configValue("recovery_folder_prefix", printConfig.RecoveryFolderPrefix, ""),
configValue("retries", printConfig.Retries, 5),
configValue("send_metrics_to_monitoring", printConfig.SendMetricsToMonitoring.GetValue(), true),
configValue("service_account_key", printConfig.ServiceAccountKey, ""),
configValue("shorten_folder_path", printConfig.ShortenFolderPath, false),
configValue("storage_class", printConfig.StorageClass, "STORAGE_CLASS_UNSPECIFIED"),
configValue("threads", printConfig.Threads, 64),
configValue("xml_multipart_upload", printConfig.XmlMultipartUpload, false),
configValue("object_retention_mode", printConfig.ObjectRetentionMode, ""),
configValue("object_retention_time", printConfig.ObjectRetentionTime, ""),
}
for _, configValue := range overrideOnlyConfigValues {
if !configValue.GetIsDefault() {
status.ConfigValues = append(status.ConfigValues, configValue)
}
}
permissionsStatus, allGranted, err := s.fetchPermissionsStatus(ctx, backintLabel, &permissions.ResourceDetails{
ProjectID: s.CloudProps.GetProjectId(),
BucketName: printConfig.Bucket,
})
if err != nil {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Error checking IAM permissions: %v", err.Error()), spb.State_ERROR_STATE)
}
if printConfig.XmlMultipartUpload {
multipartUploadPermissionsStatus, multipartAllGranted, err := s.fetchPermissionsStatus(ctx, backintMultipartLabel, &permissions.ResourceDetails{
ProjectID: s.CloudProps.GetProjectId(),
BucketName: printConfig.Bucket,
})
if err != nil {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Error checking multipart upload IAM permissions: %v", err.Error()), spb.State_ERROR_STATE)
}
permissionsStatus = append(permissionsStatus, multipartUploadPermissionsStatus...)
allGranted = allGranted && multipartAllGranted
}
status.IamPermissions = permissionsStatus
if !allGranted {
return logCheckFailureAndReturnStatus(ctx, status, "IAM permissions not granted", spb.State_FAILURE_STATE)
}
connectParams := &storage.ConnectParameters{
StorageClient: s.backintClient,
ServiceAccount: config.GetServiceAccountKey(),
BucketName: config.GetBucket(),
UserAgentSuffix: "Backint for GCS",
VerifyConnection: true,
MaxRetries: config.GetRetries(),
Endpoint: config.GetClientEndpoint(),
UserAgent: configuration.StorageAgentName(),
}
_, ok := storage.ConnectToBucket(ctx, connectParams)
if !ok {
return logCheckFailureAndReturnStatus(ctx, status, "Failed to connect to bucket", spb.State_FAILURE_STATE)
}
status.FullyFunctional = spb.State_SUCCESS_STATE
return status
}
func (s *Status) diskSnapshotStatus(ctx context.Context, config *cpb.Configuration) *spb.ServiceStatus {
status := &spb.ServiceStatus{
Name: "Disk Snapshot",
State: spb.State_SUCCESS_STATE, // Disk snapshot is an OTE so there's no enabled state to check.
}
permissionsStatus, allGranted, err := s.fetchPermissionsStatus(ctx, diskBackupLabel, &permissions.ResourceDetails{
ProjectID: s.CloudProps.GetProjectId(),
})
if err != nil {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Error checking IAM permissions: %v", err.Error()), spb.State_ERROR_STATE)
}
status.IamPermissions = permissionsStatus
if !allGranted {
return logCheckFailureAndReturnStatus(ctx, status, "IAM permissions not granted", spb.State_FAILURE_STATE)
}
// TODO: Add striped disk checks once that is GA.
status.FullyFunctional = spb.State_SUCCESS_STATE
return status
}
func (s *Status) workloadManagerStatus(ctx context.Context, config *cpb.Configuration) *spb.ServiceStatus {
status := &spb.ServiceStatus{
Name: "Workload Manager Evaluation",
State: spb.State_UNSPECIFIED_STATE,
ConfigValues: []*spb.ConfigValue{
configValue("collect_workload_validation_metrics", config.GetCollectionConfiguration().GetCollectWorkloadValidationMetrics().GetValue(), true),
configValue("config_target_environment", config.GetCollectionConfiguration().GetWorkloadValidationCollectionDefinition().GetConfigTargetEnvironment(), cpb.TargetEnvironment_PRODUCTION),
configValue("fetch_latest_config", config.GetCollectionConfiguration().GetWorkloadValidationCollectionDefinition().GetFetchLatestConfig().GetValue(), true),
configValue("workload_validation_db_metrics_frequency", config.GetCollectionConfiguration().GetWorkloadValidationDbMetricsFrequency(), 3600),
configValue("workload_validation_metrics_frequency", config.GetCollectionConfiguration().GetWorkloadValidationMetricsFrequency(), 300),
},
}
if !config.GetCollectionConfiguration().GetCollectWorkloadValidationMetrics().GetValue() {
status.State = spb.State_FAILURE_STATE
return status
}
status.State = spb.State_SUCCESS_STATE
permissionsStatus, allGranted, err := s.fetchPermissionsStatus(ctx, workloadEvaluationMELabel, &permissions.ResourceDetails{ProjectID: s.CloudProps.GetProjectId()})
if err != nil {
return logCheckFailureAndReturnStatus(ctx, status, fmt.Sprintf("Error checking IAM permissions: %v", err.Error()), spb.State_ERROR_STATE)
}
status.IamPermissions = permissionsStatus
if !allGranted {
return logCheckFailureAndReturnStatus(ctx, status, "IAM permissions not granted", spb.State_FAILURE_STATE)
}
status.FullyFunctional = spb.State_SUCCESS_STATE
return status
}
func configValue(name string, value any, defaultValue any) *spb.ConfigValue {
return &spb.ConfigValue{
Name: name,
Value: fmt.Sprint(value),
IsDefault: fmt.Sprint(value) == fmt.Sprint(defaultValue),
}
}
func (s *Status) fetchPermissionsStatus(ctx context.Context, functionalityName string, resourceProps *permissions.ResourceDetails) ([]*spb.IAMPermission, bool, error) {
permissions, err := s.permissionsStatus(ctx, s.iamService, functionalityName, resourceProps)
if err != nil {
return nil, false, err
}
allGranted := true
var permissionsStatus []*spb.IAMPermission
for permission, granted := range permissions {
var permissionState spb.State
if granted {
permissionState = spb.State_SUCCESS_STATE
} else {
allGranted = false
permissionState = spb.State_FAILURE_STATE
}
permissionsStatus = append(permissionsStatus, &spb.IAMPermission{
Name: permission,
Granted: permissionState,
})
// Sort the permissions by name for consistency.
sort.Slice(permissionsStatus, func(i, j int) bool {
return permissionsStatus[i].GetName() < permissionsStatus[j].GetName()
})
}
return permissionsStatus, allGranted, nil
}
func logCheckFailureAndReturnStatus(ctx context.Context, status *spb.ServiceStatus, msg string, fullyFunctional spb.State) *spb.ServiceStatus {
log.CtxLogger(ctx).Errorw(msg)
status.FullyFunctional = fullyFunctional
status.ErrorMessage = msg
return status
}
func checkFilePermissions(path string, wantPermissions fs.FileMode, stat statFunc) error {
fileInfo, err := stat(path)
if err != nil {
return err
}
if fileInfo.Mode().Perm()&wantPermissions != wantPermissions {
return fmt.Errorf("%s has incorrect permissions. Got: %#o, want: %#o", path, fileInfo.Mode().Perm(), wantPermissions)
}
return nil
}