integration_test/agents/agents.go (1,031 lines of code) (raw):

// Copyright 2022 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build integration_test /* Package agents holds various helpers for interacting with the agents on GCE instances. Note: This file contains various things that appear unused, but in the future we would like them to be used for a few tests inside Google's code silo. So please don't delete things just because they are unused within the Ops Agent repo. */ package agents import ( "context" "embed" "fmt" "log" "os" "path" "regexp" "strings" "testing" "time" "github.com/GoogleCloudPlatform/opentelemetry-operations-collector/integration_test/gce-testing-internal/gce" "github.com/GoogleCloudPlatform/opentelemetry-operations-collector/integration_test/gce-testing-internal/logging" "github.com/blang/semver" "github.com/cenkalti/backoff/v4" "go.uber.org/multierr" ) // TrailingQueryWindow represents how far into the past to look when querying // for uptime metrics. const TrailingQueryWindow = 2 * time.Minute // OpsAgentPluginServerPort defines the port on which the Ops Agent UAP Plugin gRPC server runs. const OpsAgentPluginServerPort = "1234" //go:embed testdata var scriptsDir embed.FS // AgentPackage represents a thing that we ask OS Config to install for us. // One agentPackage can expand out and result in multiple AgentServices // being installed on the VM. type AgentPackage struct { // Passed to the --agent-rules flag of "gcloud ... policies create|update" Type string // The name of the package that actually gets installed on the VM. PackageName string } // AgentService represents an agent service/process that will end up running // on the VM. type AgentService struct { // The name of the systemctl service for this agent. ServiceName string // The name of the agent according to package managers. The PackageName // is what is passed to dpkg/apt/yum/zypper/etc commands. PackageName string // The name of the agent according to the uptime metric that it uploads to // the monitoring backend. // If this is "", it means that this service doesn't upload an uptime metric. UptimeMetricName string } // TestCase holds the name of a test scenario and the packages being tested. type TestCase struct { Name string AgentPackages []AgentPackage } // AgentServices expands the list of packages to install into the list of services // that will end up running on the machine. This is necessary for the ops agent, // which is one package but results in multiple running agent services. func AgentServices(t *testing.T, imageSpec string, pkgs []AgentPackage) []AgentService { t.Helper() if gce.IsWindows(imageSpec) { if len(pkgs) != 1 || pkgs[0].Type != OpsAgentType { t.Fatalf("AgentServices() assumes that the only package we want to install on Windows is the ops agent. Requested packages: %v", pkgs) } return []AgentService{ { ServiceName: "google-cloud-ops-agent", PackageName: "google-cloud-ops-agent", // This service doesn't currently have an uptime metric. UptimeMetricName: "", }, { ServiceName: "google-cloud-ops-agent-fluent-bit", PackageName: "google-cloud-ops-agent-fluent-bit", // TODO(b/170138116): Enable this metric once it is being uploaded for // Fluent-Bit. UptimeMetricName: "", }, { ServiceName: "google-cloud-ops-agent-opentelemetry-collector", PackageName: "google-cloud-ops-agent-opentelemetry-collector", UptimeMetricName: "google-cloud-ops-agent-metrics", }, } } var services []AgentService for _, pkg := range pkgs { switch pkg.Type { case LoggingAgentType: services = append(services, AgentService{ ServiceName: "google-fluentd", PackageName: "google-fluentd", UptimeMetricName: "google-fluentd", }, ) case MetricsAgentType: services = append(services, AgentService{ ServiceName: "stackdriver-agent", PackageName: "stackdriver-agent", UptimeMetricName: "stackdriver_agent", }, ) case OpsAgentType: services = append(services, AgentService{ ServiceName: "google-cloud-ops-agent-fluent-bit", PackageName: "google-cloud-ops-agent", // TODO(b/170138116): Enable this check once the uptime metric is // being uploaded for Fluent-Bit. UptimeMetricName: "", }, AgentService{ ServiceName: "google-cloud-ops-agent-opentelemetry-collector", PackageName: "google-cloud-ops-agent", UptimeMetricName: "google-cloud-ops-agent-metrics", }, ) default: t.Fatalf("Package %#v has a Type that is not supported by this test", pkg) } } return services } var ( // LoggingAgentType represents the type var for the Logging Agent. LoggingAgentType = "logging" // MetricsAgentType represents the type var for the Monitoring Agent. MetricsAgentType = "metrics" // OpsAgentType represents the type var for the Ops Agent. OpsAgentType = "ops-agent" // LoggingPackage holds the type var and package name of the Logging Agent. LoggingPackage = AgentPackage{Type: LoggingAgentType, PackageName: "google-fluentd"} // MetricsPackage holds the type var and package name of the Metrics Agent. MetricsPackage = AgentPackage{Type: MetricsAgentType, PackageName: "stackdriver-agent"} // OpsPackage holds the type var and package name of the Ops Agent. OpsPackage = AgentPackage{Type: OpsAgentType, PackageName: "google-cloud-ops-agent"} // This suffix helps Kokoro set the right Content-type for log files. See b/202432085. txtSuffix = ".txt" ) // RunOpsAgentDiagnostics will fetch as much debugging info as it can from the // given VM. This function assumes that the VM is running the ops agent. // All the commands run and their output are dumped to various files in the // directory managed by the given DirectoryLogger. func RunOpsAgentDiagnostics(ctx context.Context, logger *logging.DirectoryLogger, vm *gce.VM) { logger.ToMainLog().Printf("Starting RunOpsAgentDiagnostics()...") if gce.IsWindows(vm.ImageSpec) { runOpsAgentDiagnosticsWindows(ctx, logger, vm) return } // Tests like TestPortsAndAPIHealthChecks will make these curl operations // hang, so give them a shorter timeout to avoid hanging the whole test. metricsCtx, cancel := context.WithTimeout(ctx, 30*time.Second) defer cancel() gce.RunRemotely(metricsCtx, logger.ToFile("fluent_bit_metrics.txt"), vm, "sudo curl -s localhost:20202/metrics") gce.RunRemotely(metricsCtx, logger.ToFile("otel_metrics.txt"), vm, "sudo curl -s localhost:20201/metrics") isUAPPlugin := gce.IsOpsAgentUAPPlugin() if isUAPPlugin { gce.RunRemotely(ctx, logger.ToFile("status_for_ops_agent_uap_plugin.txt"), vm, fmt.Sprintf("grpcurl -plaintext -d '{}' localhost:%s plugin_comm.GuestAgentPlugin/GetStatus", OpsAgentPluginServerPort)) } else { gce.RunRemotely(ctx, logger.ToFile("systemctl_status_for_ops_agent.txt"), vm, "sudo systemctl status google-cloud-ops-agent*") } gce.RunRemotely(ctx, logger.ToFile("journalctl_output.txt"), vm, "sudo journalctl -xe") fileList := getOpsAgentLogFilesList(vm.ImageSpec) for _, log := range fileList { _, basename := path.Split(log) gce.RunRemotely(ctx, logger.ToFile(basename+txtSuffix), vm, "sudo cat "+log) } } func getOpsAgentLogFilesList(imageSpec string) []string { if gce.IsOpsAgentUAPPlugin() { return []string{ gce.SyslogLocation(imageSpec), OpsAgentConfigPath(imageSpec), "/var/lib/google-guest-agent/agent_state/plugins/ops-agent-plugin/log/google-cloud-ops-agent/health-checks.log", "/var/lib/google-guest-agent/agent_state/plugins/ops-agent-plugin/log/google-cloud-ops-agent/subagents/logging-module.log", "/var/lib/google-guest-agent/agent_state/plugins/ops-agent-plugin/log/google-cloud-ops-agent/subagents/metrics-module.log", "/var/lib/google-guest-agent/agent_state/plugins/ops-agent-plugin/log/nvidia-installer.log", "/var/lib/google-guest-agent/agent_state/plugins/ops-agent-plugin/run/google-cloud-ops-agent-fluent-bit/fluent_bit_main.conf", "/var/lib/google-guest-agent/agent_state/plugins/ops-agent-plugin/run/google-cloud-ops-agent-fluent-bit/fluent_bit_parser.conf", "/var/lib/google-guest-agent/agent_state/plugins/ops-agent-plugin/run/google-cloud-ops-agent-opentelemetry-collector/otel.yaml", "/var/lib/google-guest-agent/agent_state/plugins/ops-agent-plugin/run/google-cloud-ops-agent-opentelemetry-collector/feature_tracking_otlp.json", "/var/lib/google-guest-agent/agent_state/plugins/ops-agent-plugin/run/google-cloud-ops-agent-opentelemetry-collector/enabled_receivers_otlp.json", } } return []string{ gce.SyslogLocation(imageSpec), OpsAgentConfigPath(imageSpec), "/var/log/google-cloud-ops-agent/health-checks.log", "/var/log/google-cloud-ops-agent/subagents/logging-module.log", "/var/log/google-cloud-ops-agent/subagents/metrics-module.log", "/var/log/nvidia-installer.log", "/run/google-cloud-ops-agent-fluent-bit/fluent_bit_main.conf", "/run/google-cloud-ops-agent-fluent-bit/fluent_bit_parser.conf", "/run/google-cloud-ops-agent-opentelemetry-collector/otel.yaml", "/run/google-cloud-ops-agent-opentelemetry-collector/feature_tracking_otlp.json", "/run/google-cloud-ops-agent-opentelemetry-collector/enabled_receivers_otlp.json", } } func runOpsAgentDiagnosticsWindows(ctx context.Context, logger *logging.DirectoryLogger, vm *gce.VM) { stateDir := `C:\ProgramData\Google\Cloud Operations\Ops Agent\` if gce.IsOpsAgentUAPPlugin() { stateDir = `C:\ProgramData\Google\Compute Engine\google-guest-agent\agent_state\plugins\ops-agent-plugin\` gce.RunRemotely(ctx, logger.ToFile("ops_agent_uap_plugin_logs.txt"), vm, "Get-WinEvent -FilterHashtable @{ Logname='Application'; ProviderName='google-cloud-ops-agent-uap-plugin' } | Format-Table -AutoSize -Wrap") gce.RunRemotely(ctx, logger.ToFile("status_for_ops_agent_uap_plugin.txt"), vm, fmt.Sprintf(`C:\grpcurl.exe -plaintext -d '{}' localhost:%s plugin_comm.GuestAgentPlugin/GetStatus`, OpsAgentPluginServerPort)) gce.RunRemotely(ctx, logger.ToFile("active_processes_in_vm.txt"), vm, `Get-WmiObject -Class Win32_Process | Select-Object Name, ProcessId, ParentProcessId`) } else { gce.RunRemotely(ctx, logger.ToFile("windows_System_log.txt"), vm, "Get-WinEvent -LogName System | Format-Table -AutoSize -Wrap") gce.RunRemotely(ctx, logger.ToFile("Get-Service_output.txt"), vm, "Get-Service google-cloud-ops-agent* | Format-Table -AutoSize -Wrap") gce.RunRemotely(ctx, logger.ToFile("ops_agent_logs.txt"), vm, "Get-WinEvent -FilterHashtable @{ Logname='Application'; ProviderName='google-cloud-ops-agent' } | Format-Table -AutoSize -Wrap") gce.RunRemotely(ctx, logger.ToFile("open_telemetry_agent_logs.txt"), vm, "Get-WinEvent -FilterHashtable @{ Logname='Application'; ProviderName='google-cloud-ops-agent-opentelemetry-collector' } | Format-Table -AutoSize -Wrap") } // Fluent-Bit has not implemented exporting logs to the Windows event log yet. gce.RunRemotely(ctx, logger.ToFile("fluent_bit_agent_logs.txt"), vm, fmt.Sprintf("Get-Content -Path '%s' -Raw", stateDir+`log\logging-module.log`)) gce.RunRemotely(ctx, logger.ToFile("health-checks.txt"), vm, fmt.Sprintf("Get-Content -Path '%s' -Raw", stateDir+`log\health-checks.log`)) for _, conf := range []string{ OpsAgentConfigPath(vm.ImageSpec), stateDir + `generated_configs\fluentbit\fluent_bit_main.conf`, stateDir + `generated_configs\fluentbit\fluent_bit_parser.conf`, stateDir + `generated_configs\otel\otel.yaml`, stateDir + `generated_configs\otel\feature_tracking_otlp.json`, stateDir + `generated_configs\otel\enabled_receivers_otlp.json`, } { pathParts := strings.Split(conf, `\`) basename := pathParts[len(pathParts)-1] gce.RunRemotely(ctx, logger.ToFile(basename+txtSuffix), vm, fmt.Sprintf("Get-Content -Path '%s' -Raw", conf)) } } // WaitForUptimeMetrics waits for the given uptime metrics to be visible in // the monitoring backend, returning an error if the lookup failed or the // metrics didn't become visible. func WaitForUptimeMetrics(ctx context.Context, logger *log.Logger, vm *gce.VM, services []AgentService) error { // Run gce.WaitForMetric in parallel for each agent. var err error c := make(chan error, len(services)) for _, service := range services { service := service // https://golang.org/doc/faq#closures_and_goroutines go func() { if service.UptimeMetricName == "" { logger.Printf("Skipping lookup of uptime metric for %v", service.ServiceName) c <- nil return } _, err := gce.WaitForMetric( ctx, logger, vm, "agent.googleapis.com/agent/uptime", TrailingQueryWindow, []string{fmt.Sprintf("metric.labels.version = starts_with(%q)", service.UptimeMetricName)}, false) c <- err }() } for range services { err = multierr.Append(err, <-c) } if err != nil { return fmt.Errorf("WaitForUptimeMetrics() error: %v", err) } return nil } // CheckServicesRunning asserts that the given services are currently running // on the given VM, returning an error if they are not (or if there was an // error determining their status). func CheckServicesRunning(ctx context.Context, logger *log.Logger, vm *gce.VM, services []AgentService) error { var err error for _, service := range services { if gce.IsWindows(vm.ImageSpec) { output, statusErr := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf("(Get-Service -Name '%s').Status", service.ServiceName)) if statusErr != nil { err = multierr.Append(err, fmt.Errorf("no service named %q could be found: %v", service.ServiceName, statusErr)) continue } status := strings.TrimSpace(output.Stdout) if status != "Running" { err = multierr.Append(err, fmt.Errorf(`for service %q, got status=%q, want "Running"`, service.ServiceName, status)) continue } continue } if _, statusErr := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf("sudo service %s status", service.ServiceName)); statusErr != nil { err = multierr.Append(err, fmt.Errorf("RunRemotely(): status of %s was not OK. error was: %v", service.ServiceName, statusErr)) } } if err != nil { return fmt.Errorf("CheckServicesRunning() error: %v", err) } return nil } // CheckServicesRunningAndWaitForMetrics asserts that the agent services are // currently running on the given VM and waits for their uptime metrics to // become visible. func CheckServicesRunningAndWaitForMetrics(ctx context.Context, logger *log.Logger, vm *gce.VM, services []AgentService) error { logger.Print("Checking that services are running.") if err := CheckServicesRunning(ctx, logger, vm, services); err != nil { return err } logger.Print("Waiting for uptime metrics to be visible.") if err := WaitForUptimeMetrics(ctx, logger, vm, services); err != nil { return err } logger.Print("Agents are successfully running.") return nil } // CheckServicesNotRunning asserts that the given services are not running on the // given VM. func CheckServicesNotRunning(ctx context.Context, logger *log.Logger, vm *gce.VM, services []AgentService) error { logger.Print("Checking that agents are NOT running") for _, service := range services { if gce.IsWindows(vm.ImageSpec) { // There are two possible cases that are acceptable here: // 1) the service has been deleted // 2) the service still exists but is in some state other than "Running", // such as "Stopped". // We currently only see case #1, but let's permit case #2 as well. // The following command should output nothing, or maybe just a blank // line, in either case. output, err := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf("Get-Service | Where-Object {$_.Name -eq '%s' -and $_.Status -eq 'Running'}", service.ServiceName)) if err != nil { return fmt.Errorf("CheckServicesNotRunning(): error looking up running services named %v: %v", service.ServiceName, err) } if strings.TrimSpace(output.Stdout) != "" { return fmt.Errorf("CheckServicesNotRunning(): service %v was unexpectedly 'Running'. output: %s", service.ServiceName, output.Stdout) } continue } if service.ServiceName == "google-fluentd" { continue // TODO(b/159817885): Assert that google-fluentd is not running. } if _, err := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf("! sudo service %s status", service.ServiceName)); err != nil { return fmt.Errorf("CheckServicesNotRunning(): command could not be run or status of %s was unexpectedly OK. err: %v", service.ServiceName, err) } } return nil } func packageManagerCmd(vm *gce.VM) (string, error) { if gce.IsWindows(vm.ImageSpec) { return "googet -noconfirm", nil } switch vm.OS.ID { case "centos", "rhel", "rocky": return "sudo yum -y", nil case "opensuse-leap", "sles", "sles-sap": return "sudo zypper --non-interactive", nil case "debian", "ubuntu": return "sudo apt-get update; sudo apt-get -y", nil default: return "", fmt.Errorf("packageManagerCmd() doesn't support image spec %s with value '%s'", vm.ImageSpec, vm.OS.ID) } } // managePackages calls the package manager of the vm with the provided instruction (install/remove) for a set of packages. func managePackages(ctx context.Context, logger *log.Logger, vm *gce.VM, instruction string, pkgs []string) error { pkgMCmd, err := packageManagerCmd(vm) if err != nil { return err } cmd := fmt.Sprintf("%s %s %s", pkgMCmd, instruction, strings.Join(pkgs, " ")) _, err = gce.RunRemotely(ctx, logger, vm, cmd) return err } // tryInstallPackages attempts once to install the given packages. func tryInstallPackages(ctx context.Context, logger *log.Logger, vm *gce.VM, pkgs []string) error { return managePackages(ctx, logger, vm, "install", pkgs) } // InstallPackages installs the given packages on the given VM. Assumes repo is // configured. This function has some retries to paper over transient issues // that often happen when installing packages. func InstallPackages(ctx context.Context, logger *log.Logger, vm *gce.VM, pkgs []string) error { attemptFunc := func() error { return tryInstallPackages(ctx, logger, vm, pkgs) } if err := RunInstallFuncWithRetry(ctx, logger, vm, attemptFunc); err != nil { return fmt.Errorf("could not install %v. err: %v", pkgs, err) } return nil } // UninstallPackages removes the given packages from the given VM. func UninstallPackages(ctx context.Context, logger *log.Logger, vm *gce.VM, pkgs []string) error { return managePackages(ctx, logger, vm, "remove", pkgs) } // checkPackages asserts that the given packages on the given VM are in a state matching the installed argument. func checkPackages(ctx context.Context, logger *log.Logger, vm *gce.VM, pkgs []string, installed bool) error { errPrefix := "" if installed { errPrefix = "not " } if gce.IsWindows(vm.ImageSpec) { output, err := gce.RunRemotely(ctx, logger, vm, "googet installed") if err != nil { return err } for _, pkg := range pkgs { // Look for the package name in the output. \b means word boundary. expr := `\b` + pkg + `\b` re, err := regexp.Compile(expr) if err != nil { return fmt.Errorf("regexp %q failed to compile: %v", expr, err) } if m := re.FindString(output.Stdout); (m == "") == installed { return fmt.Errorf("package %q was unexpectedly %sinstalled", pkg, errPrefix) } } // Success: the expected set of packages was found in the output of "googet installed". return nil } for _, pkg := range pkgs { cmd := "" if IsRPMBased(vm.ImageSpec) { cmd = fmt.Sprintf("rpm --query %s", pkg) if !installed { cmd = "! " + cmd } } else if strings.HasPrefix(vm.ImageSpec, "debian-cloud") || strings.HasPrefix(vm.ImageSpec, "ubuntu-os-cloud") { // dpkg's package states are documented in "man 1 dpkg". // "config-files" means that the package is not installed but its config files // are still on the system. Accepting both that and "not-installed" is more // future-proof than just accepting "config-files". cmd = fmt.Sprintf( `(dpkg-query --show '--showformat=${db:Status-Status}' %s || echo 'not-installed') | grep --extended-regexp '(not-installed)|(config-files)'`, pkg) if installed { cmd = "! (" + cmd + ")" } } else { return fmt.Errorf("checkPackages() does not support image spec: %s", vm.ImageSpec) } if _, err := gce.RunRemotely(ctx, logger, vm, cmd); err != nil { return fmt.Errorf("command could not be run or %q was unexpectedly %sinstalled. err: %v", pkg, errPrefix, err) } } return nil } // CheckPackagesNotInstalled asserts that the given packages are not installed on the given VM. func CheckPackagesNotInstalled(ctx context.Context, logger *log.Logger, vm *gce.VM, pkgs []string) error { return checkPackages(ctx, logger, vm, pkgs, false) } // CheckPackagesInstalled asserts that the given packages are installed on the given VM. func CheckPackagesInstalled(ctx context.Context, logger *log.Logger, vm *gce.VM, pkgs []string) error { return checkPackages(ctx, logger, vm, pkgs, true) } // CheckAgentsUninstalled asserts that the given agent services and packages // are not installed on the given VM. func CheckAgentsUninstalled(ctx context.Context, logger *log.Logger, vm *gce.VM, services []AgentService) error { if err := CheckServicesNotRunning(ctx, logger, vm, services); err != nil { return err } var pkgs []string for _, service := range services { pkgs = append(pkgs, service.PackageName) } if err := CheckPackagesNotInstalled(ctx, logger, vm, pkgs); err != nil { return err } // TODO(martijnvs): Also check that the ops agent package itself is uninstalled. return nil } // IsRPMBased checks if the image spec is RPM based. func IsRPMBased(imageSpec string) bool { return strings.HasPrefix(imageSpec, "centos-cloud") || strings.HasPrefix(imageSpec, "rhel-") || strings.HasPrefix(imageSpec, "rocky-linux-cloud") || strings.HasPrefix(imageSpec, "suse-cloud") || strings.HasPrefix(imageSpec, "suse-sap-cloud") || strings.HasPrefix(imageSpec, "opensuse-cloud") || strings.Contains(imageSpec, "sles-") } // StripTildeSuffix strips off everything after the first ~ character. We see // version numbers with tildes on debian (e.g. 1.0.1~debian10) and the semver // library doesn't parse them out properly. func StripTildeSuffix(version string) string { ind := strings.Index(version, "~") if ind == -1 { return version } return version[:ind] } func fetchPackageVersionWindows(ctx context.Context, logger *log.Logger, vm *gce.VM, pkg string) (semver.Version, error) { output, err := gce.RunRemotely(ctx, logger, vm, "googet installed -info "+pkg) if err != nil { return semver.Version{}, err } // Look for a line in the output with a version number. Some real examples: // " Version : 20201229.01.0+win@1" // " Version : 1.0.6@1" re, err := regexp.Compile(`\s*Version\s*:\s*([0-9]+\.[0-9]+\.[0-9]+)[^0-9]+.*`) if err != nil { return semver.Version{}, fmt.Errorf("Could not compile regular expression: %v", err) } lines := strings.Split(output.Stdout, "\r\n") var version string for _, line := range lines { matches := re.FindStringSubmatch(line) if len(matches) > 1 { version = matches[1] break } } if version == "" { return semver.Version{}, fmt.Errorf("Could not parse version from googet output: %v", output.Stdout) } return semver.Make(version) } // fetchPackageVersion runs a googet/dpkg/rpm command remotely to get the // installed version of a package. It has some retries to accommodate the fact // that sometimes it takes ~1 minute between when the agents are up and running // and the time that the package shows up as installed with rpm --query // (b/178096139). // TODO(martijnvs): Understand why b/178096139 happens and remove the retries. func fetchPackageVersion(ctx context.Context, logger *log.Logger, vm *gce.VM, pkg string) (semver.Version, error) { logger.Printf("Getting %s version", pkg) if gce.IsWindows(vm.ImageSpec) { return fetchPackageVersionWindows(ctx, logger, vm, pkg) } var version semver.Version tryGetVersion := func() error { cmd := `dpkg-query --show --showformat=\$\{Version} ` + pkg if IsRPMBased(vm.ImageSpec) { cmd = `rpm --query --queryformat=%\{VERSION} ` + pkg } output, err := gce.RunRemotely(ctx, logger, vm, cmd) if err != nil { return err } vStr := strings.TrimSpace(output.Stdout) if !IsRPMBased(vm.ImageSpec) { vStr = StripTildeSuffix(vStr) } version, err = semver.Make(vStr) if err != nil { err = fmt.Errorf("semver.Make() could not convert %q to a Version: %v", vStr, err) logger.Print(err) return err } return nil } backoffPolicy := backoff.WithContext(backoff.WithMaxRetries(backoff.NewConstantBackOff(20*time.Second), 10), ctx) if err := backoff.Retry(tryGetVersion, backoffPolicy); err != nil { return semver.Version{}, err } return version, nil } // FetchPackageVersions retrieves the semver.Version on the given VM for all the agents // in the given []agents.AgentPackage. func FetchPackageVersions(ctx context.Context, logger *log.Logger, vm *gce.VM, packages []AgentPackage) (map[AgentPackage]semver.Version, error) { versions := make(map[AgentPackage]semver.Version) var err error for _, pkg := range packages { current, versionErr := fetchPackageVersion(ctx, logger, vm, pkg.PackageName) logger.Printf("fetchPackageVersion() returned version=%v, err=%v", current, versionErr) if versionErr != nil { err = multierr.Append(err, fmt.Errorf("fetchPackageVersion(%q) failed: %v", pkg.PackageName, versionErr)) continue } versions[pkg] = current } return versions, err } // isRetriableInstallError checks to see if the error may be transient. func isRetriableInstallError(imageSpec string, err error) bool { if strings.Contains(err.Error(), "Could not refresh zypper repositories.") || strings.Contains(err.Error(), "Credentials are invalid") || strings.Contains(err.Error(), "Resource temporarily unavailable") || strings.Contains(err.Error(), "System management is locked by the application") { return true } if gce.IsWindows(imageSpec) && strings.Contains(err.Error(), "context deadline exceeded") { return true // See b/197127877 for history. } if strings.Contains(imageSpec, "rhel-8-") && strings.HasSuffix(imageSpec, "-sap-ha") && strings.Contains(err.Error(), "Could not refresh the google-cloud-ops-agent yum repositories") { return true // See b/174039270 for history. } if strings.Contains(imageSpec, "rhel-8-") && strings.HasSuffix(imageSpec, "-sap-ha") && strings.Contains(err.Error(), "Failed to download metadata for repo 'rhui-rhel-8-") { return true // This happens when the RHEL servers are down. See b/189950957. } if strings.HasPrefix(imageSpec, "rhel-") && strings.Contains(err.Error(), "SSL_ERROR_SYSCALL") { return true // b/187661497. Example: screen/3PMwAvhNBKWVYub } if strings.HasPrefix(imageSpec, "rhel-") && strings.Contains(err.Error(), "Encountered end of file") { return true // b/184729120#comment31. Example: screen/4yK9evoY68LiaLr } if strings.HasPrefix(imageSpec, "ubuntu-os-cloud") && strings.Contains(err.Error(), "Clearsigned file isn't valid") { // The upstream repo was in an inconsistent state. The error looks like: // screen/7U24zrRwADyYKqb return true } if strings.HasPrefix(imageSpec, "ubuntu-os-cloud") && strings.Contains(err.Error(), "Mirror sync in progress?") { // The mirror repo was in an inconsistent state. The error looks like: // http://screen/Ai2CHc7fcRosHJu return true } if strings.HasPrefix(imageSpec, "ubuntu-os-cloud") && strings.Contains(err.Error(), "Hash Sum mismatch") { // The mirror repo was in an inconsistent state. The error looks like: // http://screen/8HjakedwVnXZvw6 return true } return false } // RunInstallFuncWithRetry runs the given installFunc, retrying errors that // seem transient. In between attempts, this function may attempt to recover // from certain errors. // This function is intended to paper over various problems that arise when // running apt/zypper/yum. installFunc can be any function that is invoking // apt/zypper/yum, directly or indirectly. func RunInstallFuncWithRetry(ctx context.Context, logger *log.Logger, vm *gce.VM, installFunc func() error) error { shouldRetry := func(err error) bool { return isRetriableInstallError(vm.ImageSpec, err) } installWithRecovery := func() error { err := installFunc() if err != nil && shouldRetry(err) && strings.Contains(vm.ImageSpec, "rhel-8-") && strings.HasSuffix(vm.ImageSpec, "-sap-ha") { logger.Println("attempting recovery steps from https://access.redhat.com/discussions/4656371 so that subsequent attempts are more likely to succeed... see b/189950957") gce.RunRemotely(ctx, logger, vm, "sudo dnf clean all && sudo rm -r /var/cache/dnf && sudo dnf upgrade") } if err != nil && !shouldRetry(err) { err = backoff.Permanent(err) } // Returning a non-permanent error triggers retries. return err } backoffPolicy := backoff.WithContext(backoff.WithMaxRetries(backoff.NewConstantBackOff(30*time.Second), 5), ctx) return backoff.Retry(installWithRecovery, backoffPolicy) } // InstallStandaloneWindowsLoggingAgent installs the Stackdriver Logging agent // on a Windows VM. func InstallStandaloneWindowsLoggingAgent(ctx context.Context, logger *log.Logger, vm *gce.VM) error { // https://cloud.google.com/logging/docs/agent/installation#joint-install // The command needed to be adjusted to work in a non-GUI context. cmd := `(New-Object Net.WebClient).DownloadFile("https://dl.google.com/cloudagents/windows/StackdriverLogging-v1-16.exe", "${env:UserProfile}\StackdriverLogging-v1-16.exe") Start-Process -FilePath "${env:UserProfile}\StackdriverLogging-v1-16.exe" -ArgumentList "/S" -Wait -NoNewWindow` _, err := gce.RunRemotely(ctx, logger, vm, cmd) return err } // InstallStandaloneWindowsMonitoringAgent installs the Stackdriver Monitoring // agent on a Windows VM. func InstallStandaloneWindowsMonitoringAgent(ctx context.Context, logger *log.Logger, vm *gce.VM) error { // https://cloud.google.com/monitoring/agent/installation#joint-install // The command needed to be adjusted to work in a non-GUI context. cmd := `(New-Object Net.WebClient).DownloadFile("https://repo.stackdriver.com/windows/StackdriverMonitoring-GCM-46.exe", "${env:UserProfile}\StackdriverMonitoring-GCM-46.exe") Start-Process -FilePath "${env:UserProfile}\StackdriverMonitoring-GCM-46.exe" -ArgumentList "/S" -Wait -NoNewWindow` _, err := gce.RunRemotely(ctx, logger, vm, cmd) return err } func getRestartOpsAgentCmd(imageSpec string) string { if gce.IsOpsAgentUAPPlugin() { grpcurlExecutable := "grpcurl" if gce.IsWindows(imageSpec) { grpcurlExecutable = `C:\grpcurl.exe` return fmt.Sprintf("%s -plaintext -d '{}' localhost:%s plugin_comm.GuestAgentPlugin/Stop; Start-Sleep -Seconds 5; %s -plaintext -d '{}' localhost:%s plugin_comm.GuestAgentPlugin/Start", grpcurlExecutable, OpsAgentPluginServerPort, grpcurlExecutable, OpsAgentPluginServerPort) } return fmt.Sprintf("%s -plaintext -d '{}' localhost:%s plugin_comm.GuestAgentPlugin/Stop && sleep 5 && %s -plaintext -d '{}' localhost:%s plugin_comm.GuestAgentPlugin/Start", grpcurlExecutable, OpsAgentPluginServerPort, grpcurlExecutable, OpsAgentPluginServerPort) } if gce.IsWindows(imageSpec) { return "Restart-Service google-cloud-ops-agent -Force" } // Return a command that works for both < 2.0.0 and >= 2.0.0 agents. return "sudo service google-cloud-ops-agent restart || sudo systemctl restart google-cloud-ops-agent" } // PackageLocation describes a location where packages // (currently, only the Ops Agent packages) live. type PackageLocation struct { // If provided, a URL for a directory in GCS containing .deb/.rpm/.goo files // to install on the testing VMs. // This setting is mutually exclusive with repoSuffix. packagesInGCS string // Package repository suffix to install from. Setting this and packagesInGCS // to "" means to install the latest stable release. repoSuffix string // Override the codename for the agent repository. // This setting is only used for ARM builds at the moment, and ignored when // installing from Artifact Registry. repoCodename string // Package repository GCP project to install from. Requires repoSuffix // to be nonempty. artifactRegistryProject string // Region the packages live in in Artifact Registry. Requires repoSuffix // to be nonempty. artifactRegistryRegion string } // LocationFromEnvVars assembles a PackageLocation from environment variables. func LocationFromEnvVars() PackageLocation { return PackageLocation{ packagesInGCS: os.Getenv("AGENT_PACKAGES_IN_GCS"), repoSuffix: os.Getenv("REPO_SUFFIX"), repoCodename: os.Getenv("REPO_CODENAME"), artifactRegistryProject: os.Getenv("ARTIFACT_REGISTRY_PROJECT"), artifactRegistryRegion: os.Getenv("ARTIFACT_REGISTRY_REGION"), } } func toEnvironment(environment map[string]string, format string, separator string) string { var assignments []string for k, v := range environment { if v != "" { assignments = append(assignments, fmt.Sprintf(format, k, v)) } } return strings.Join(assignments, separator) } func linuxEnvironment(environment map[string]string) string { return toEnvironment(environment, "%s='%s'", " ") } func windowsEnvironment(environment map[string]string) string { return toEnvironment(environment, `$env:%s='%s'`, "\n") } // InstallOpsAgent installs the Ops Agent on the given VM. Consults the given // PackageLocation to determine where to install the agent from. For details // about PackageLocation, see the documentation for the PackageLocation struct. func InstallOpsAgent(ctx context.Context, logger *log.Logger, vm *gce.VM, location PackageLocation) error { if location.packagesInGCS != "" && location.repoSuffix != "" { return fmt.Errorf("invalid PackageLocation: cannot provide both location.packagesInGCS and location.repoSuffix. location=%#v", location) } if location.artifactRegistryRegion != "" && location.repoSuffix == "" { return fmt.Errorf("invalid PackageLocation: location.artifactRegistryRegion was nonempty yet location.repoSuffix was empty. location=%#v", location) } if gce.IsOpsAgentUAPPlugin() { return InstallOpsAgentUAPPlugin(ctx, logger, vm, location) } if location.packagesInGCS != "" { return InstallPackageFromGCS(ctx, logger, vm, location.packagesInGCS) } preservedEnvironment := map[string]string{ "REPO_SUFFIX": location.repoSuffix, "REPO_CODENAME": location.repoCodename, "ARTIFACT_REGISTRY_PROJECT": location.artifactRegistryProject, "ARTIFACT_REGISTRY_REGION": location.artifactRegistryRegion, } if gce.IsWindows(vm.ImageSpec) { // Note that these commands match the ones from our public install docs // (https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation) // and keeping them in sync is encouraged so that we are testing the // same commands that our customers are running. if _, err := gce.RunRemotely(ctx, logger, vm, `(New-Object Net.WebClient).DownloadFile("https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.ps1", "${env:UserProfile}\add-google-cloud-ops-agent-repo.ps1")`); err != nil { return fmt.Errorf("InstallOpsAgent() failed to download repo script: %w", err) } runScript := func() error { scriptCmd := fmt.Sprintf(`%s & "${env:UserProfile}\add-google-cloud-ops-agent-repo.ps1" -AlsoInstall`, windowsEnvironment(preservedEnvironment)) _, err := gce.RunRemotely(ctx, logger, vm, scriptCmd) return err } // TODO: b/202526819 - Remove retries once the script does retries internally. if err := RunInstallFuncWithRetry(ctx, logger, vm, runScript); err != nil { return fmt.Errorf("InstallOpsAgent() failed to run repo script: %w", err) } return nil } if _, err := gce.RunRemotely(ctx, logger, vm, "curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh"); err != nil { return fmt.Errorf("InstallOpsAgent() failed to download repo script: %w", err) } runInstallScript := func() error { envVars := linuxEnvironment(preservedEnvironment) _, err := gce.RunRemotely(ctx, logger, vm, "sudo "+envVars+" bash -x add-google-cloud-ops-agent-repo.sh --also-install") return err } if err := RunInstallFuncWithRetry(ctx, logger, vm, runInstallScript); err != nil { return fmt.Errorf("InstallOpsAgent() error running repo script: %w", err) } return nil } // SetupOpsAgent installs the Ops Agent and installs the given config. func SetupOpsAgent(ctx context.Context, logger *log.Logger, vm *gce.VM, config string) error { return SetupOpsAgentFrom(ctx, logger, vm, config, LocationFromEnvVars()) } // RestartOpsAgent restarts the Ops Agent and waits for it to become available. func RestartOpsAgent(ctx context.Context, logger *log.Logger, vm *gce.VM) error { if _, err := gce.RunRemotely(ctx, logger, vm, getRestartOpsAgentCmd(vm.ImageSpec)); err != nil { return fmt.Errorf("RestartOpsAgent() failed to restart ops agent: %v", err) } // Give agents time to shut down. Fluent-Bit's default shutdown grace period // is 5 seconds, so we should probably give it at least that long. time.Sleep(10 * time.Second) return nil } // StartOpsAgentPluginServer starts the Ops Agent Plugin gRPC server on the testing VM in the background. func StartOpsAgentPluginServer(ctx context.Context, logger *log.Logger, vm *gce.VM, port string) error { if gce.IsWindows(vm.ImageSpec) { startUAPWindowsPlugin, err := scriptsDir.ReadFile(path.Join("testdata", "start-uap-plugin-server.ps1")) if err != nil { return fmt.Errorf("StartOpsAgentPluginServer() failed to read start-uap-plugin-server.ps1: %v", err) } if _, err := gce.RunScriptRemotely(ctx, logger, vm, string(startUAPWindowsPlugin), nil, nil); err != nil { return fmt.Errorf("StartOpsAgentPluginServer() failed to start the ops agent plugin: %v", err) } return nil } if _, err := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf("sudo nohup ~/plugin --address=localhost:%s --errorlogfile=errorlog.txt --protocol=tcp 1>/dev/null 2>/dev/null &", port)); err != nil { return fmt.Errorf("StartOpsAgentPluginServer() failed to start the ops agent plugin: %v", err) } return nil } func StartCommandForImage(imageSpec string) string { if gce.IsOpsAgentUAPPlugin() { grpcurlExecutable := "grpcurl" if gce.IsWindows(imageSpec) { grpcurlExecutable = `C:\grpcurl.exe` } return fmt.Sprintf("%s -plaintext -d '{}' localhost:1234 plugin_comm.GuestAgentPlugin/Start", grpcurlExecutable) } if gce.IsWindows(imageSpec) { return "Start-Service google-cloud-ops-agent" } // Return a command that works for both < 2.0.0 and >= 2.0.0 agents. return "sudo service google-cloud-ops-agent start || sudo systemctl start google-cloud-ops-agent" } func StartOpsAgentViaUAPCommand(imageSpec string, config string) string { grpcurlExecutable := "grpcurl" if gce.IsWindows(imageSpec) { grpcurlExecutable = `C:\grpcurl.exe` } if len(config) > 0 { return fmt.Sprintf("%s -plaintext -d '{%s}' localhost:1234 plugin_comm.GuestAgentPlugin/Start", grpcurlExecutable, config) } return fmt.Sprintf("%s -plaintext -d '{}' localhost:1234 plugin_comm.GuestAgentPlugin/Start", grpcurlExecutable) } func StopCommandForImage(imageSpec string) string { if gce.IsOpsAgentUAPPlugin() { grpcurlExecutable := "grpcurl" if gce.IsWindows(imageSpec) { grpcurlExecutable = `C:\grpcurl.exe` } return fmt.Sprintf("%s -plaintext -d '{}' localhost:1234 plugin_comm.GuestAgentPlugin/Stop", grpcurlExecutable) } if gce.IsWindows(imageSpec) { return "Stop-Service google-cloud-ops-agent -Force" } // Return a command that works for both < 2.0.0 and >= 2.0.0 agents. return "sudo service google-cloud-ops-agent stop || sudo systemctl stop google-cloud-ops-agent" } func GetUAPPluginStatusForImage(imageSpec string) string { grpcurlExecutable := "grpcurl" if gce.IsWindows(imageSpec) { grpcurlExecutable = `C:\grpcurl.exe` } return fmt.Sprintf("%s -plaintext -d '{}' localhost:1234 plugin_comm.GuestAgentPlugin/GetStatus", grpcurlExecutable) } func StartOpsAgentPluginWithBackoff(ctx context.Context, logger *log.Logger, vm *gce.VM) error { tryStartOpsAgent := func() error { if _, err := gce.RunRemotely(ctx, logger, vm, StartCommandForImage(vm.ImageSpec)); err != nil { return fmt.Errorf("failed to start Ops Agent: %v", err) } time.Sleep(8 * time.Second) cmdOut, err := gce.RunRemotely(ctx, logger, vm, GetUAPPluginStatusForImage(vm.ImageSpec)) if err != nil { return fmt.Errorf("failed to retrieve Ops Agent status: %v", err) } if !strings.Contains(cmdOut.Stdout, "is running ok") { return fmt.Errorf("Ops Agent is not started successfully: stdout: %v, stderr: %v", cmdOut.Stdout, cmdOut.Stderr) } return nil } backoffPolicy := backoff.WithContext(backoff.WithMaxRetries(backoff.NewConstantBackOff(20*time.Second), 10), ctx) if err := backoff.Retry(tryStartOpsAgent, backoffPolicy); err != nil { return err } return nil } // SetupOpsAgentFrom is an overload of setupOpsAgent that allows the callsite to // decide which version of the agent gets installed. func SetupOpsAgentFrom(ctx context.Context, logger *log.Logger, vm *gce.VM, config string, location PackageLocation) error { if err := InstallOpsAgent(ctx, logger, vm, location); err != nil { return err } startupDelay := 20 * time.Second if len(config) > 0 { if gce.IsWindows(vm.ImageSpec) { // Sleep to avoid some flaky errors when restarting the agent because the // services have not fully started up yet. time.Sleep(startupDelay) } if err := gce.UploadContent(ctx, logger, vm, strings.NewReader(config), OpsAgentConfigPath(vm.ImageSpec)); err != nil { return fmt.Errorf("SetupOpsAgentFrom() failed to upload config file: %v", err) } } // UAP Plugin needs to be started regardless, since it's not automatically started upon install. if gce.IsOpsAgentUAPPlugin() { return StartOpsAgentPluginWithBackoff(ctx, logger, vm) } // Regular Ops Agent only needs a restart if the config is not empty. if len(config) > 0 { return RestartOpsAgent(ctx, logger, vm) } // Give agents time to start up. time.Sleep(startupDelay) return nil } // RecommendedMachineType returns a reasonable setting for a VM's machine type // (https://cloud.google.com/compute/docs/machine-types). Windows instances // are configured to be larger because they need more CPUs to start up in a // reasonable amount of time. func RecommendedMachineType(imageSpec string) string { if gce.IsWindows(imageSpec) { return "e2-standard-4" } if gce.IsARM(imageSpec) { return "t2a-standard-2" } return "e2-standard-2" } // CommonSetup sets up the VM for testing. func CommonSetup(t *testing.T, imageSpec string) (context.Context, *logging.DirectoryLogger, *gce.VM) { return CommonSetupWithExtraCreateArguments(t, imageSpec, nil) } // CommonSetupWithExtraCreateArguments sets up the VM for testing with extra creation arguments for the `gcloud compute instances create` command. func CommonSetupWithExtraCreateArguments(t *testing.T, imageSpec string, extraCreateArguments []string) (context.Context, *logging.DirectoryLogger, *gce.VM) { return CommonSetupWithExtraCreateArgumentsAndMetadata(t, imageSpec, extraCreateArguments, nil) } // CommonSetupWithExtraCreateArgumentsAndMetadata sets up the VM for testing with extra creation arguments for the `gcloud compute instances create` command and additional metadata. func CommonSetupWithExtraCreateArgumentsAndMetadata(t *testing.T, imageSpec string, extraCreateArguments []string, additionalMetadata map[string]string) (context.Context, *logging.DirectoryLogger, *gce.VM) { t.Helper() ctx, cancel := context.WithTimeout(context.Background(), gce.SuggestedTimeout) t.Cleanup(cancel) gcloudConfigDir := t.TempDir() if err := gce.SetupGcloudConfigDir(ctx, gcloudConfigDir); err != nil { t.Fatalf("Unable to set up a gcloud config directory: %v", err) } ctx = gce.WithGcloudConfigDir(ctx, gcloudConfigDir) logger := gce.SetupLogger(t) logger.ToMainLog().Println("Calling SetupVM(). For details, see VM_initialization.txt.") options := gce.VMOptions{ ImageSpec: imageSpec, TimeToLive: "3h", MachineType: RecommendedMachineType(imageSpec), ExtraCreateArguments: extraCreateArguments, Metadata: additionalMetadata, } vm := gce.SetupVM(ctx, t, logger.ToFile("VM_initialization.txt"), options) logger.ToMainLog().Printf("VM is ready: %#v", vm) t.Cleanup(func() { RunOpsAgentDiagnostics(ctx, logger, vm) }) return ctx, logger, vm } // ManagedInstanceGroupVMSetup sets up a Managed Instance Group VM for testing with extra creation arguments for the `gcloud compute instances create` command and additional metadata. func ManagedInstanceGroupVMSetup(t *testing.T, imageSpec string, extraCreateArguments []string, additionalMetadata map[string]string) (context.Context, *logging.DirectoryLogger, *gce.ManagedInstanceGroupVM) { t.Helper() ctx, cancel := context.WithTimeout(context.Background(), gce.SuggestedTimeout) t.Cleanup(cancel) gcloudConfigDir := t.TempDir() if err := gce.SetupGcloudConfigDir(ctx, gcloudConfigDir); err != nil { t.Fatalf("Unable to set up a gcloud config directory: %v", err) } ctx = gce.WithGcloudConfigDir(ctx, gcloudConfigDir) logger := gce.SetupLogger(t) logger.ToMainLog().Println("Calling SetupManagedInstanceGroupVM(). For details, see VM_initialization.txt.") options := gce.VMOptions{ ImageSpec: imageSpec, TimeToLive: "3h", MachineType: RecommendedMachineType(imageSpec), ExtraCreateArguments: extraCreateArguments, Metadata: additionalMetadata, } migVM := gce.SetupManagedInstanceGroupVM(ctx, t, logger.ToFile("VM_initialization.txt"), options) logger.ToMainLog().Printf("ManagedInstanceGroupVM is ready: %#v", migVM.VM) t.Cleanup(func() { RunOpsAgentDiagnostics(ctx, logger, migVM.VM) }) return ctx, logger, migVM } func InstallOpsAgentUAPPlugin(ctx context.Context, logger *log.Logger, vm *gce.VM, location PackageLocation) error { // Used for manual testing or pre-submits if location.packagesInGCS != "" { return InstallOpsAgentUAPPluginFromGCS(ctx, logger, vm, location.packagesInGCS) } // Used for nightly builds artifactBucket := fmt.Sprintf("gs://%s-ops-agent-releases/%s", location.artifactRegistryProject, location.repoSuffix) return InstallOpsAgentUAPPluginFromGCS(ctx, logger, vm, artifactBucket) } // InstallOpsAgentUAPPluginFromGCS installs the Ops Agent plugin tarball from GCS onto the given Linux VM. // // gcsPath must point to a GCS Path to a .tar.gz file to install on the testing VMs. func InstallOpsAgentUAPPluginFromGCS(ctx context.Context, logger *log.Logger, vm *gce.VM, gcsPath string) error { if err := gce.InstallGsutilIfNeeded(ctx, logger, vm); err != nil { return err } if gce.IsWindows(vm.ImageSpec) { if _, err := gce.RunRemotely(ctx, logger, vm, "New-Item -ItemType directory -Path C:\\agentPlugin"); err != nil { return err } if _, err := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf(`gsutil cp %s/google-cloud-ops-agent-plugin*.tar.gz C:\\agentPlugin`, gcsPath)); err != nil { return fmt.Errorf("error copying down agent package from GCS: %v", err) } if _, err := gce.RunRemotely(ctx, logger, vm, "ls C:\\agentPlugin"); err != nil { return err } if _, err := gce.RunRemotely(ctx, logger, vm, `Get-ChildItem -Path "C:\agentPlugin" -Filter "google-cloud-ops-agent-plugin*.tar.gz" -File|Select-Object -First 1 -Expand FullName | ForEach-Object { if ($_){ & tar -xzf $_ -C "C:\"} }`); err != nil { return err } // Print the contents of the home dir into the logs. if _, err := gce.RunRemotely(ctx, logger, vm, "ls C:\\agentPlugin; ls C:\\"); err != nil { return err } } else { if _, err := gce.RunRemotely(ctx, logger, vm, "mkdir -p /tmp/agentPlugin"); err != nil { return err } if _, err := gce.RunRemotely(ctx, logger, vm, "sudo gsutil cp "+gcsPath+"/google-cloud-ops-agent-plugin*.tar.gz /tmp/agentPlugin"); err != nil { return fmt.Errorf("error copying down the agent uap plugin tarball from GCS: %v", err) } // Print the contents of /tmp/agentPlugin into the logs. if _, err := gce.RunRemotely(ctx, logger, vm, "ls -la /tmp/agentPlugin"); err != nil { return err } if _, err := gce.RunRemotely(ctx, logger, vm, "sudo find /tmp/agentPlugin -maxdepth 1 -name \"google-cloud-ops-agent-plugin*.tar.gz\" -print0 | xargs -0 -I {} sudo tar -xzf {} --no-overwrite-dir -C ~/ && ls -la"); err != nil { return err } // Print the contents of the home dir into the logs. if _, err := gce.RunRemotely(ctx, logger, vm, "ls -la ~/"); err != nil { return err } } if err := gce.InstallGrpcurlIfNeeded(ctx, logger, vm); err != nil { return err } return StartOpsAgentPluginServer(ctx, logger, vm, OpsAgentPluginServerPort) } // InstallPackageFromGCS installs the agent package from GCS onto the given Linux VM. // // gcsPath must point to a GCS Path that contains .deb/.rpm/.goo files to install on the testing VMs. // Packages with "dbgsym" in their name are skipped because customers don't // generally install those, so our tests shouldn't either. func InstallPackageFromGCS(ctx context.Context, logger *log.Logger, vm *gce.VM, gcsPath string) error { if gce.IsWindows(vm.ImageSpec) { return installWindowsPackageFromGCS(ctx, logger, vm, gcsPath) } if _, err := gce.RunRemotely(ctx, logger, vm, "mkdir -p /tmp/agentUpload /tmp/agentPlugin"); err != nil { return err } if err := gce.InstallGsutilIfNeeded(ctx, logger, vm); err != nil { return err } if _, err := gce.RunRemotely(ctx, logger, vm, "sudo gsutil cp -r "+gcsPath+"/* /tmp/agentUpload"); err != nil { return fmt.Errorf("error copying down agent package from GCS: %v", err) } // Print the contents of /tmp/agentUpload into the logs. if _, err := gce.RunRemotely(ctx, logger, vm, "ls /tmp/agentUpload"); err != nil { return err } if _, err := gce.RunRemotely(ctx, logger, vm, "rm /tmp/agentUpload/*dbgsym* || echo nothing to delete"); err != nil { return err } if _, err := gce.RunRemotely(ctx, logger, vm, "mv /tmp/agentUpload/*.tar.gz /tmp/agentPlugin || echo nothing to move"); err != nil { return err } if IsRPMBased(vm.ImageSpec) { if _, err := gce.RunRemotely(ctx, logger, vm, "sudo rpm --upgrade -v --force /tmp/agentUpload/*"); err != nil { return fmt.Errorf("error installing agent from .rpm file: %v", err) } return nil } // --allow-downgrades is marked as dangerous, but I don't see another way // to get the following sequence to work (from TestUpgradeOpsAgent): // 1. install stable package from Rapture // 2. install just-built package from GCS // Nor do I know why apt considers that sequence to be a downgrade. if _, err := gce.RunRemotely(ctx, logger, vm, "sudo apt-get install --allow-downgrades --yes --verbose-versions /tmp/agentUpload/*"); err != nil { return fmt.Errorf("error installing agent from .deb file: %v", err) } return nil } // Installs the agent package from GCS (see packagesInGCS) onto the given Windows VM. func installWindowsPackageFromGCS(ctx context.Context, logger *log.Logger, vm *gce.VM, gcsPath string) error { if _, err := gce.RunRemotely(ctx, logger, vm, "New-Item -ItemType directory -Path C:\\agentUpload"); err != nil { return err } if _, err := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf("gsutil cp -r %s/*.goo C:\\agentUpload", gcsPath)); err != nil { return fmt.Errorf("error copying down agent package from GCS: %v", err) } if _, err := gce.RunRemotely(ctx, logger, vm, "googet -noconfirm -verbose install -reinstall (Get-ChildItem C:\\agentUpload\\*.goo | Select-Object -Expand FullName)"); err != nil { return fmt.Errorf("error installing agent from .goo file: %v", err) } return nil } // OpsAgentConfigPath returns the platform-specific filesystem location where // the Ops Agent config is stored. func OpsAgentConfigPath(imageSpec string) string { if gce.IsWindows(imageSpec) { return `C:\Program Files\Google\Cloud Operations\Ops Agent\config\config.yaml` } return "/etc/google-cloud-ops-agent/config.yaml" } func GetOtelConfigPath(imageSpec string) string { if gce.IsOpsAgentUAPPlugin() { if gce.IsWindows(imageSpec) { return `C:\ProgramData\Google\Compute Engine\google-guest-agent\agent_state\plugins\ops-agent-plugin\generated_configs\otel\otel.yaml` } return "/var/lib/google-guest-agent/agent_state/plugins/ops-agent-plugin/run/google-cloud-ops-agent-opentelemetry-collector/otel.yaml" } if gce.IsWindows(imageSpec) { return `C:\ProgramData\Google\Cloud Operations\Ops Agent\generated_configs\otel\otel.yaml` } return "/var/run/google-cloud-ops-agent-opentelemetry-collector/otel.yaml" }