internal/pkg/agent/install/uninstall.go (405 lines of code) (raw):

// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
// or more contributor license agreements. Licensed under the Elastic License 2.0;
// you may not use this file except in compliance with the Elastic License 2.0.

package install

import (
	"context"
	"errors"
	"fmt"
	"io/fs"
	"net/http"
	"os"
	"path/filepath"
	"runtime"
	"strings"
	"time"

	"github.com/kardianos/service"
	"github.com/schollz/progressbar/v3"

	"github.com/elastic/elastic-agent-libs/logp"

	"github.com/elastic/elastic-agent/internal/pkg/agent/application/paths"
	"github.com/elastic/elastic-agent/internal/pkg/agent/application/secret"
	"github.com/elastic/elastic-agent/internal/pkg/agent/configuration"
	aerrors "github.com/elastic/elastic-agent/internal/pkg/agent/errors"
	"github.com/elastic/elastic-agent/internal/pkg/agent/transpiler"
	"github.com/elastic/elastic-agent/internal/pkg/agent/vars"
	"github.com/elastic/elastic-agent/internal/pkg/agent/vault"
	"github.com/elastic/elastic-agent/internal/pkg/capabilities"
	"github.com/elastic/elastic-agent/internal/pkg/config"
	"github.com/elastic/elastic-agent/internal/pkg/config/operations"
	"github.com/elastic/elastic-agent/internal/pkg/core/backoff"
	"github.com/elastic/elastic-agent/internal/pkg/fleetapi"
	fleetclient "github.com/elastic/elastic-agent/internal/pkg/fleetapi/client"
	"github.com/elastic/elastic-agent/pkg/component"
	comprt "github.com/elastic/elastic-agent/pkg/component/runtime"
	"github.com/elastic/elastic-agent/pkg/core/logger"
	"github.com/elastic/elastic-agent/pkg/features"
	"github.com/elastic/elastic-agent/pkg/utils"
)

// fleetAudit variables control retry attempts for contacting fleet:
// fleetAuditAttempts bounds the number of requests made by
// notifyFleetAuditUninstall, while fleetAuditWaitInit and fleetAuditWaitMax
// are the initial and maximum durations handed to the backoff created in
// backoffWithContext.
// NOTE(review): these are variables rather than constants, presumably so that
// tests can shorten them - confirm.
var (
	fleetAuditAttempts = 5
	fleetAuditWaitInit = time.Second
	fleetAuditWaitMax  = time.Second * 10
)

// agentInfo is a custom type that implements the fleetapi.AgentInfo interface.
// The underlying string is the agent ID.
type agentInfo string

// AgentID returns the agent ID as a plain string. The method has a pointer
// receiver, so a *agentInfo (not an agentInfo value) must be passed where the
// interface is expected.
func (a *agentInfo) AgentID() string {
	return string(*a)
}

// Uninstall uninstalls persistently Elastic Agent on the system.
func Uninstall(ctx context.Context, cfgFile, topPath, uninstallToken string, log *logp.Logger, pt *progressbar.ProgressBar, skipFleetAudit bool) error { cwd, err := os.Getwd() if err != nil { return fmt.Errorf("unable to get current working directory") } if runtime.GOOS == "windows" && paths.HasPrefix(cwd, topPath) { return fmt.Errorf("uninstall must be run from outside the installed path '%s'", topPath) } // check if the agent was installed using --unprivileged by checking the file vault for the agent secret (needed on darwin to correctly load the vault) unprivileged, err := checkForUnprivilegedVault(ctx) if err != nil { return fmt.Errorf("error checking for unprivileged vault: %w", err) } // will only notify fleet of the uninstall command if it can gather config and agentinfo, and is not a stand-alone install localFleet := false notifyFleet := false var agentID agentInfo var cfg *configuration.Configuration func() { // check if we need to notify in a func to allow us to return early if a (non-fatal) error is encountered. 
// read local config c, err := operations.LoadFullAgentConfig(ctx, log, cfgFile, false, unprivileged) if err != nil { pt.Describe("notify Fleet failed: unable to read config") return } cfg, err = configuration.NewFromConfig(c) if err != nil { pt.Describe("notify Fleet failed: error transforming config") return } if cfg != nil && !configuration.IsStandalone(cfg.Fleet) { agentID = agentInfo(cfg.Settings.ID) notifyFleet = true if cfg.Fleet != nil && cfg.Fleet.Server != nil { localFleet = true } } }() // Notify fleet-server while it is still running if it's running locally if notifyFleet && localFleet { // host is set in the agent/cmd/enroll_cmd.go by createFleetServerBootstrapConfig // hosts is set in agent/application/actions/handlers/handler_action_policy_change.go by updateFleetConfig // agents running the fleet-server integration should communicate over the internal API (defaults to localhost:8221) // This may need to be fixed with https://github.com/elastic/elastic-agent/issues/4771 cfg.Fleet.Client.Hosts = []string{cfg.Fleet.Client.Host} notifyFleetAuditUninstall(ctx, log, pt, cfg, &agentID) //nolint:errcheck // ignore the error as we can't act on it } // ensure service is stopped status, err := EnsureStoppedService(topPath, pt) if err != nil { // context for the error already provided in the EnsureStoppedService function return err } // kill any running watcher if err := killWatcher(pt); err != nil { return fmt.Errorf("failed trying to kill any running watcher: %w", err) } // Uninstall components first if err := uninstallComponents(ctx, cfgFile, uninstallToken, log, pt, unprivileged); err != nil { // If service status was running it was stopped to uninstall the components. 
// If the components uninstall failed start the service again if status == service.StatusRunning { if startErr := StartService(topPath); startErr != nil { // context for the error already provided in the StartService function return err } } return fmt.Errorf("error uninstalling components: %w", err) } // Uninstall service only after components were uninstalled successfully pt.Describe("Removing service") err = UninstallService(topPath) // Is there a reason why we don't want to hard-fail on this? if err != nil { pt.Describe(fmt.Sprintf("Failed to Uninstall existing service: %s", err)) } else { pt.Describe("Successfully uninstalled service") } // remove, if present on platform if paths.ShellWrapperPath() != "" { err = os.Remove(paths.ShellWrapperPath()) if !os.IsNotExist(err) && err != nil { return aerrors.New( err, fmt.Sprintf("failed to remove shell wrapper (%s)", paths.ShellWrapperPath()), aerrors.M("destination", paths.ShellWrapperPath())) } } // remove existing directory pt.Describe("Removing install directory") err = RemovePath(topPath) if err != nil { pt.Describe("Failed to remove install directory") return aerrors.New( err, fmt.Sprintf("failed to remove installation directory (%s)", paths.Top()), aerrors.M("directory", paths.Top())) } pt.Describe("Removed install directory") notifyFleetIfNeeded(ctx, log, pt, cfg, agentID, notifyFleet, localFleet, skipFleetAudit, notifyFleetAuditUninstall) return nil } // Injecting notifyFleetAuditUninstall for easier unit testing func notifyFleetIfNeeded(ctx context.Context, log *logp.Logger, pt *progressbar.ProgressBar, cfg *configuration.Configuration, agentID agentInfo, notifyFleet, localFleet, skipFleetAudit bool, notifyFleetAuditUninstall NotifyFleetAuditUninstall) { if notifyFleet && !localFleet && !skipFleetAudit { notifyFleetAuditUninstall(ctx, log, pt, cfg, &agentID) //nolint:errcheck // ignore the error as we can't act on it) } } type NotifyFleetAuditUninstall func(ctx context.Context, log *logp.Logger, pt 
*progressbar.ProgressBar, cfg *configuration.Configuration, ai fleetapi.AgentInfo) error // notifyFleetAuditUninstall will attempt to notify fleet-server of the agent's uninstall. // // There are retries for the attempt after a 10s wait, but it is a best-effort approach. func notifyFleetAuditUninstall(ctx context.Context, log *logp.Logger, pt *progressbar.ProgressBar, cfg *configuration.Configuration, ai fleetapi.AgentInfo) error { ctx, cancel := context.WithCancel(ctx) defer cancel() pt.Describe("Attempting to notify Fleet of uninstall") client, err := fleetclient.NewAuthWithConfig(log, cfg.Fleet.AccessAPIKey, cfg.Fleet.Client) if err != nil { pt.Describe(fmt.Sprintf("notify Fleet: unable to create fleetapi client: %v", err)) return err } cmd := fleetapi.NewAuditUnenrollCmd(ai, client) req := &fleetapi.AuditUnenrollRequest{ Reason: fleetapi.ReasonUninstall, Timestamp: time.Now().UTC(), } jitterBackoff := backoffWithContext(ctx) for i := 0; i < fleetAuditAttempts; i++ { resp, err := cmd.Execute(ctx, req) if err != nil { var reqErr *fleetapi.ReqError // Do not retry if it was a context error, or an error with the request. 
if errors.Is(err, context.Canceled) { return ctx.Err() } else if errors.Is(err, fleetclient.ErrInvalidAPIKey) { pt.Describe("API key is invalid (normal if already unenrolled), notification dropped.") return nil } else if errors.As(err, &reqErr) { pt.Describe(fmt.Sprintf("notify Fleet: encountered unretryable error: %v", err)) return err } pt.Describe(fmt.Sprintf("notify Fleet: network error: %v (retry in %v)", err, jitterBackoff.NextWait())) jitterBackoff.Wait() continue } resp.Body.Close() switch resp.StatusCode { case http.StatusOK: pt.Describe("Successfully notified Fleet about uninstall") return nil case http.StatusBadRequest, http.StatusUnauthorized, http.StatusConflict: // BadRequest are not retried because the request body is incorrect and will not be accepted // Unauthorized are not retried because the API key has been invalidated; unauthorized is listed here but will be returned as a fleetapi.ReqError // Conflict will not retry because in this case Endpoint has indicated that it is orphaned and we do not want to overwrite that annotation pt.Describe(fmt.Sprintf("notify Fleet: failed with status code %d (no retries)", resp.StatusCode)) return fmt.Errorf("unretryable return status: %d", resp.StatusCode) default: pt.Describe(fmt.Sprintf("notify Fleet: failed with status code %d (retry in %v)", resp.StatusCode, jitterBackoff.NextWait())) jitterBackoff.Wait() } } pt.Describe("notify Fleet: failed") return fmt.Errorf("notify Fleet: failed") } // EnsureStoppedService ensures that the installed service is stopped. 
func EnsureStoppedService(topPath string, pt *progressbar.ProgressBar) (service.Status, error) { status, _ := StatusService(topPath) if status == service.StatusRunning { pt.Describe("Stopping service") err := StopService(topPath, 30*time.Second, 250*time.Millisecond) if err != nil { pt.Describe("Failed to issue stop service") // context for the error already provided in the StopService function return status, err } pt.Describe("Successfully stopped service") } else { pt.Describe("Service already stopped") } return status, nil } func checkForUnprivilegedVault(ctx context.Context, opts ...vault.OptionFunc) (bool, error) { // check if we have a file vault to detect if we have to use it for reading config opts = append(opts, vault.WithReadonly(true)) vaultOpts, err := vault.ApplyOptions(opts...) if err != nil { return false, err } fileVault, fileVaultErr := vault.NewFileVault(ctx, vaultOpts) if fileVaultErr == nil { ok, keyErr := fileVault.Exists(ctx, secret.AgentSecretKey) if keyErr == nil && ok { // we have a valid file vault and it contains the key, set unprivileged return true, nil } } else if !errors.Is(fileVaultErr, fs.ErrNotExist) { // we had a different error than NotExist return false, fmt.Errorf("error checking for file vault existence: %w", fileVaultErr) } return false, nil } // RemovePath helps with removal path where there is a probability // of running into an executable running that might prevent removal // on Windows. // // On Windows it is possible that a removal can spuriously error due // to an ERROR_SHARING_VIOLATION. RemovePath will retry up to 2 // seconds if it keeps getting that error. 
func RemovePath(path string) error { const arbitraryTimeout = 60 * time.Second start := time.Now() var lastErr error for time.Since(start) <= arbitraryTimeout { lastErr = os.RemoveAll(path) if lastErr == nil || !isRetryableError(lastErr) { return lastErr } if isBlockingOnExe(lastErr) { // try to remove the blocking exe and try again to clean up the path _ = removeBlockingExe(lastErr) } time.Sleep(500 * time.Millisecond) } return fmt.Errorf("timed out while removing %q. Last error: %w", path, lastErr) } func RemoveBut(path string, bestEffort bool, exceptions ...string) error { if len(exceptions) == 0 { return RemovePath(path) } files, err := os.ReadDir(path) if err != nil { return fmt.Errorf("error reading directory %s: %w", path, err) } for _, f := range files { if containsString(f.Name(), exceptions, runtime.GOOS != component.Windows) { continue } err = RemovePath(filepath.Join(path, f.Name())) if !bestEffort && err != nil { return fmt.Errorf("error removing path %s: %w", f.Name(), err) } } return err } func containsString(str string, a []string, caseSensitive bool) bool { if !caseSensitive { str = strings.ToLower(str) } for _, v := range a { if !caseSensitive { v = strings.ToLower(v) } if str == v { return true } } return false } func uninstallComponents(ctx context.Context, cfgFile string, uninstallToken string, log *logp.Logger, pt *progressbar.ProgressBar, unprivileged bool) error { platform, err := component.LoadPlatformDetail() if err != nil { return fmt.Errorf("failed to gather system information: %w", err) } specs, err := component.LoadRuntimeSpecs(paths.Components(), platform) if err != nil { return fmt.Errorf("failed to detect inputs and outputs: %w", err) } cfg, err := operations.LoadFullAgentConfig(ctx, log, cfgFile, false, unprivileged) if err != nil { return fmt.Errorf("error loading agent config: %w", err) } cfg, err = applyDynamics(ctx, log, cfg) if err != nil { return fmt.Errorf("error applying dynamic inputs: %w", err) } comps, err := 
serviceComponentsFromConfig(specs, cfg) if err != nil { return fmt.Errorf("error creating service components: %w", err) } // nothing to remove if len(comps) == 0 { return nil } // Need to read the features from config on uninstall, in order to set the tamper protection feature flag correctly if err = features.Apply(cfg); err != nil { return fmt.Errorf("could not parse and apply feature flags config: %w", err) } // check caps so we don't try uninstalling things that were already // prevented from installing caps, err := capabilities.LoadFile(paths.AgentCapabilitiesPath(), log) if err != nil { return fmt.Errorf("error checking capabilities: %w", err) } // remove each service component for _, comp := range comps { if !caps.AllowInput(comp.InputType) || !caps.AllowOutput(comp.OutputType) { // This component is not active continue } if err = uninstallServiceComponent(ctx, log, comp, uninstallToken, pt); err != nil { os.Stderr.WriteString(fmt.Sprintf("failed to uninstall component %q: %s\n", comp.ID, err)) // The decision was made to change the behaviour and leave the Agent installed if Endpoint uninstall fails // https://github.com/elastic/elastic-agent/pull/2708#issuecomment-1574251911 // Thus returning error here. return fmt.Errorf("error uninstalling component: %w", err) } } return nil } func uninstallServiceComponent(ctx context.Context, log *logp.Logger, comp component.Component, uninstallToken string, pt *progressbar.ProgressBar) error { // Do not use infinite retries when uninstalling from the command line. If the uninstall needs to be // retried the entire uninstall command can be retried. Retries may complete asynchronously with the // execution of the uninstall command, leading to bugs like https://github.com/elastic/elastic-agent/issues/3060. 
pt.Describe(fmt.Sprintf("Uninstalling service component %s", comp.InputType)) err := comprt.UninstallService(ctx, log, comp, uninstallToken) if err != nil { pt.Describe("Failed to uninstall service") return fmt.Errorf("error uninstalling service: %w", err) } pt.Describe("Uninstalled service") return nil } func serviceComponentsFromConfig(specs component.RuntimeSpecs, cfg *config.Config) ([]component.Component, error) { mm, err := cfg.ToMapStr() if err != nil { return nil, aerrors.New("failed to create a map from config", err) } allComps, err := specs.ToComponents(mm, nil, logp.InfoLevel, nil, map[string]uint64{}) if err != nil { return nil, fmt.Errorf("failed to render components: %w", err) } var serviceComps []component.Component for _, comp := range allComps { if comp.Err == nil && comp.InputSpec != nil && comp.InputSpec.Spec.Service != nil { // non-error and service based component serviceComps = append(serviceComps, comp) } } return serviceComps, nil } func applyDynamics(ctx context.Context, log *logger.Logger, cfg *config.Config) (*config.Config, error) { cfgMap, err := cfg.ToMapStr() if err != nil { return nil, err } ast, err := transpiler.NewAST(cfgMap) if err != nil { return nil, err } // apply dynamic inputs inputs, ok := transpiler.Lookup(ast, "inputs") if ok { varsArray, err := vars.WaitForVariables(ctx, log, cfg, 0) if err != nil { return nil, err } renderedInputs, err := transpiler.RenderInputs(inputs, varsArray) if err != nil { return nil, err } err = transpiler.Insert(ast, renderedInputs, "inputs") if err != nil { return nil, aerrors.New("inserting rendered inputs failed", err) } } finalConfig, err := ast.Map() if err != nil { return nil, err } return config.NewConfigFrom(finalConfig) } // killWatcher finds and kills any running Elastic Agent watcher. 
func killWatcher(pt *progressbar.ProgressBar) error { for { // finding and killing watchers is performed in a loop until no // more watchers are existing, this ensures that during uninstall // that no matter what the watchers are dead before going any further pids, err := utils.GetWatcherPIDs() if err != nil { pt.Describe("Failed to get watcher PID") return fmt.Errorf("error fetching watcher PIDs: %w", err) } if len(pids) == 0 { // step was never started so no watcher was found on first loop pt.Describe("Stopping upgrade watcher; none found") return nil } var pidsStr []string for _, pid := range pids { pidsStr = append(pidsStr, fmt.Sprintf("%d", pid)) } pt.Describe(fmt.Sprintf("Stopping upgrade watcher (%s)", strings.Join(pidsStr, ", "))) var errs error for _, pid := range pids { proc, err := os.FindProcess(pid) if err != nil { errs = errors.Join(errs, fmt.Errorf("failed to load watcher process with pid %d: %w", pid, err)) continue } err = killNoneChildProcess(proc) if err != nil && !errors.Is(err, os.ErrProcessDone) { errs = errors.Join(errs, fmt.Errorf("failed to kill watcher process with pid %d: %w", pid, err)) continue } } if errs != nil { pt.Describe("Failed to find and stop watcher processes") return errs } // wait 1 second before performing the loop again <-time.After(1 * time.Second) } } func backoffWithContext(ctx context.Context) backoff.Backoff { ch := make(chan struct{}) bo := backoff.NewEqualJitterBackoff(ch, fleetAuditWaitInit, fleetAuditWaitMax) go func() { <-ctx.Done() close(ch) }() return bo }