rootcmd.go (554 lines of code) (raw):

package main import ( "context" "fmt" "os" "runtime" "runtime/debug" "strings" "time" "github.com/aliyun/aliyun_assist_client/thirdparty/aliyun-cli/cli" "github.com/aliyun/aliyun_assist_client/thirdparty/aliyun-cli/i18n" logrusr "github.com/aliyun/aliyun_assist_client/thirdparty/bombsimon/logrusr/v3" "github.com/aliyun/aliyun_assist_client/thirdparty/service" "github.com/aliyun/aliyun_assist_client/thirdparty/single" "github.com/aliyun/aliyun_assist_client/thirdparty/sirupsen/logrus" "github.com/kirinlabs/HttpRequest" "github.com/tidwall/gjson" "k8s.io/klog/v2" "github.com/aliyun/aliyun_assist_client/agent/channel" "github.com/aliyun/aliyun_assist_client/agent/checkagentpanic" "github.com/aliyun/aliyun_assist_client/agent/checkkdump" "github.com/aliyun/aliyun_assist_client/agent/checkospanic" "github.com/aliyun/aliyun_assist_client/agent/checkvirt" "github.com/aliyun/aliyun_assist_client/agent/clientreport" "github.com/aliyun/aliyun_assist_client/agent/commandermanager" "github.com/aliyun/aliyun_assist_client/agent/cryptdata" "github.com/aliyun/aliyun_assist_client/agent/flagging" "github.com/aliyun/aliyun_assist_client/agent/heartbeat" "github.com/aliyun/aliyun_assist_client/agent/hybrid" "github.com/aliyun/aliyun_assist_client/agent/hybrid/instance" "github.com/aliyun/aliyun_assist_client/agent/install" "github.com/aliyun/aliyun_assist_client/agent/log" "github.com/aliyun/aliyun_assist_client/agent/metrics" "github.com/aliyun/aliyun_assist_client/agent/perfmon" "github.com/aliyun/aliyun_assist_client/agent/pluginmanager" "github.com/aliyun/aliyun_assist_client/agent/statemanager" "github.com/aliyun/aliyun_assist_client/agent/taskengine" "github.com/aliyun/aliyun_assist_client/agent/taskengine/timermanager" "github.com/aliyun/aliyun_assist_client/agent/update" "github.com/aliyun/aliyun_assist_client/agent/util" "github.com/aliyun/aliyun_assist_client/agent/util/daemon" "github.com/aliyun/aliyun_assist_client/agent/util/osutil" "github.com/aliyun/aliyun_assist_client/agent/util/powerutil" "github.com/aliyun/aliyun_assist_client/agent/util/wrapgo" "github.com/aliyun/aliyun_assist_client/agent/version" "github.com/aliyun/aliyun_assist_client/common/pathutil" commander_server "github.com/aliyun/aliyun_assist_client/interprocess/commander/server" configure_server "github.com/aliyun/aliyun_assist_client/interprocess/configure/server" cryptdata_server "github.com/aliyun/aliyun_assist_client/interprocess/cryptdata/server" "github.com/aliyun/aliyun_assist_client/interprocess/messagebus/buses" messagebus_server "github.com/aliyun/aliyun_assist_client/interprocess/messagebus/server" ) type Options struct { GetHelp bool GetVersion bool GetGitHash bool Install bool Remove bool Start bool Stop bool Register bool DeRegister bool Region string Tags []string ActivationCode string ActivationId string NetWorkMode string InstanceName string RunAsCommon bool RunAsDaemon bool LogPath string IsVerbose bool } type program struct{} const ( HelpFlagName = "help" VersionFlagName = "version" GithashFlagName = "githash" InstallFlagName = "install" RemoveFlagName = "remove" StartFlagName = "start" StopFlagName = "stop" VerboseFlagName = "verbose" RegisterFlagName = "register" DeRegisterFlagName = "deregister" RegionFlagName = "RegionId" TagFlagName = "tag" ActivationCodeFlagName = "ActivationCode" ActivationIdFlagName = "ActivationId" NetworkModeFlagName = "NetworkMode" InstanceNameFlagName = "InstanceName" LogPathFlagName = "LogPath" RunAsCommonFlagName = "common" RunAsDaemonFlagName = "daemon" ) var ( G_Running bool = true G_StopEvent chan struct{} = nil SingleAppLock *single.Single Started bool persistentFlags = []cli.Flag{ { Name: HelpFlagName, Shorthand: 'h', Short: i18n.T(`print help`, `打印此帮助`), AssignedMode: cli.AssignedNone, Persistent: true, Category: "caller", }, { Name: LogPathFlagName, Shorthand: 'L', Short: i18n.T(`log path`, `指定日志保存目录`), AssignedMode: cli.AssignedOnce, Persistent: true, Category: "caller", }, } rootFlags = []cli.Flag{ { Name: VersionFlagName, Shorthand: 'v', Short: i18n.T(`print version`, `打印版本号`), AssignedMode: cli.AssignedNone, Category: "caller", }, { Name: GithashFlagName, Short: i18n.T(`print git hash`, `打印Git commit哈希值`), AssignedMode: cli.AssignedNone, Category: "caller", }, { Name: InstallFlagName, Short: i18n.T(`install assist`, `安装云助手agent为系统服务`), AssignedMode: cli.AssignedNone, Category: "caller", }, { Name: RemoveFlagName, Short: i18n.T(`remove assist`, `删除已安装的云助手agent系统服务`), AssignedMode: cli.AssignedNone, Category: "caller", }, { Name: StartFlagName, Short: i18n.T(`start assist`, `启动云助手agent系统服务`), AssignedMode: cli.AssignedNone, Category: "caller", }, { Name: StopFlagName, Short: i18n.T(`stop assist`, `停止云助手agent系统服务`), AssignedMode: cli.AssignedNone, Category: "caller", }, { Name: VerboseFlagName, Shorthand: 'V', Short: i18n.T(`enable verbose`, `启用云助手agent的详细运行过程输出`), AssignedMode: cli.AssignedNone, Category: "caller", }, { Name: RegisterFlagName, Shorthand: 'r', Short: i18n.T(`register as aliyun managed instance`, `注册为云助手托管实例`), AssignedMode: cli.AssignedNone, Category: "caller", }, { Name: DeRegisterFlagName, Shorthand: 'u', Short: i18n.T(`unregister as aliyun managed instance`, `取消注册为云助手托管实例`), AssignedMode: cli.AssignedNone, Category: "caller", }, { Name: RegionFlagName, Shorthand: 'R', Short: i18n.T(`used in register mode`, `(该参数仅限在注册为云助手托管实例时使用)`), AssignedMode: cli.AssignedOnce, Category: "caller", }, { Name: TagFlagName, Shorthand: 'T', Short: i18n.T(`used in register mode`, `(该参数仅限在注册为云助手托管实例时使用)`), AssignedMode: cli.AssignedRepeatable, Category: "caller", }, { Name: ActivationCodeFlagName, Shorthand: 'C', Short: i18n.T(`used in register mode`, `(该参数仅限在注册为云助手托管实例时使用)`), AssignedMode: cli.AssignedOnce, Category: "caller", }, { Name: ActivationIdFlagName, Shorthand: 'I', Short: i18n.T(`used in register mode`, `(该参数仅限在注册为云助手托管实例时使用)`), AssignedMode: cli.AssignedOnce, Category: "caller", }, { Name: NetworkModeFlagName, Shorthand: 'm', Short: i18n.T(`used in register mode`, `(该参数仅限在注册为云助手托管实例时使用)`), AssignedMode: cli.AssignedOnce, Category: "caller", }, { Name: InstanceNameFlagName, Shorthand: 'N', Short: i18n.T(`used in register mode`, `(该参数仅限在注册为云助手托管实例时使用)`), AssignedMode: cli.AssignedOnce, Category: "caller", }, { Name: RunAsCommonFlagName, Shorthand: 'c', Short: i18n.T(`run as common`, `以 common 模式运行`), AssignedMode: cli.AssignedNone, Category: "caller", }, { Name: RunAsDaemonFlagName, Shorthand: 'd', Short: i18n.T(`start as daemon`, `切换到后台运行`), AssignedMode: cli.AssignedNone, Category: "caller", }, } rootCmd = cli.Command{ Name: "aliyun-service", Short: i18n.T(`Aliyun Assist Copyright (c) 2017-2023 Alibaba Group Holding Limited`, `Aliyun Assist Copyright (c) 2017-2023 Alibaba Group Holding Limited`), Usage: "aliyun-service [subcommand] [flags]", Sample: "", EnableUnknownFlag: false, Run: runRootCommand, } ) func init() { for i := range persistentFlags { rootCmd.Flags().Add(&persistentFlags[i]) } for j := range rootFlags { rootCmd.Flags().Add(&rootFlags[j]) } } func (p *program) Start(s service.Service) error { go p.run() return nil } func (p *program) run() { log.GetLogger().Infof("Starting...... version: %s githash: %s", version.AssistVersion, version.GitCommitHash) SingleAppLock = single.New("AliyunAssistClientSingleLock") if err := SingleAppLock.CheckLock(); err != nil && err == single.ErrAlreadyRunning { log.GetLogger().Fatal("another instance of the app is already running, exiting") } checkagentpanic.RedirectStdouterr() G_Running = true G_StopEvent = make(chan struct{}) initConfiguration(log.GetLogger().WithField("phase", "InitConfig")) if err := timermanager.InitTimerManager(); err != nil { log.GetLogger().Fatalln("Failed to initialize timer manager: " + err.Error()) return } channel.TryStartGshellChannel() if runtime.GOOS == "windows" { pathutil.SetCurrentEnvPath() } // Logging current working directory information if currentWorkingDirectory, err := os.Getwd(); err == nil { log.GetLogger().Infof("Current working directory is: %s", currentWorkingDirectory) } else { log.GetLogger().WithError(err).Errorln("Failed to obtain current working directory") } sleep_internals_seconds := 3 for { host := util.GetServerHost() if host != "" { log.GetLogger().Println("GET_HOST_OK ", host) break } else { log.GetLogger().Println("GET_HOST_ERROR") } time.Sleep(time.Duration(sleep_internals_seconds) * time.Second) sleep_internals_seconds = sleep_internals_seconds * 2 if sleep_internals_seconds > 180 { sleep_internals_seconds = 180 } } // Use clientreport.LogAndReportPanic as default panic handler to report panic wrapgo.SetDefaultPanicHandler(clientreport.LogAndReportPanic) // Try to handle panic from code below defer func() { if panicPayload := recover(); panicPayload != nil { stacktrace := debug.Stack() wrapgo.CallDefaultPanicHandler(panicPayload, stacktrace) } }() // Check last panic and report it wrapgo.CallWithPanicHandler(checkagentpanic.CheckAgentPanic, clientreport.LogAndReportIgnorePanic) if instance.IsHybrid() { // Check hybrid instance's fingerprint file hybrid.CheckFingerprint() util.SetHTTPPostErrHandler(func(httpResp *HttpRequest.Response, httpErr error) { if httpResp != nil { content, _ := httpResp.Content() respJson := gjson.Parse(content) errMsg := respJson.Get("errMsg") if errMsg.Exists() && errMsg.String() == "instance_deregistered" { log.GetLogger().Info("Clean up hybrid instance info and stop agent process self, because of errMsg: ", errMsg.String()) // Service process will be stopped after hybrid.CleanUpRegisterDataAndExit() hybrid.CleanUpRegisterDataAndExit() } } }) } // Check in main goroutine and update as soon as possible, which use stricter // timeout limitation. NOTE: The preparation phase timeout parameter should // be considered as the whole timeout toleration minus minimum sleeping time // for safe updating (5s) minus normal execution time of updating script // (usually less than 5s), e.g., 50s - 5s - 5s = 40s. if err := update.SafeBootstrapUpdate(time.Duration(40)*time.Second, time.Duration(30)*time.Second); err != nil { log.GetLogger().Errorln("Failed to check update when starting: " + err.Error()) // Failed to update at starting phase would not terminate agent // return } if err := update.InitCheckUpdateTimer(); err != nil { log.GetLogger().Fatalln("Failed to initialize update checker: " + err.Error()) metrics.GetUpdateFailedEvent( "errormsg", fmt.Sprintf("InitCheckUpdateTimer error: %s", err.Error()), ).ReportEvent() return } if disabled, err := flagging.DetectNormalizingCRLFDisabled(); disabled { log.GetLogger().WithError(err).Warning("CRLF-normalization has been disabled due to configuration") } if disabled, err := flagging.DetectTaskOutputRingbufferDisabled(); disabled { log.GetLogger().WithError(err).Warning("TaskOutput-Ringbuffer has been disabled due to configuration") } channel.StartChannelMgr() // Register callback functions that will be called when the network recover heartbeat.RegisterActionWhenNetRecover(map[string]func(){ "SelectAvailableChannel": channel.OnNetworkRecover, }) if err := heartbeat.InitHeartbeatTimer(); err != nil { log.GetLogger().Fatalln("Failed to initialize heartbeat: " + err.Error()) return } // Start ipc server and init cryptdata package before fetching tasks, // because they may be relied on by tasks. cryptdata.Init() // Init commander manager commandermanager.InitCommanderManager("") // Initialize and serve inter-process functionalities in parallel wrapgo.GoWithDefaultPanicHandler(func() { messagebus_server.ListenAndServe(log.GetLogger(), buses.GetCentralEndpoint(true), nil, []messagebus_server.RegisterFunc{ cryptdata_server.RegisterAssistAgentServer, commander_server.RegisterAssistAgentServer, configure_server.RegisterAssistAgentServer, }, ) }) // TODO: First heart-beat may fail and be failed to indicate agent is ready. // Retrying should be tried here. heartbeat.PingwithRetries(3) // Finally, fetching tasks could be allowed and agent starts to run normally. taskengine.EnableFetchingTask() log.GetLogger().Infoln("Started successfully") // And also log to stdout, which would be written to systemd-journal as well // as console via systemd fmt.Println("Started successfully") Started = true // Periodic tasks are retrieved only once at startup. // The interval between startup fetch task and the first heart-beat should // be minimized as much as possible. wrapgo.CallWithDefaultPanicHandler(func() { isColdstart, err := flagging.IsColdstart() if err != nil { log.GetLogger().WithError(err).Errorln("Error encountered when detecting cold-start flag") } else { startType := "not cold start" if isColdstart { startType = "cold start" } metrics.GetBaseStartupEvent( "type", startType, "osName", osutil.GetVersion(), ).ReportEvent() } if !taskengine.IsStartupFetched() { taskengine.Fetch(false, "", taskengine.NormalTaskType) } else { log.GetLogger().Infoln("Startup tasks has been fetched together with kick_off tasks") } }) // Execute operations that are not time sensitive finally, minimize the interval between critical // steps like fetch startup task and the first heart-beat. wrapgo.CallWithDefaultPanicHandler(func() { // Report last os panic if panic record found if isColdstart, err := flagging.IsColdstart(); err != nil || isColdstart { wrapgo.GoWithDefaultPanicHandler(checkospanic.ReportLastOsPanic) } // Initialize non-critical periodic items, failure of initialization will not interrupt agent. if err := statemanager.InitStateManagerTimer(); err != nil { log.GetLogger().Errorln("Failed to initialize statemanager: " + err.Error()) } pluginmanager.InitPluginCheckTimer() if err := checkkdump.CheckKdumpTimer(); err != nil { log.GetLogger().Errorln("Failed to StartKdumpCheckTimer: ", err) } else { log.GetLogger().Infoln("Start StartKdumpCheckTimer") } if err := checkvirt.StartVirtIoVersionReport(); err != nil { log.GetLogger().Errorln("Failed to StartVirtIoVersionReport: " + err.Error()) } else { log.GetLogger().Infoln("Start StartVirtIoVersionReport success") } // Start self kill monitor time.Sleep(time.Duration(3*60) * time.Second) log.GetLogger().Infoln("Start PerfMon ......") perfmon.StartSelfKillMon() }) } func (p *program) Stop(s service.Service) error { log.GetLogger().Println("Stopping ......") if Started { // Report stop event only after successfully starting ctx, cancel := context.WithTimeout(context.Background(), time.Duration(2)*time.Second) defer cancel() go func() { defer cancel() reportAgentStop(ctx) }() <-ctx.Done() } // channel.StopChannelMgr() // //websocket.DisconnectWebsocketServer() // G_Running = false // close(G_StopEvent) // SingleAppLock.TryUnlock() // perfmon.StopSelfKillMon() log.GetLogger().Println("Stopped") return nil } func reportAgentStop(ctx context.Context) { reason := "unknown" shutdown, err := powerutil.IsSystemShutdown(ctx) if err != nil { log.GetLogger().WithError(err).Error("IsSystemShutdown") } else if shutdown { reason = "shutdown" } metrics.GetBaseStoppedEvent( "reason", reason, ).ReportEventSync() } func parseOptions(ctx *cli.Context) Options { options := Options{} options.GetHelp = ctx.Flags().Get(HelpFlagName).IsAssigned() options.GetVersion = ctx.Flags().Get(VersionFlagName).IsAssigned() options.GetGitHash = ctx.Flags().Get(GithashFlagName).IsAssigned() options.Install = ctx.Flags().Get(InstallFlagName).IsAssigned() options.Remove = ctx.Flags().Get(RemoveFlagName).IsAssigned() options.Start = ctx.Flags().Get(StartFlagName).IsAssigned() options.Stop = ctx.Flags().Get(StopFlagName).IsAssigned() options.IsVerbose = ctx.Flags().Get(VerboseFlagName).IsAssigned() options.Register = ctx.Flags().Get(RegisterFlagName).IsAssigned() options.DeRegister = ctx.Flags().Get(DeRegisterFlagName).IsAssigned() options.Region, _ = ctx.Flags().Get(RegionFlagName).GetValue() options.Tags = ctx.Flags().Get(TagFlagName).GetValues() options.ActivationCode, _ = ctx.Flags().Get(ActivationCodeFlagName).GetValue() options.ActivationId, _ = ctx.Flags().Get(ActivationIdFlagName).GetValue() options.NetWorkMode, _ = ctx.Flags().Get(NetworkModeFlagName).GetValue() options.InstanceName, _ = ctx.Flags().Get(InstanceNameFlagName).GetValue() options.LogPath, _ = ctx.Flags().Get(LogPathFlagName).GetValue() options.RunAsCommon = ctx.Flags().Get(RunAsCommonFlagName).IsAssigned() options.RunAsDaemon = ctx.Flags().Get(RunAsDaemonFlagName).IsAssigned() return options } func runRootCommand(ctx *cli.Context, args []string) error { options := parseOptions(ctx) log.InitLog("aliyun_assist_main.log", options.LogPath, false) // Redirect logging messages from kubernetes CRI client via klog to logrus // used by ourselves klog.SetLogger(logrusr.New(log.GetLogger()).WithName("klog")) if options.LogPath != "" { pathutil.SetScriptPath(options.LogPath) } e := PatchGolang() if e != nil { log.GetLogger().Fatal("PatchGolang failed :", e.Error()) } if options.IsVerbose { util.SetVerboseMode(true) } if options.GetHelp { // aliyun-cli/cli library handles "help" flag internally, and here needs // to do nothing. return nil } if options.GetVersion { fmt.Println(version.AssistVersion) return nil } if options.GetGitHash { fmt.Println(version.GitCommitHash) return nil } if options.Register { tags := []hybrid.Tag{} for _, tag := range options.Tags { words := strings.Split(tag, "=") if len(words) != 2 { fmt.Println("Invalid tag: ", tag) cli.Exit(1) } tags = append(tags, hybrid.Tag{ Key: words[0], Value: words[1], }) } if !hybrid.Register(options.Region, options.ActivationCode, options.ActivationId, options.InstanceName, options.NetWorkMode, true, tags) { cli.Exit(1) } return nil } if options.DeRegister { if !hybrid.UnRegister(true) { cli.Exit(1) } return nil } if options.RunAsDaemon { // TODO: Check other options like --install, --remove, --start, --stop should not be passed if err := daemon.Daemonize(); err != nil { fmt.Fprintln(os.Stderr, "Failed to start aliyun-service as daemon:", err) cli.Exit(1) } return nil } svcConfig := install.ServiceConfig() prg := &program{} s, err := service.New(prg, svcConfig) if err != nil { return fmt.Errorf("new service error %w", err) } if options.Stop { if err := s.Stop(); err != nil { return fmt.Errorf("stop assist failed: %w", err) } else { fmt.Println("stop assist ok") return nil } } if options.Remove { if err := s.Uninstall(); err != nil { return fmt.Errorf("uninstall assist failed: %w", err) } else { fmt.Println("uninstall assist ok") return nil } } if options.Install { if err := s.Install(); err != nil { return fmt.Errorf("install assist failed: %w", err) } else { fmt.Println("install assist ok") return nil } } if options.Start { if err := s.Start(); err != nil { return fmt.Errorf("start assist failed: %w", err) } else { fmt.Println("start assist ok") return nil } } err = s.Run() if err != nil { log.GetLogger().Println(err.Error()) return err } return nil } func initConfiguration(logger logrus.FieldLogger) { flagging.InitConfig(logger) flagging.RegisterCallbackAndApply(logger, map[string]flagging.Callback{ flagging.ASSIST_DAEMON_ACTIVE: daemon.OperateAssistDaemon, }) }