command/chaosmonkey.go (320 lines of code) (raw):

// Copyright 2016 Netflix, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package command import ( "fmt" "log" "math" "os" "runtime/debug" "strings" "time" flag "github.com/spf13/pflag" "github.com/Netflix/chaosmonkey/v2" "github.com/Netflix/chaosmonkey/v2/clock" "github.com/Netflix/chaosmonkey/v2/config" "github.com/Netflix/chaosmonkey/v2/config/param" "github.com/Netflix/chaosmonkey/v2/deploy" "github.com/Netflix/chaosmonkey/v2/deps" "github.com/Netflix/chaosmonkey/v2/mysql" "github.com/Netflix/chaosmonkey/v2/schedstore" "github.com/Netflix/chaosmonkey/v2/schedule" "github.com/Netflix/chaosmonkey/v2/spinnaker" ) // Version is the version number const Version = "2.0.2" func printVersion() { fmt.Printf("%s\n", Version) } var ( // configPaths is where Chaos Monkey will look for a chaosmonkey.toml // configuration file configPaths = [...]string{".", "/apps/chaosmonkey", "/etc", "/etc/chaosmonkey"} ) // Usage prints usage func Usage() { usage := ` Chaos Monkey Usage: chaosmonkey <command> ... command: migrate | schedule | terminate | fetch-schedule | outage | config | email | eligible | intest Install ------- Installs chaosmonkey with all the setup required, e.g setting up the cron, appling database migration etc. migrate ------- Applies database migration to the database defined in the configuration file. schedule [--max-apps=<N>] [--apps=foo,bar,baz] [--no-record-schedule] -------------------------------------------------------------------- Generates a schedule of terminations for the day and installs the terminations as local cron jobs that call "chaosmonkey terminate ..." --apps=foo,bar,baz Optionally specify an explicit list of apps to schedule. This is primarily used for debugging. --max-apps=<N> Optionally specify the maximum number of apps that Chaos Monkey will schedule. This is primarily used for debugging. --no-record-schedule Do not record the schedule with the database. This is primarily used for debugging. terminate <app> <account> [--region=<region>] [--stack=<stack>] [--cluster=<cluster>] [--leashed] ----------------------------------------------------------------------------------------------------------------- Terminates an instance from a given app and account. Optionally specify a region, stack, cluster. The --leashed flag forces chaosmonkey to run in leashed mode. When leashed, Chaos Monkey will check if an instance should be terminated, but will not actually terminate it. fetch-schedule -------------- Queries the database to see if there is an existing schedule of terminations for today. If so, downloads the schedule and sets up cron jobs to implement the schedule. outage ------ Output "true" if there is an ongoing outage, otherwise "false". Used for debugging. config [<app>] ------------ Query Spinnaker for the config for a specific app and dump it to standard out. This is only used for debugging. If no app is specified, dump the Monkey-level configuration options to standard out. Examples: chaosmonkey config chaosguineapig chaosmonkey config eligible <app> <account> [--region=<region>] [--stack=<stack>] [--cluster=<cluster>] ------------------------------------------------------------------------------------- Dump a list of instance-ids that are eligible for termination for a given app, account, and optionally region, stack, and cluster. intest ------ Outputs "true" on standard out if running within a test environment, otherwise outputs "false" account <name> -------------- Look up an cloud account ID by name. Example: chaosmonkey account test provider <name> --------------- Look up the cloud provider by account name. Example: chaosmonkey provider test clusters <app> <account> ------------------------ List the clusters for a given app and account Example: chaosmonkey clusters chaosguineapig test regions <cluster> <account> --------------------------- List the regions for a given cluster and account Example: chaosmonkey regions chaosguineapig test ` fmt.Printf(usage) } func init() { // Prepend the pid to log statements log.SetPrefix(fmt.Sprintf("[%5d] ", os.Getpid())) } // Execute is the main entry point for the chaosmonkey cli. func Execute() { regionPtr := flag.String("region", "", "region of termination group") stackPtr := flag.String("stack", "", "stack of termination group") clusterPtr := flag.String("cluster", "", "cluster of termination group") appsPtr := flag.String("apps", "", "comma-separated list of apps to schedule for termination") noRecordSchedulePtr := flag.Bool("no-record-schedule", false, "do not record schedule") versionPtr := flag.BoolP("version", "v", false, "show version") flag.Usage = Usage // These flags, if specified, override config values maxAppsFlag := "max-apps" leashedFlag := "leashed" flag.Int(maxAppsFlag, math.MaxInt32, "max number of apps to examine for termination") flag.Bool(leashedFlag, false, "force leashed mode") flag.Parse() if len(flag.Args()) == 0 { if *versionPtr { printVersion() os.Exit(0) } flag.Usage() os.Exit(1) } cmd := flag.Arg(0) cfg, err := getConfig() if err != nil { log.Fatalf("FATAL: failed to load config: %v", err) } // Associate config values with flags err = cfg.BindPFlag(param.MaxApps, flag.Lookup(maxAppsFlag)) if err != nil { log.Fatalf("FATAL: failed to bind flag: --%s: %v", maxAppsFlag, err) } err = cfg.BindPFlag(param.Leashed, flag.Lookup(leashedFlag)) if err != nil { log.Fatalf("FATAL: failed to bind flag: --%s: %v", leashedFlag, err) } spin, err := spinnaker.NewFromConfig(cfg) if err != nil { log.Fatalf("FATAL: spinnaker.New failed: %+v", err) } outage, err := deps.GetOutage(cfg) if err != nil { log.Fatalf("FATAL: deps.GetOutage fail: %+v", err) } sql, err := mysql.NewFromConfig(cfg) if err != nil { log.Fatalf("FATAL: could not initialize mysql connection: %+v", err) } cons, err := deps.GetConstrainer(cfg) if err != nil { log.Fatalf("FATAL: deps.GetConstrainer failed: %+v", err) } // Ensure mysql object gets closed defer func() { _ = sql.Close() }() switch cmd { case "install": executable := ChaosmonkeyExecutable{} Install(cfg, executable, sql) case "migrate": Migrate(sql) case "schedule": log.Println("chaosmonkey schedule starting") defer log.Println("chaosmonkey schedule done") var apps []string if *appsPtr != "" { // User explicitly specified list of apps on the command line apps = strings.Split(*appsPtr, ",") } else { // User did not explicitly specify list of apps, get 'em all var err error apps, err = spin.AppNames() if err != nil { log.Fatalf("FATAL: could not retrieve list of app names: %v", err) } } var schedStore schedstore.SchedStore schedStore = sql if *noRecordSchedulePtr { schedStore = nullSchedStore{} } Schedule(spin, schedStore, cfg, spin, cons, apps) case "fetch-schedule": FetchSchedule(sql, cfg) case "terminate": if len(flag.Args()) != 3 { flag.Usage() os.Exit(1) } app := flag.Arg(1) account := flag.Arg(2) trackers, err := deps.GetTrackers(cfg) if err != nil { log.Fatalf("FATAL: could not create trackers: %+v", err) } errCounter, err := deps.GetErrorCounter(cfg) if err != nil { log.Fatalf("FATAL: could not create error counter: %+v", err) } env, err := deps.GetEnv(cfg) if err != nil { log.Fatalf("FATAL: could not determine environment: %+v", err) } defer logOnPanic(errCounter) // Handler in case of panic deps := deps.Deps{ MonkeyCfg: cfg, Checker: sql, ConfGetter: spin, Cl: clock.New(), Dep: spin, T: spin, Trackers: trackers, Ou: outage, ErrCounter: errCounter, Env: env, } Terminate(deps, app, account, *regionPtr, *stackPtr, *clusterPtr) case "outage": Outage(outage) case "config": if len(flag.Args()) != 2 { DumpMonkeyConfig(cfg) return } app := flag.Arg(1) DumpConfig(spin, app) case "eligible": if len(flag.Args()) != 3 { flag.Usage() os.Exit(1) } app := flag.Arg(1) account := flag.Arg(2) Eligible(spin, spin, app, account, *regionPtr, *stackPtr, *clusterPtr) case "intest": env, err := deps.GetEnv(cfg) if err != nil { log.Fatalf("FATAL: could not determine environment: %+v", err) } fmt.Println(env.InTest()) case "account": if len(flag.Args()) != 2 { flag.Usage() os.Exit(1) } account := flag.Arg(1) id, err := spin.AccountID(account) if err != nil { fmt.Printf("ERROR: Could not retrieve id for account: %s. Reason: %v\n", account, err) return } fmt.Println(id) case "provider": if len(flag.Args()) != 2 { flag.Usage() os.Exit(1) } account := flag.Arg(1) provider, err := spin.CloudProvider(account) if err != nil { fmt.Printf("ERROR: Could not retrieve provider for account: %s. Reason: %v\n", account, err) return } fmt.Println(provider) case "clusters": if len(flag.Args()) != 3 { flag.Usage() os.Exit(1) } app := flag.Arg(1) account := flag.Arg(2) clusters, err := spin.GetClusterNames(app, deploy.AccountName(account)) if err != nil { fmt.Printf("ERROR: %v\n", err) os.Exit(1) } for _, cluster := range clusters { fmt.Println(cluster) } case "regions": if len(flag.Args()) != 3 { flag.Usage() os.Exit(1) } cluster := flag.Arg(1) account := flag.Arg(2) DumpRegions(cluster, account, spin) default: flag.Usage() os.Exit(1) } } func init() { // All logs to stdout log.SetOutput(os.Stdout) } // logOnPanic increments an error metric and logs if a panic happens func logOnPanic(errCounter chaosmonkey.ErrorCounter) { if e := recover(); e != nil { log.Printf("FATAL: panic: %s: %s", e, debug.Stack()) err := errCounter.Increment() if err != nil { log.Printf("failed to increment error counter: %s", err) } } } // return configuration info func getConfig() (*config.Monkey, error) { cfg, err := config.Load(configPaths[:]) if err != nil { return nil, err } return cfg, nil } // nullSchedStore is a no-op implementation of api.SchedStore type nullSchedStore struct{} // Retrieve implements api.SchedStore.Retrieve func (n nullSchedStore) Retrieve(date time.Time) (*schedule.Schedule, error) { return nil, fmt.Errorf("nullSchedStore does not support Retrieve function") } // Publish implements api.SchedStore.Publish func (n nullSchedStore) Publish(date time.Time, sched *schedule.Schedule) error { return nil }