chaosmonkey.go (109 lines of code) (raw):
// Copyright 2016 Netflix, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package chaosmonkey contains our domain models
package chaosmonkey
import (
"fmt"
"time"
)
const (
// App grouping: Chaos Monkey kills one instance per app per day
App Group = iota
// Stack grouping: Chaos Monkey kills one instance per stack per day
Stack
// Cluster grouping: Chaos Monkey kills one instance per cluster per day
Cluster
)
type (
// AppConfig contains app-specific configuration parameters for Chaos Monkey
AppConfig struct {
Enabled bool
RegionsAreIndependent bool
MeanTimeBetweenKillsInWorkDays int
MinTimeBetweenKillsInWorkDays int
Grouping Group
Exceptions []Exception
Whitelist *[]Exception
}
// Group describes what Chaos Monkey considers a group of instances
// Chaos Monkey will randomly kill an instance from each group.
// The group generally maps onto what the service owner considers
// a "cluster", which is different from Spinnaker's notion of a cluster.
Group int
// Exception describes clusters that have been opted out of chaos monkey
// If one of the members is a "*", it matches everything. That is the only
// wildcard value
// For example, this will opt-out all of the cluters in the test account:
// Exception{ Account:"test", Stack:"*", Cluster:"*", Region: "*"}
Exception struct {
Account string
Stack string
Detail string
Region string
}
// Instance contains naming info about an instance
Instance interface {
// AppName is the name of the Netflix app
AppName() string
// AccountName is the name of the account the instance is running in (e.g., prod, test)
AccountName() string
// RegionName is the name of the AWS region (e.g., us-east-1
RegionName() string
// StackName returns the "stack" part of app-stack-detail in cluster names
StackName() string
// ClusterName is the full cluster name: app-stack-detail
ClusterName() string
// ASGName is the name of the ASG associated with the instance
ASGName() string
// ID is the instance ID, e.g. i-dbcba24c
ID() string
// CloudProvider returns the cloud provider (e.g., "aws")
CloudProvider() string
}
// Termination contains information about an instance termination.
Termination struct {
Instance Instance // The instance that will be terminated
Time time.Time // Termination time
Leashed bool // If true, track the termination but do not execute it
}
// Tracker records termination events an a tracking system such as Chronos
Tracker interface {
// Track pushes a termination event to the tracking system
Track(t Termination) error
}
// ErrorCounter counts when errors occur.
ErrorCounter interface {
Increment() error
}
// Decryptor decrypts encrypted text. It is used for decrypting
// sensitive credentials that are stored encrypted
Decryptor interface {
Decrypt(ciphertext string) (string, error)
}
// Env provides information about the environment that Chaos Monkey has been
// deployed to.
Env interface {
// InTest returns true if Chaos Monkey is running in a test environment
InTest() bool
}
// AppConfigGetter retrieves App configuration info
AppConfigGetter interface {
// Get returns the App config info by app name
Get(app string) (*AppConfig, error)
}
// Checker checks to see if a termination is permitted given min time between terminations
//
// if the termination is permitted, returns (true, nil)
// otherwise, returns false with an error
//
// Returns ErrViolatesMinTime if violates min time between terminations
//
// Note that this call may change the state of the server: if the checker returns true, the termination will be recorded.
Checker interface {
// Check checks if a termination is permitted and, if so, records the
// termination time on the server.
// The endHour (hour time when Chaos Monkey stops killing) is in the
// time zone specified by loc.
Check(term Termination, appCfg AppConfig, endHour int, loc *time.Location) error
}
// Terminator provides an interface for killing instances
Terminator interface {
// Kill terminates a running instance
Execute(trm Termination) error
}
// Outage provides an interface for checking if there is currently an outage
// This provides a mechanism to check if there's an ongoing outage, since
// Chaos Monkey doesn't run during outages
Outage interface {
// Outage returns true if there is an ongoing outage
Outage() (bool, error)
}
// ErrViolatesMinTime represents an error when trying to record a termination
// that violates the min time between terminations for that particular app
ErrViolatesMinTime struct {
InstanceID string // the most recent terminated instance id
KilledAt time.Time // the time that the most recent instance was terminated
Loc *time.Location // local time zone location
}
)
// String returns a string representation for a Group
func (g Group) String() string {
switch g {
case App:
return "app"
case Stack:
return "stack"
case Cluster:
return "cluster"
}
panic("Unknown Group value")
}
// NewAppConfig constructs a new app configuration with reasonable defaults
// with specified accounts enabled/disabled
func NewAppConfig(exceptions []Exception) AppConfig {
result := AppConfig{
Enabled: true,
RegionsAreIndependent: true,
MeanTimeBetweenKillsInWorkDays: 5,
Grouping: Cluster,
Exceptions: exceptions,
}
return result
}
// Matches returns true if an exception matches an ASG
func (ex Exception) Matches(account, stack, detail, region string) bool {
return exFieldMatches(ex.Account, account) &&
exFieldMatches(ex.Stack, stack) &&
exFieldMatches(ex.Detail, detail) &&
exFieldMatches(ex.Region, region)
}
// exFieldMatches checks if an exception field matches a given value
// It's true if field is "*" or if the field is the same string as the value
func exFieldMatches(field, value string) bool {
return field == "*" || field == value
}
func (e ErrViolatesMinTime) Error() string {
s := fmt.Sprintf("Would violate min between kills: instance %s was killed at %s", e.InstanceID, e.KilledAt)
// If we know the time zone, report that as well
if e.Loc != nil {
s += fmt.Sprintf(" (%s)", e.KilledAt.In(e.Loc))
}
return s
}