agent/health/healthcheck.go (178 lines of code) (raw):
// Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"). You may not
// use this file except in compliance with the License. A copy of the
// License is located at
//
// http://aws.amazon.com/apache2.0/
//
// or in the "license" file accompanying this file. This file is distributed
// on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing
// permissions and limitations under the License.
// Package health contains routines that periodically report health information of the agent.
package health
import (
"math/rand"
"runtime/debug"
"time"
"github.com/aws/amazon-ssm-agent/agent/appconfig"
"github.com/aws/amazon-ssm-agent/agent/context"
"github.com/aws/amazon-ssm-agent/agent/log"
"github.com/aws/amazon-ssm-agent/agent/sdkutil"
"github.com/aws/amazon-ssm-agent/agent/ssm"
"github.com/aws/amazon-ssm-agent/agent/ssmconnectionchannel"
"github.com/aws/amazon-ssm-agent/agent/version"
"github.com/aws/amazon-ssm-agent/common/identity"
"github.com/aws/amazon-ssm-agent/common/identity/availableidentities/ec2"
"github.com/aws/amazon-ssm-agent/common/identity/availableidentities/ecs"
"github.com/aws/amazon-ssm-agent/common/identity/availableidentities/onprem"
"github.com/carlescere/scheduler"
)
// IHealthCheck is the interface implemented by HealthCheck: the module
// lifecycle methods plus a query for the current agent state.
type IHealthCheck interface {
// ModuleName returns the name of the module.
ModuleName() string
// ModuleExecute starts the module.
ModuleExecute() (err error)
// ModuleStop stops the module.
ModuleStop() (err error)
// GetAgentState reports the agent state (Active or Passive) together with
// any error met while determining it.
GetAgentState() (a AgentState, err error)
}
// HealthCheck is the health check core module. It reports the agent's health
// to SSM once when the module is executed and then on a recurring schedule.
type HealthCheck struct {
// context supplies the logger, app config and app constants used by the module.
context context.T
// healthCheckStopPolicy tracks errors from calls to the SSM service and
// drives the decision to rebuild the service client.
healthCheckStopPolicy *sdkutil.StopPolicy
// healthJob is the recurring scheduler job; it is assigned by
// scheduleUpdateHealth and signalled to quit by ModuleStop.
healthJob *scheduler.Job
// service is the SSM client used to send instance information updates.
service ssm.Service
}
const (
// name is the module name: it is returned by ModuleName and used to label
// the module's log context and stop policy.
name = "HealthCheck"
// AgentName is the name of the current agent. It is sent to SSM with every
// instance information update.
AgentName = "amazon-ssm-agent"
)
// healthModule holds the single HealthCheck instance created by NewHealthCheck;
// it is nil until NewHealthCheck is called for the first time.
var healthModule *HealthCheck
// newEC2Identity builds the EC2 identity and returns it as an
// identity.IAgentIdentityInner, or a plain nil interface when none is created.
//
// NOTE(review): the explicit nil check presumably exists because
// ec2.NewEC2Identity returns a concrete pointer type; returning a nil pointer
// directly would produce a non-nil interface value. Confirm against the ec2
// package before simplifying. It is declared as a variable, presumably so that
// tests can substitute it.
var newEC2Identity = func(log log.T) identity.IAgentIdentityInner {
	ec2Ref := ec2.NewEC2Identity(log)
	if ec2Ref == nil {
		return nil
	}
	return ec2Ref
}
// newECSIdentity builds the ECS identity and returns it as an
// identity.IAgentIdentityInner, or a plain nil interface when none is created.
//
// NOTE(review): the explicit nil check presumably exists because
// ecs.NewECSIdentity returns a concrete pointer type; returning a nil pointer
// directly would produce a non-nil interface value. Confirm against the ecs
// package before simplifying. It is declared as a variable, presumably so that
// tests can substitute it.
var newECSIdentity = func(log log.T) identity.IAgentIdentityInner {
	ecsRef := ecs.NewECSIdentity(log)
	if ecsRef == nil {
		return nil
	}
	return ecsRef
}
// newOnPremIdentity builds the on-premises identity from the given agent
// configuration and returns it as an identity.IAgentIdentityInner, or a plain
// nil interface when none is created.
//
// NOTE(review): the explicit nil check presumably exists because
// onprem.NewOnPremIdentity returns a concrete pointer type; returning a nil
// pointer directly would produce a non-nil interface value. Confirm against the
// onprem package before simplifying. It is declared as a variable, presumably
// so that tests can substitute it.
var newOnPremIdentity = func(log log.T, config *appconfig.SsmagentConfig) identity.IAgentIdentityInner {
	onPremRef := onprem.NewOnPremIdentity(log, config)
	if onPremRef == nil {
		return nil
	}
	return onPremRef
}
// AgentState enumerates the modes the agent can be in: Active or Passive.
type AgentState int32
const (
// Active suggests that the agent is going to start with full capacity since
// SSM can be reached.
Active AgentState = 1
// Passive suggests that the agent is in backoff and that health will be
// checked based on current capacity.
Passive AgentState = 0
)
// NewHealthCheck returns the health check core module, creating it on the
// first call. Only one health core module exists at a time: later calls log a
// debug message and return the existing instance, ignoring their arguments.
//
// The module gets its own log context labelled "[HealthCheck]" and a stop
// policy created with a limit of 10.
func NewHealthCheck(context context.T, svc ssm.Service) *HealthCheck {
	if healthModule == nil {
		healthModule = &HealthCheck{
			context:               context.With("[" + name + "]"),
			healthCheckStopPolicy: sdkutil.NewStopPolicy(name, 10),
			service:               svc,
		}
		return healthModule
	}

	context.Log().Debug("Health process has already been initialized.")
	return healthModule
}
// scheduleUpdateHealth registers updateHealth with the scheduler so that it
// runs every scheduleInMinutes() minutes. The job returned by the scheduler is
// stored in h.healthJob; a scheduling failure is logged and not returned.
func (h *HealthCheck) scheduleUpdateHealth() {
	job, err := scheduler.Every(h.scheduleInMinutes()).Minutes().Run(h.updateHealth)
	h.healthJob = job
	if err != nil {
		h.context.Log().Errorf("unable to schedule health update. %v", err)
	}
}
// updateHealth reports the agent's health information to SSM.
//
// It determines the identity environment (on-prem first; EC2 and ECS are only
// probed when the agent is not on-prem), looks up the availability zone and
// zone id for plain EC2 instances, and sends an "Active" instance information
// update. A failed update is handed to sdkutil.HandleAwsError together with the
// stop policy; once the policy reports unhealthy, the SSM service client is
// recreated and the error count reset.
//
// A panic raised while updating is recovered and logged with its stack trace.
func (h *HealthCheck) updateHealth() {
	logger := h.context.Log()
	defer func() {
		r := recover()
		if r == nil {
			return
		}
		logger.Errorf("Update health panic: \n%v", r)
		logger.Errorf("Stacktrace:\n%s", debug.Stack())
	}()

	logger.Infof("%s reporting agent health.", name)

	appConfig := h.context.AppConfig()

	var (
		isEC2, isECS bool
		ec2Identity  identity.IAgentIdentityInner
	)
	onpremIdentity := newOnPremIdentity(logger, &appConfig)
	isOnPrem := onpremIdentity != nil && onpremIdentity.IsIdentityEnvironment()
	if !isOnPrem {
		ec2Identity = newEC2Identity(logger)
		ecsIdentity := newECSIdentity(logger)
		isEC2 = ec2Identity != nil && ec2Identity.IsIdentityEnvironment()
		isECS = ecsIdentity != nil && ecsIdentity.IsIdentityEnvironment()
	}

	// The zone values stay empty unless this is a plain EC2 instance. Lookup
	// errors are deliberately ignored; whatever value was returned is sent.
	var zone, zoneID string
	if isEC2 && !isECS && !isOnPrem {
		zone, _ = ec2Identity.AvailabilityZone()
		zoneID, _ = ec2Identity.AvailabilityZoneId()
	}

	connectionChannel := string(ssmconnectionchannel.GetConnectionChannel())
	logger.Debugf("got SSM connection channel value: %v", connectionChannel)

	//TODO when will status become inactive?
	// If both ssm config and command is inactive => agent is inactive.
	_, err := h.service.UpdateInstanceInformation(logger, version.Version, "Active", AgentName, zone, zoneID, connectionChannel)
	if err != nil {
		sdkutil.HandleAwsError(logger, err, h.healthCheckStopPolicy)
	}

	if h.healthCheckStopPolicy.IsHealthy() {
		return
	}
	// The stop policy is no longer healthy: start over with a fresh client.
	h.service = ssm.NewService(h.context)
	h.healthCheckStopPolicy.ResetErrorCount()
}
// scheduleInMinutes returns the interval, in minutes, between health updates.
//
// The configured Ssm.HealthFrequencyMinutes is used when it lies within
// [MinHealthFrequencyMinutes, MaxHealthFrequencyMinutes] from the app
// constants (both bounds inclusive); otherwise the default of 5 minutes is
// returned and a debug message is logged.
func (h *HealthCheck) scheduleInMinutes() int {
	const defaultFrequencyMins = 5

	config := h.context.AppConfig()
	log := h.context.Log()
	// Appconstants contain default run-time constants
	constants := h.context.AppConstants()

	frequencyMins := config.Ssm.HealthFrequencyMinutes
	if frequencyMins < constants.MinHealthFrequencyMinutes || frequencyMins > constants.MaxHealthFrequencyMinutes {
		log.Debug("HealthFrequencyMinutes is outside allowable limits. Limiting to 5 minutes default.")
		frequencyMins = defaultFrequencyMins
	}

	log.Debugf("%v frequency is every %d minutes.", name, frequencyMins)
	return frequencyMins
}
// ICoreModule implementation
// ModuleName returns the name of the health check module ("HealthCheck").
func (h *HealthCheck) ModuleName() string {
return name
}
// ModuleExecute starts the health check module. It triggers one immediate
// health update in the background and then, after a random delay of less than
// one scheduling interval, starts the recurring health update job.
//
// The returned error is always nil. A panic raised in this function itself is
// recovered and logged; the deferred recover does not cover the goroutines
// started here (updateHealth installs its own).
//
// NOTE(review): the delayed goroutine has no cancellation path, so the
// recurring job may still be scheduled if the module is stopped before the
// delay elapses. Confirm whether callers can hit this.
func (h *HealthCheck) ModuleExecute() (err error) {
	defer func() {
		msg := recover()
		if msg == nil {
			return
		}
		h.context.Log().Errorf("health check ModuleExecute run panic: %v", msg)
	}()

	rand.Seed(time.Now().UTC().UnixNano())
	// Pick a delay in [0, interval) seconds before the recurring job starts.
	initialDelay := time.Duration(rand.Intn(h.scheduleInMinutes()*60)) * time.Second

	// First call updateHealth once
	go h.updateHealth()

	// Wait for the random delay and schedule recurrent updateHealth calls
	go func() {
		<-time.After(initialDelay)
		go h.scheduleUpdateHealth()
	}()

	return nil
}
// ModuleStop terminates the recurring health update job, if one has been
// scheduled, by sending true on the job's Quit channel. It always returns nil.
func (h *HealthCheck) ModuleStop() (err error) {
	if h.healthJob == nil {
		// Nothing has been scheduled yet, so there is nothing to stop.
		return nil
	}

	h.context.Log().Info("stopping update instance health job.")
	h.healthJob.Quit <- true
	return nil
}
// ping probes the health service with an empty instance information update.
//
// If the stop policy has recorded an error from an earlier attempt, the SSM
// service client is recreated and the error count reset before the call. A
// failed call adds one to the policy's error count and its error is returned.
func (h *HealthCheck) ping() (err error) {
	policy := h.healthCheckStopPolicy
	if policy.HasError() {
		h.service = ssm.NewService(h.context)
		policy.ResetErrorCount()
	}

	if _, err = h.service.UpdateEmptyInstanceInformation(h.context.Log(), version.Version, AgentName); err != nil {
		policy.AddErrorCount(1)
	}
	return err
}
// GetAgentState returns the state of the agent: Active when a ping of the
// health service succeeds, Passive (together with the ping error) otherwise.
// It is the caller's responsibility to log the error.
func (h *HealthCheck) GetAgentState() (a AgentState, err error) {
	err = h.ping()
	if err == nil {
		return Active, nil
	}
	return Passive, err
}