pkg/monitor/cluster/cluster.go (212 lines of code) (raw):
package cluster
// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.
import (
"context"
"net/http"
"sync"
"github.com/Azure/go-autorest/autorest/azure"
"github.com/sirupsen/logrus"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
configv1 "github.com/openshift/api/config/v1"
configclient "github.com/openshift/client-go/config/clientset/versioned"
machineclient "github.com/openshift/client-go/machine/clientset/versioned"
operatorclient "github.com/openshift/client-go/operator/clientset/versioned"
mcoclient "github.com/openshift/machine-config-operator/pkg/generated/clientset/versioned"
"github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/env"
"github.com/Azure/ARO-RP/pkg/hive"
"github.com/Azure/ARO-RP/pkg/metrics"
"github.com/Azure/ARO-RP/pkg/monitor/dimension"
"github.com/Azure/ARO-RP/pkg/monitor/emitter"
"github.com/Azure/ARO-RP/pkg/monitor/monitoring"
arov1alpha1 "github.com/Azure/ARO-RP/pkg/operator/apis/aro.openshift.io/v1alpha1"
aroclient "github.com/Azure/ARO-RP/pkg/operator/clientset/versioned"
"github.com/Azure/ARO-RP/pkg/util/steps"
)
// Compile-time assertion that *Monitor satisfies the monitoring.Monitor interface.
var _ monitoring.Monitor = (*Monitor)(nil)
// Monitor holds the clients, metrics emitter and per-cluster state used to
// monitor a single OpenShift cluster. Instances are created by NewMonitor.
type Monitor struct {
	log       *logrus.Entry
	hourlyRun bool

	// cluster under observation and the base dimensions attached to its metrics
	oc   *api.OpenShiftCluster
	dims map[string]string

	// rest config for the monitored cluster and typed clients created from it
	restconfig  *rest.Config
	cli         kubernetes.Interface
	configcli   configclient.Interface
	operatorcli operatorclient.Interface
	maocli      machineclient.Interface
	mcocli      mcoclient.Interface

	m        metrics.Emitter
	arocli   aroclient.Interface
	env      env.Interface
	tenantID string

	// controller-runtime clients; hiveclientset may be nil (see NewMonitor)
	ocpclientset  client.Client
	hiveclientset client.Client

	// access below only via the helper functions in cache.go
	cache struct {
		cos   *configv1.ClusterOperatorList
		cs    *arov1alpha1.ClusterList
		cv    *configv1.ClusterVersion
		ns    *corev1.NodeList
		arodl *appsv1.DeploymentList
	}

	// wg is marked Done when Monitor returns
	wg                 *sync.WaitGroup
	hiveClusterManager hive.ClusterManager
	doc                *api.OpenShiftClusterDocument
}
// NewMonitor builds a Monitor for the cluster described by oc.
//
// The cluster's resource ID is parsed to derive the base metric dimensions,
// then the typed clients and a controller-runtime client are created from
// restConfig. Any failure in those steps is returned to the caller. A failure
// to create the Hive client from hiveRestConfig is only logged; the Monitor is
// still returned, with its Hive client left nil.
func NewMonitor(log *logrus.Entry, restConfig *rest.Config, oc *api.OpenShiftCluster, doc *api.OpenShiftClusterDocument, env env.Interface, tenantID string, m metrics.Emitter, hiveRestConfig *rest.Config, hourlyRun bool, wg *sync.WaitGroup, hiveClusterManager hive.ClusterManager) (*Monitor, error) {
	resource, err := azure.ParseResourceID(oc.ID)
	if err != nil {
		return nil, err
	}

	baseDims := map[string]string{
		dimension.ResourceID:           oc.ID,
		dimension.SubscriptionID:       resource.SubscriptionID,
		dimension.ClusterResourceGroup: resource.ResourceGroup,
		dimension.ResourceName:         resource.ResourceName,
	}

	// Start with everything that is passed straight through, then fill in
	// the clients one by one, bailing out on the first failure.
	mon := &Monitor{
		log:                log,
		hourlyRun:          hourlyRun,
		oc:                 oc,
		dims:               baseDims,
		restconfig:         restConfig,
		env:                env,
		tenantID:           tenantID,
		m:                  m,
		wg:                 wg,
		hiveClusterManager: hiveClusterManager,
		doc:                doc,
	}

	if mon.cli, err = kubernetes.NewForConfig(restConfig); err != nil {
		return nil, err
	}

	if mon.configcli, err = configclient.NewForConfig(restConfig); err != nil {
		return nil, err
	}

	if mon.maocli, err = machineclient.NewForConfig(restConfig); err != nil {
		return nil, err
	}

	if mon.mcocli, err = mcoclient.NewForConfig(restConfig); err != nil {
		return nil, err
	}

	if mon.arocli, err = aroclient.NewForConfig(restConfig); err != nil {
		return nil, err
	}

	if mon.operatorcli, err = operatorclient.NewForConfig(restConfig); err != nil {
		return nil, err
	}

	// lazy discovery will not attempt to reach out to the apiserver immediately
	restMapper, err := apiutil.NewDynamicRESTMapper(restConfig, apiutil.WithLazyDiscovery)
	if err != nil {
		return nil, err
	}

	if mon.ocpclientset, err = client.New(restConfig, client.Options{Mapper: restMapper}); err != nil {
		return nil, err
	}

	// The Hive client is best-effort: log the failure and carry on without it.
	if mon.hiveclientset, err = getHiveClientSet(hiveRestConfig); err != nil {
		log.Error(err)
	}

	return mon, nil
}
// getHiveClientSet creates a controller-runtime client from hiveRestConfig.
// A nil config is not an error: it yields a nil client and a nil error.
func getHiveClientSet(hiveRestConfig *rest.Config) (client.Client, error) {
	if hiveRestConfig == nil {
		return nil, nil
	}

	// lazy discovery will not attempt to reach out to the apiserver immediately
	restMapper, err := apiutil.NewDynamicRESTMapper(hiveRestConfig, apiutil.WithLazyDiscovery)
	if err != nil {
		return nil, err
	}

	opts := client.Options{Mapper: restMapper}
	hiveClient, err := client.New(hiveRestConfig, opts)
	if err != nil {
		// return an explicit nil rather than whatever client.New handed back
		return nil, err
	}

	return hiveClient, nil
}
// Monitor runs one monitoring pass for the cluster and returns every error
// encountered along the way.
//
// On hourly runs it first emits the cluster.provisioning gauge. It then emits
// the API server healthz status code; if that code is not 200 it falls back to
// the API server ping check and skips everything else. Otherwise each collector
// is run in order, and a failing collector does not stop the ones after it.
// Every failure is also reported through emitFailureToGatherMetric. mon.wg is
// marked Done when the method returns.
func (mon *Monitor) Monitor(ctx context.Context) (errs []error) {
	defer mon.wg.Done()

	mon.log.Debug("monitoring")

	if mon.hourlyRun {
		provisioningDims := map[string]string{
			"provisioningState":       mon.oc.Properties.ProvisioningState.String(),
			"failedProvisioningState": mon.oc.Properties.FailedProvisioningState.String(),
		}
		mon.emitGauge("cluster.provisioning", 1, provisioningDims)
	}

	// This API server healthz check must be first: our geneva monitor relies
	// on this metric to always be emitted.
	statusCode, healthzErr := mon.emitAPIServerHealthzCode(ctx)
	if healthzErr != nil {
		errs = append(errs, healthzErr)
		mon.emitFailureToGatherMetric(steps.FriendlyName(mon.emitAPIServerHealthzCode), healthzErr)
	}

	// If the API is not returning 200, fall back to checking ping and
	// short-circuit the rest of the checks.
	if statusCode != http.StatusOK {
		if pingErr := mon.emitAPIServerPingCode(ctx); pingErr != nil {
			errs = append(errs, pingErr)
			mon.emitFailureToGatherMetric(steps.FriendlyName(mon.emitAPIServerPingCode), pingErr)
		}
		return errs
	}

	collectors := []func(context.Context) error{
		mon.emitAroOperatorHeartbeat,
		mon.emitAroOperatorConditions,
		mon.emitNSGReconciliation,
		mon.emitClusterOperatorConditions,
		mon.emitClusterOperatorVersions,
		mon.emitClusterVersionConditions,
		mon.emitClusterVersions,
		mon.emitDaemonsetStatuses,
		mon.emitDeploymentStatuses,
		mon.emitMachineConfigPoolConditions,
		mon.emitMachineConfigPoolUnmanagedNodeCounts,
		mon.emitNodeConditions,
		mon.emitPodConditions,
		mon.emitDebugPodsCount,
		mon.detectQuotaFailure,
		mon.emitReplicasetStatuses,
		mon.emitStatefulsetStatuses,
		mon.emitJobConditions,
		mon.emitSummary,
		mon.emitHiveRegistrationStatus,
		mon.emitClusterSync,
		mon.emitOperatorFlagsAndSupportBanner,
		mon.emitMaintenanceState,
		mon.emitMDSDCertificateExpiry,
		mon.emitIngressAndAPIServerCertificateExpiry,
		mon.emitEtcdCertificateExpiry,
		mon.emitPrometheusAlerts, // kept late in the list because it's the slowest/least reliable
		mon.emitCWPStatus,
	}

	for _, collect := range collectors {
		if err := collect(ctx); err != nil {
			// record the failure and keep going with the remaining collectors
			errs = append(errs, err)
			mon.emitFailureToGatherMetric(steps.FriendlyName(collect), err)
		}
	}

	return errs
}
// emitFailureToGatherMetric logs err prefixed with friendlyFuncName and emits
// a "monitor.clustererrors" gauge of 1 whose "monitor" dimension is that name.
func (mon *Monitor) emitFailureToGatherMetric(friendlyFuncName string, err error) {
	mon.log.Printf("%s: %s", friendlyFuncName, err)

	failureDims := map[string]string{"monitor": friendlyFuncName}
	mon.emitGauge("monitor.clustererrors", 1, failureDims)
}
// emitGauge forwards the gauge named m with the given value to
// emitter.EmitGauge using the monitor's metrics emitter. It passes the
// monitor's base dimensions (mon.dims) followed by the call-specific dims;
// how the two maps are combined is decided by emitter.EmitGauge.
func (mon *Monitor) emitGauge(m string, value int64, dims map[string]string) {
emitter.EmitGauge(mon.m, m, value, mon.dims, dims)
}