in oracle/controllers/instancecontroller/instance_controller_patching.go [64:292]
// patchingStateMachine drives the multi-step patching workflow for an
// instance by dispatching on the instance's Ready-condition reason. Each
// case performs (or checks progress of) one step, records the next state via
// k8s.InstanceUpsertCondition, and requeues.
//
// Happy path:
//
//	CreateComplete -> PatchingBackupStarted -> DeploymentSetPatchingInProgress
//	-> DeploymentSetPatchingComplete -> StatefulSetPatchingInProgress
//	-> StatefulSetPatchingComplete -> DatabasePatchingInProgress
//	-> DatabasePatchingComplete -> CreateComplete
//
// StatefulSet/database patching failures route through
// PatchingRecoveryInProgress (restore from the pre-patch backup) to
// PatchingRecoveryCompleted; deployment patching failures roll back through
// DeploymentSetPatchingRollbackInProgress.
//
// Returns (result, err, handled). handled=true means the state machine
// consumed this reconcile pass and the main reconciliation must stop here;
// handled=false means patching is not active and normal reconciliation
// should proceed.
func (r *InstanceReconciler) patchingStateMachine(req ctrl.Request, instanceReadyCond *v1.Condition, dbInstanceCond *v1.Condition, inst *v1alpha1.Instance, ctx context.Context, stsParams *controllers.StsParams, config *v1alpha1.Config, databasePatchingTimeout time.Duration, log logr.Logger) (ctrl.Result, error, bool) {
	// Conditions not initialized yet: instance creation has not finished,
	// so there is nothing to patch.
	if instanceReadyCond == nil || dbInstanceCond == nil {
		log.Info("patchingStateMachine: Instance not ready yet, proceed with main reconciliation")
		return ctrl.Result{}, nil, false
	}
	switch instanceReadyCond.Reason {
	case k8s.CreateComplete, k8s.ExportComplete, k8s.RestoreComplete, k8s.ImportComplete, k8s.PatchingRecoveryCompleted:
		// PatchingRecoveryCompleted is also a stable state and we need this
		// check to avoid the infinite loop of retrying Patching with failed
		// images.
		if instanceReadyCond.Reason == k8s.PatchingRecoveryCompleted && reflect.DeepEqual(inst.Spec.Images, inst.Status.LastFailedImages) {
			return ctrl.Result{}, nil, true
		}
		inst.Status.CurrentActiveStateMachine = "PatchingStateMachine"
		// Take a snapshot backup first so a failed patch can be rolled back.
		if result, err := r.startPatchingBackup(req, ctx, inst, log); err != nil {
			// In case of k8s conflict retry, otherwise switch to failed state
			if !apierrors.IsConflict(err) {
				k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingBackupFailure, "")
			}
			return result, err, true
		}
		log.Info("patchingStateMachine: CreateComplete->PatchingBackupStarted")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingBackupStarted, "Patching Backup Started")
		return ctrl.Result{Requeue: true}, nil, true
	case k8s.PatchingBackupStarted:
		// Poll until the pre-patch backup finishes before touching anything.
		completed, err := r.isPatchingBackupCompleted(ctx, *inst)
		if err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingBackupFailure, "")
			return ctrl.Result{}, err, true
		} else if !completed {
			return ctrl.Result{RequeueAfter: 30 * time.Second}, nil, true
		}
		log.Info("patchingStateMachine: PatchingBackupStarted->DeploymentSetPatchingInProgress")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DeploymentSetPatchingInProgress, "Patching backup completed, continuing patching")
		return ctrl.Result{Requeue: true}, nil, true
	case k8s.DeploymentSetPatchingInProgress:
		// Abort (and trigger rollback) if deployment patching exceeds its
		// timeout, measured from the last condition transition.
		elapsed := k8s.ElapsedTimeFromLastTransitionTime(instanceReadyCond, time.Second)
		if elapsed > deploymentPatchingTimeout {
			msg := fmt.Sprintf("patchingStateMachine: Agent patching timed out after %v", deploymentPatchingTimeout)
			log.Info(msg)
			r.Recorder.Eventf(inst, corev1.EventTypeWarning, "InstanceReady", msg)
			log.Info("patchingStateMachine: DeploymentSetPatchingInProgress->DeploymentSetPatchingRollbackInProgress")
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DeploymentSetPatchingRollbackInProgress, msg)
			return ctrl.Result{}, errors.New(msg), true
		}
		// TODO: Reconcile other agents if we add them
		res, err := r.reconcileMonitoring(ctx, inst, log, stsParams.Images)
		if err != nil {
			log.Info("patchingStateMachine: DeploymentSetPatchingInProgress->DeploymentSetPatchingRollbackInProgress")
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DeploymentSetPatchingRollbackInProgress, "")
			return ctrl.Result{}, err, true
		}
		if res.RequeueAfter > 0 {
			// Monitoring reconcile still in progress; try again later.
			return res, nil, true
		}
		log.Info("patchingStateMachine: DeploymentSetPatchingInProgress->DeploymentSetPatchingComplete")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DeploymentSetPatchingComplete, "")
		return ctrl.Result{Requeue: true}, nil, true
	case k8s.DeploymentSetPatchingComplete:
		// We know Deployment patching is complete, check status of Oracle
		oracleRunning, err := r.isOracleUpAndRunning(ctx, inst, req.Namespace, log)
		if err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingFailure, "Failed to check Oracle status")
			return ctrl.Result{}, err, true
		}
		if !oracleRunning {
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		// If there are no new images specified with respect to the stateful
		// set, skip the StatefulSet patching state entirely.
		if !isStatefulSetPatchingRequired(inst.Status.ActiveImages, inst.Spec.Images) {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.StatefulSetPatchingComplete, "")
			return ctrl.Result{Requeue: true}, nil, true
		}
		// Start software patching
		if _, err, _ := r.startStatefulSetPatching(req, ctx, *inst, stsParams, log); err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.StatefulSetPatchingFailure, "")
			return ctrl.Result{}, err, true
		}
		log.Info("patchingStateMachine: DeploymentSetPatchingComplete->StatefulSetPatchingInProgress")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.StatefulSetPatchingInProgress, "")
		return ctrl.Result{Requeue: true}, nil, true
	case k8s.DeploymentSetPatchingRollbackInProgress:
		// Rollback re-reconciles agents with the previously active images.
		// It shares the deployment patching timeout; on expiry, give up and
		// declare recovery failed.
		elapsed := k8s.ElapsedTimeFromLastTransitionTime(instanceReadyCond, time.Second)
		if elapsed > deploymentPatchingTimeout {
			msg := fmt.Sprintf("patchingStateMachine: Agent patching timed out after %v", deploymentPatchingTimeout)
			log.Info(msg)
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryFailure, msg)
			return ctrl.Result{}, errors.New(msg), true
		}
		// TODO: Reconcile other agents if we add them
		res, err := r.reconcileMonitoring(ctx, inst, log, inst.Status.ActiveImages)
		if err != nil {
			log.Info("patchingStateMachine: DeploymentSetPatchingRollbackInProgress->PatchingRecoveryFailure")
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryFailure, "")
			return ctrl.Result{}, err, true
		}
		if res.RequeueAfter > 0 {
			return res, nil, true
		}
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryCompleted, "")
		return ctrl.Result{Requeue: true}, nil, true
	case k8s.StatefulSetPatchingInProgress:
		// Track software patching runtime and terminate if its running beyond
		// timeout interval.
		elapsed := k8s.ElapsedTimeFromLastTransitionTime(instanceReadyCond, time.Second)
		if elapsed > databasePatchingTimeout {
			msg := fmt.Sprintf("Software patching timed out after %v", databasePatchingTimeout)
			log.Info(msg)
			r.Recorder.Eventf(inst, corev1.EventTypeWarning, "InstanceReady", msg)
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.StatefulSetPatchingFailure, msg)
			return ctrl.Result{}, errors.New(msg), true
		}
		// Monitor patching progress
		if !r.updateProgressCondition(ctx, *inst, req.NamespacedName.Namespace, k8s.StatefulSetPatchingInProgress, log) {
			log.Info("waiting for STS creation to complete: requeue after 10 seconds")
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		log.Info("patchingStateMachine: StatefulSetPatchingInProgress->StatefulSetPatchingComplete")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.StatefulSetPatchingComplete, "")
		return ctrl.Result{Requeue: true}, nil, true
	case k8s.StatefulSetPatchingComplete:
		// We know STS is up, check status of Oracle
		oracleRunning, err := r.isOracleUpAndRunning(ctx, inst, req.Namespace, log)
		if err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingFailure, "Failed to check Oracle status")
			return ctrl.Result{}, err, true
		}
		if !oracleRunning {
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		// Start patching
		if err := r.startDatabasePatching(req, ctx, *inst, log); err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingFailure, "Failed to start database patching")
			return ctrl.Result{}, err, true
		}
		log.Info("patchingStateMachine: StatefulSetPatchingComplete->DatabasePatchingInProgress")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingInProgress, "Calling ApplyDataPatch()")
		return ctrl.Result{Requeue: true}, nil, true
	case k8s.DatabasePatchingInProgress:
		// Track database patching runtime and terminate if its running beyond
		// timeout interval.
		elapsed := k8s.ElapsedTimeFromLastTransitionTime(instanceReadyCond, time.Second)
		if elapsed > databasePatchingTimeout {
			msg := fmt.Sprintf("Database patching timed out after %v", databasePatchingTimeout)
			log.Info(msg)
			r.Recorder.Eventf(inst, corev1.EventTypeWarning, "InstanceReady", msg)
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingFailure, msg)
			return ctrl.Result{}, errors.New(msg), true
		}
		// Monitor patching progress
		done, err := r.isDatabasePatchingDone(ctx, req, *inst, log)
		if err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingFailure, "Failed to check datapatch status")
			return ctrl.Result{}, err, true
		}
		if !done {
			log.Info("datapatch still in progress, waiting")
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		log.Info("patchingStateMachine: DatabasePatchingInProgress->DatabasePatchingComplete")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingComplete, "Datapatch completed")
		return ctrl.Result{Requeue: true}, nil, true
	case k8s.DatabasePatchingComplete:
		// Patching finished end to end: return to the stable CreateComplete
		// state and record the images that are now active.
		log.Info("patchingStateMachine: DatabasePatchingComplete->CreateComplete")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionTrue, k8s.CreateComplete, "")
		// Update current service image path
		inst.Status.ActiveImages = cloneMap(stsParams.Images)
		inst.Status.CurrentActiveStateMachine = ""
		log.Info("patchingStateMachine: patching done", "updating CurrentServiceImage", inst.Spec.Images)
		return ctrl.Result{}, nil, true
	case k8s.StatefulSetPatchingFailure, k8s.DatabasePatchingFailure:
		// Remove old STS/PVC so we can recover.
		if done, err := r.deleteOldSTSandPVCs(ctx, *inst, *stsParams, log); err != nil {
			log.Info("patchingStateMachine: PatchingRecoveryInProgress->PatchingRecoveryFailure")
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryFailure, "Failed to restore from snapshot after patching failure")
			return ctrl.Result{}, err, true
		} else if !done {
			log.Info("STS/PVC removal in progress, waiting")
			return ctrl.Result{Requeue: true}, nil, true
		}
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryInProgress, "Restoring snapshot due to patching failure")
		log.Info("patchingStateMachine: XXXPatchingFailure->PatchingRecoveryInProgress")
		return ctrl.Result{Requeue: true}, nil, true
	case k8s.PatchingRecoveryInProgress:
		// always retry recoverFromPatchingFailure to keep STS correct
		// in case we flipflop between states.
		if err := r.recoverFromPatchingFailure(ctx, *inst, stsParams); err != nil {
			return ctrl.Result{}, err, true
		}
		if complete := r.isRecoveryFromPatchingFailureComplete(req, ctx, *inst); !complete {
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		// Bound the Oracle status probe so a hung check cannot stall the
		// reconcile loop.
		shortCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		defer cancel()
		oracleRunning, err := r.isOracleUpAndRunning(shortCtx, inst, req.Namespace, log)
		if err != nil {
			log.Info("patchingStateMachine: PatchingRecoveryInProgress->PatchingRecoveryFailure")
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryFailure, "Failed to restore from snapshot after patching failure. Could not retrieve status of Oracle")
			return ctrl.Result{}, err, true
		}
		if !oracleRunning {
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		// Remember the images that failed so the stable-state check above
		// does not re-attempt patching with the same set.
		inst.Status.LastFailedImages = cloneMap(inst.Spec.Images)
		log.Info("patchingStateMachine: PatchingRecoveryInProgress->PatchingRecoveryCompleted")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionTrue, k8s.PatchingRecoveryCompleted, "Finished restoring from snapshot after patching failure")
		inst.Status.CurrentActiveStateMachine = ""
		return ctrl.Result{}, nil, true
	default:
		log.Info("patchingStateMachine: no action needed, proceed with main reconciliation")
		return ctrl.Result{}, nil, false
	}
}