func()

in oracle/controllers/instancecontroller/instance_controller_patching.go [64:292]


// patchingStateMachine advances the instance patching workflow by one step.
//
// The current state is the Reason of instanceReadyCond. Each invocation handles
// exactly one state: it runs that state's action, records the next state by
// upserting the Ready condition on inst.Status, and returns. Multi-step
// progress therefore relies on the returned ctrl.Result requesting a requeue.
// This function only mutates inst.Status in memory; presumably the caller
// persists it — verify against the call site.
//
// Parameters:
//   - req: the reconcile request; its namespace is used for Oracle/STS checks.
//   - instanceReadyCond: the instance Ready condition; its Reason selects the
//     state and its last transition time drives the timeout checks.
//   - dbInstanceCond: only checked for nil here.
//   - inst: the instance being patched; inst.Status is updated in place.
//   - ctx: context passed to all helper calls.
//   - stsParams: supplies the target images (stsParams.Images) and the
//     parameters handed to the StatefulSet patching/recovery helpers.
//   - config: not referenced in this function.
//   - databasePatchingTimeout: upper bound for the StatefulSetPatchingInProgress
//     and DatabasePatchingInProgress states.
//   - log: logger for state transition messages.
//
// Returns the reconcile result, an error (if any), and a bool that is true
// when this state machine handled the request. It is false when either
// condition is nil or the Reason is not one of the states handled below, in
// which case the main reconciliation is expected to proceed.
func (r *InstanceReconciler) patchingStateMachine(req ctrl.Request, instanceReadyCond *v1.Condition, dbInstanceCond *v1.Condition, inst *v1alpha1.Instance, ctx context.Context, stsParams *controllers.StsParams, config *v1alpha1.Config, databasePatchingTimeout time.Duration, log logr.Logger) (ctrl.Result, error, bool) {
	// Conditions not initialized yet
	if instanceReadyCond == nil || dbInstanceCond == nil {
		log.Info("patchingStateMachine: Instance not ready yet, proceed with main reconciliation")
		return ctrl.Result{}, nil, false
	}

	switch instanceReadyCond.Reason {
	case k8s.CreateComplete, k8s.ExportComplete, k8s.RestoreComplete, k8s.ImportComplete, k8s.PatchingRecoveryCompleted:

		// PatchingRecoveryCompleted is also a stable state and we need this check to avoid the infinite loop of retrying Patching with failed images.
		if instanceReadyCond.Reason == k8s.PatchingRecoveryCompleted && reflect.DeepEqual(inst.Spec.Images, inst.Status.LastFailedImages) {
			return ctrl.Result{}, nil, true
		}

		inst.Status.CurrentActiveStateMachine = "PatchingStateMachine"
		if result, err := r.startPatchingBackup(req, ctx, inst, log); err != nil {
			// In case of k8s conflict retry, otherwise switch to failed state
			if !apierrors.IsConflict(err) {
				k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingBackupFailure, "")
			}
			return result, err, true
		}
		log.Info("patchingStateMachine: CreateComplete->PatchingBackupStarted")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingBackupStarted, "Patching Backup Started")
		return ctrl.Result{Requeue: true}, nil, true

	case k8s.PatchingBackupStarted:
		completed, err := r.isPatchingBackupCompleted(ctx, *inst)
		if err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingBackupFailure, "")
			return ctrl.Result{}, err, true
		}
		if !completed {
			// Backup still running; poll again later.
			return ctrl.Result{RequeueAfter: 30 * time.Second}, nil, true
		}
		log.Info("patchingStateMachine: PatchingBackupStarted->DeploymentSetPatchingInProgress")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DeploymentSetPatchingInProgress, "Patching backup completed, continuing patching")
		return ctrl.Result{Requeue: true}, nil, true

	case k8s.DeploymentSetPatchingInProgress:
		elapsed := k8s.ElapsedTimeFromLastTransitionTime(instanceReadyCond, time.Second)
		if elapsed > deploymentPatchingTimeout {
			msg := fmt.Sprintf("agentPatchingStateMachine: Agent patching timed out after %v", deploymentPatchingTimeout)
			log.Info(msg)
			r.Recorder.Eventf(inst, corev1.EventTypeWarning, "InstanceReady", msg)
			log.Info("agentPatchingStateMachine: DeploymentSetPatchingInProgress->DeploymentSetPatchingRollbackInProgress")
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DeploymentSetPatchingRollbackInProgress, msg)
			return ctrl.Result{}, errors.New(msg), true
		}
		// TODO: Reconcile other agents if we add them
		// Reconcile monitoring against the target images.
		// NOTE(review): r.Log is passed here rather than the log parameter — confirm this is intended.
		res, err := r.reconcileMonitoring(ctx, inst, r.Log, stsParams.Images)
		if err != nil {
			log.Info("agentPatchingStateMachine: DeploymentSetPatchingInProgress->DeploymentSetPatchingRollbackInProgress")
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DeploymentSetPatchingRollbackInProgress, "")
			return ctrl.Result{}, err, true
		}
		if res.RequeueAfter > 0 {
			// Stay in this state and honor the delay requested by reconcileMonitoring.
			return res, nil, true
		}

		log.Info("agentPatchingStateMachine: DeploymentSetPatchingInProgress->DeploymentSetPatchingComplete")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DeploymentSetPatchingComplete, "")
		return ctrl.Result{Requeue: true}, nil, true

	case k8s.DeploymentSetPatchingComplete:
		// We know Deployment patching is complete, check status of Oracle
		oracleRunning, err := r.isOracleUpAndRunning(ctx, inst, req.Namespace, log)
		if err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingFailure, "Failed to check Oracle status")
			return ctrl.Result{}, err, true
		}
		if !oracleRunning {
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		// If there are no new images specified with respect to the stateful set skip this state.
		if !isStatefulSetPatchingRequired(inst.Status.ActiveImages, inst.Spec.Images) {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.StatefulSetPatchingComplete, "")
			return ctrl.Result{Requeue: true}, nil, true
		}

		// Start software patching
		if _, err, _ := r.startStatefulSetPatching(req, ctx, *inst, stsParams, log); err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.StatefulSetPatchingFailure, "")
			return ctrl.Result{}, err, true
		}
		log.Info("patchingStateMachine: DeploymentSetPatchingComplete->StatefulSetPatchingInProgress")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.StatefulSetPatchingInProgress, "")
		return ctrl.Result{Requeue: true}, nil, true

	case k8s.DeploymentSetPatchingRollbackInProgress:
		elapsed := k8s.ElapsedTimeFromLastTransitionTime(instanceReadyCond, time.Second)
		if elapsed > deploymentPatchingTimeout {
			msg := fmt.Sprintf("agentPatchingStateMachine: Agent patching timed out after %v", deploymentPatchingTimeout)
			log.Info(msg)
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryFailure, msg)
			return ctrl.Result{}, errors.New(msg), true
		}
		// TODO: Reconcile other agents if we add them
		// Roll back by reconciling monitoring against the currently active images.
		res, err := r.reconcileMonitoring(ctx, inst, r.Log, inst.Status.ActiveImages)
		if err != nil {
			log.Info("agentPatchingStateMachine: DeploymentSetPatchingRollbackInProgress->PatchingRecoveryFailure")
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryFailure, "")
			return ctrl.Result{}, err, true
		}
		if res.RequeueAfter > 0 {
			return res, nil, true
		}

		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryCompleted, "")
		return ctrl.Result{Requeue: true}, nil, true

	case k8s.StatefulSetPatchingInProgress:
		// Track software patching runtime and terminate if its running beyond timeout interval
		elapsed := k8s.ElapsedTimeFromLastTransitionTime(instanceReadyCond, time.Second)
		if elapsed > databasePatchingTimeout {
			msg := fmt.Sprintf("Software patching timed out after %v", databasePatchingTimeout)
			log.Info(msg)
			r.Recorder.Eventf(inst, corev1.EventTypeWarning, "InstanceReady", msg)
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.StatefulSetPatchingFailure, msg)
			return ctrl.Result{}, errors.New(msg), true
		}
		// Monitor patching progress
		if !r.updateProgressCondition(ctx, *inst, req.NamespacedName.Namespace, k8s.StatefulSetPatchingInProgress, log) {
			// A single constant feeds both the log message and the requeue delay
			// so that the two cannot drift apart.
			const stsPollInterval = 10 * time.Second
			log.Info(fmt.Sprintf("waiting for STS creation to complete: requeue after %v", stsPollInterval))
			return ctrl.Result{RequeueAfter: stsPollInterval}, nil, true
		}
		log.Info("patchingStateMachine: StatefulSetPatchingInProgress->StatefulSetPatchingComplete")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.StatefulSetPatchingComplete, "")
		return ctrl.Result{Requeue: true}, nil, true

	case k8s.StatefulSetPatchingComplete:
		// We know STS is up, check status of Oracle
		oracleRunning, err := r.isOracleUpAndRunning(ctx, inst, req.Namespace, log)
		if err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingFailure, "Failed to check Oracle status")
			return ctrl.Result{}, err, true
		}
		if !oracleRunning {
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		// Start patching
		if err := r.startDatabasePatching(req, ctx, *inst, log); err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingFailure, "Failed to start database patching")
			return ctrl.Result{}, err, true
		}
		log.Info("patchingStateMachine: StatefulSetPatchingComplete->DatabasePatchingInProgress")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingInProgress, "Calling ApplyDataPatch()")
		return ctrl.Result{Requeue: true}, nil, true

	case k8s.DatabasePatchingInProgress:
		// Track database patching runtime and terminate if its running beyond timeout interval
		elapsed := k8s.ElapsedTimeFromLastTransitionTime(instanceReadyCond, time.Second)
		if elapsed > databasePatchingTimeout {
			msg := fmt.Sprintf("Database patching timed out after %v", databasePatchingTimeout)
			log.Info(msg)
			r.Recorder.Eventf(inst, corev1.EventTypeWarning, "InstanceReady", msg)
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingFailure, msg)
			return ctrl.Result{}, errors.New(msg), true
		}
		// Monitor patching progress
		done, err := r.isDatabasePatchingDone(ctx, req, *inst, log)
		if err != nil {
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingFailure, "Failed to check datapatch status")
			return ctrl.Result{}, err, true
		}
		if !done {
			log.Info("datapatch still in progress, waiting")
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		log.Info("patchingStateMachine: DatabasePatchingInProgress->DatabasePatchingComplete")
		// NOTE(review): the condition message below is the same as the one set when
		// entering DatabasePatchingInProgress; left unchanged because it is
		// externally visible status — confirm whether a "completed" message is wanted.
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.DatabasePatchingComplete, "Calling ApplyDataPatch()")
		return ctrl.Result{Requeue: true}, nil, true

	case k8s.DatabasePatchingComplete:
		log.Info("patchingStateMachine: DatabasePatchingComplete->CreateComplete")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionTrue, k8s.CreateComplete, "")
		// Update current service image path
		inst.Status.ActiveImages = cloneMap(stsParams.Images)
		inst.Status.CurrentActiveStateMachine = ""
		// NOTE(review): this logs inst.Spec.Images while ActiveImages is set from
		// stsParams.Images — presumably the two are equal here; verify.
		log.Info("patchingStateMachine: patching done", "updating CurrentServiceImage", inst.Spec.Images)
		return ctrl.Result{}, nil, true

	case k8s.StatefulSetPatchingFailure, k8s.DatabasePatchingFailure:
		// Remove old STS/PVC so we can recover.
		done, err := r.deleteOldSTSandPVCs(ctx, *inst, *stsParams, r.Log)
		if err != nil {
			log.Info("patchingStateMachine: XXXPatchingFailure->PatchingRecoveryFailure")
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryFailure, "Failed to restore from snapshot after patching failure")
			return ctrl.Result{}, err, true
		}
		if !done {
			r.Log.Info("STS/PVC removal in progress, waiting")
			return ctrl.Result{Requeue: true}, nil, true
		}

		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryInProgress, "Restoring snapshot due to patching failure")
		log.Info("patchingStateMachine: XXXPatchingFailure->PatchingRecoveryInProgress")
		return ctrl.Result{Requeue: true}, nil, true

	case k8s.PatchingRecoveryInProgress:
		// always retry recoverFromPatchingFailure to keep STS correct
		// in case we flipflop between states.
		if err := r.recoverFromPatchingFailure(ctx, *inst, stsParams); err != nil {
			return ctrl.Result{}, err, true
		}

		if complete := r.isRecoveryFromPatchingFailureComplete(req, ctx, *inst); !complete {
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		// Bound the Oracle status check to 5 seconds. The deferred cancel runs
		// when this function returns, which every path below does.
		shortCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
		defer cancel()
		oracleRunning, err := r.isOracleUpAndRunning(shortCtx, inst, req.Namespace, log)
		if err != nil {
			log.Info("patchingStateMachine: PatchingRecoveryInProgress->PatchingRecoveryFailure")
			k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionFalse, k8s.PatchingRecoveryFailure, "Failed to restore from snapshot after patching failure. Could not retrieve status of Oracle")
			return ctrl.Result{}, err, true
		}
		if !oracleRunning {
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil, true
		}
		// Remember the images that failed so the stable-state case above does
		// not retry patching with the same spec.
		inst.Status.LastFailedImages = cloneMap(inst.Spec.Images)
		log.Info("patchingStateMachine: PatchingRecoveryInProgress->PatchingRecoveryCompleted")
		k8s.InstanceUpsertCondition(&inst.Status, k8s.Ready, v1.ConditionTrue, k8s.PatchingRecoveryCompleted, "Finished restoring from snapshot after patching failure")
		inst.Status.CurrentActiveStateMachine = ""
		return ctrl.Result{}, nil, true

	default:
		log.Info("patchingStateMachine: no action needed, proceed with main reconciliation")
		return ctrl.Result{}, nil, false
	}
}