// Excerpt from tpu-provisioner/internal/controller/deletion_controller.go [56:135].

// Reconcile inspects a Node provisioned by this controller and decides
// whether the node pool backing it should be torn down. The node pool is
// deleted once the JobSet whose pods triggered the provisioning is gone,
// completed, or failed; otherwise the Node is left alone.
func (r *DeletionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := ctrllog.FromContext(ctx)

	logger.V(3).Info("Reconciling Node")

	var node corev1.Node
	if err := r.Get(ctx, req.NamespacedName, &node); err != nil {
		if apierrors.IsNotFound(err) {
			// Don't requeue, Node no longer exists (or does not exist in the cache).
			return ctrl.Result{}, nil
		}
		return ctrl.Result{}, fmt.Errorf("getting node: %w", err)
	}

	// NOTE: Because of the cache filter in main.go, this check should always evaluate to false.
	if node.GetLabels()[cloud.LabelNodepoolManager] != cloud.LabelNodepoolManagerTPUPodinator {
		logger.V(3).Info("Node was not provisioned by this controller, ignoring")
		return ctrl.Result{}, nil
	}

	// Avoid noisy reconciliation when nodes are shutting down.
	for _, cond := range node.Status.Conditions {
		shuttingDown := cond.Type == corev1.NodeReady &&
			cond.Status == corev1.ConditionFalse &&
			cond.Reason == "KubeletNotReady" &&
			cond.Message == "node is shutting down"
		if shuttingDown {
			logger.V(3).Info("Node is shutting down, ignoring")
			return ctrl.Result{}, nil
		}
	}

	// Ensure node was not just created to make sure Pods have had time to schedule.
	age := time.Since(node.GetCreationTimestamp().Time)
	if age < r.NodeCriteria.MinLifetime {
		wait := r.NodeCriteria.MinLifetime - age + time.Second
		logger.V(3).Info("Node was just created, ignoring", "waiting", wait)
		return ctrl.Result{RequeueAfter: wait}, nil
	}

	labelKey := r.Provider.NodePoolLabelKey()
	nodePoolName, found := node.GetLabels()[labelKey]
	if !found {
		logger.V(3).Info("No node pool label found on node, ignoring", "labelKey", labelKey)
		return ctrl.Result{}, nil
	}

	// Ensure the JobSet whose pods created this node pool is either gone, completed, or failed before
	// deleting the node pool. The JobSet name label is preferred; older nodes may
	// only carry the provisioner node pool ID label as a fallback.
	jobSetName, hasName := node.Labels[cloud.LabelJobSetName]
	if !hasName {
		fallback, hasFallback := node.Labels[cloud.LabelProvisionerNodepoolID]
		if !hasFallback {
			logger.V(3).Info("Node missing jobset name label", "node", node.Name)
			return ctrl.Result{}, nil
		}
		jobSetName = fallback
	}

	jobSetNamespace, hasNamespace := node.Labels[cloud.LabelJobSetNamespace]
	if !hasNamespace {
		logger.V(3).Info("Node missing jobset namespace label, using default", "node", node.Name)
		jobSetNamespace = "default"
	}

	var js jobset.JobSet
	jobSetKey := types.NamespacedName{Name: jobSetName, Namespace: jobSetNamespace}
	if err := r.Get(ctx, jobSetKey, &js); err != nil {
		// Case 1: If JobSet no longer exists, delete the node pool.
		if apierrors.IsNotFound(err) {
			return r.deleteNodePool(ctx, &node, fmt.Sprintf("JobSet %s no longer exists", jobSetName))
		}
		return ctrl.Result{}, err
	}

	// Case 2: if JobSet is in completed or failed state, delete node pool.
	if jobSetCompleted(&js) || jobSetFailed(&js) {
		return r.deleteNodePool(ctx, &node, fmt.Sprintf("JobSet %s execution has ended (completed or failed)", jobSetName))
	}

	// No need to check all the other nodes, which will have the same jobset name label, we can end
	// the loop early.
	// Log the fact we are not deleting at a high verbosity level to avoid polluting logs but
	// allow for improved debugability.
	logger.V(5).Info("Node pool for JobSet is still in use, not deleting", "nodePoolName", nodePoolName, "jobSetName", jobSetName)
	return ctrl.Result{}, nil
}