in tpu-provisioner/internal/controller/deletion_controller.go [56:135]
func (r *DeletionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	lg := ctrllog.FromContext(ctx)

	lg.V(3).Info("Reconciling Node")

	var node corev1.Node
	if err := r.Get(ctx, req.NamespacedName, &node); err != nil {
		if apierrors.IsNotFound(err) {
			// Don't requeue, Node no longer exists (or does not exist in the cache).
			return ctrl.Result{}, nil
		}
		return ctrl.Result{}, fmt.Errorf("getting node: %w", err)
	}

	// NOTE: Because of the cache filter in main.go, this check should always evaluate to false.
	if node.GetLabels()[cloud.LabelNodepoolManager] != cloud.LabelNodepoolManagerTPUPodinator {
		lg.V(3).Info("Node was not provisioned by this controller, ignoring")
		return ctrl.Result{}, nil
	}

	// Avoid noisy reconciliation while nodes are shutting down.
	for _, c := range node.Status.Conditions {
		if c.Type == corev1.NodeReady &&
			c.Status == corev1.ConditionFalse &&
			c.Reason == "KubeletNotReady" &&
			c.Message == "node is shutting down" {
			lg.V(3).Info("Node is shutting down, ignoring")
			return ctrl.Result{}, nil
		}
	}

	// Ensure the node was not just created, so that Pods have had time to schedule.
	if since := time.Since(node.GetCreationTimestamp().Time); since < r.NodeCriteria.MinLifetime {
		wait := r.NodeCriteria.MinLifetime - since + time.Second
		lg.V(3).Info("Node was just created, ignoring", "waiting", wait)
		return ctrl.Result{RequeueAfter: wait}, nil
	}

	nodePoolLabelKey := r.Provider.NodePoolLabelKey()
	nodePoolName, ok := node.GetLabels()[nodePoolLabelKey]
	if !ok {
		lg.V(3).Info("No node pool label found on node, ignoring", "labelKey", nodePoolLabelKey)
		return ctrl.Result{}, nil
	}

	// Ensure the JobSet whose Pods created this node pool is either gone, completed, or failed
	// before deleting the node pool.
	jobSetName, exists := node.Labels[cloud.LabelJobSetName]
	if !exists {
		// Fall back to the provisioner node pool ID label.
		jobSetName, exists = node.Labels[cloud.LabelProvisionerNodepoolID]
		if !exists {
			lg.V(3).Info("Node missing jobset name label", "node", node.Name)
			return ctrl.Result{}, nil
		}
	}

	jobSetNamespace, exists := node.Labels[cloud.LabelJobSetNamespace]
	if !exists {
		lg.V(3).Info("Node missing jobset namespace label, using default", "node", node.Name)
		jobSetNamespace = "default"
	}

	var js jobset.JobSet
	if err := r.Get(ctx, types.NamespacedName{Name: jobSetName, Namespace: jobSetNamespace}, &js); err != nil {
		// Case 1: If the JobSet no longer exists, delete the node pool.
		if apierrors.IsNotFound(err) {
			return r.deleteNodePool(ctx, &node, fmt.Sprintf("JobSet %s no longer exists", jobSetName))
		}
		return ctrl.Result{}, err
	}

	// Case 2: If the JobSet is in a completed or failed state, delete the node pool.
	if jobSetCompleted(&js) || jobSetFailed(&js) {
		return r.deleteNodePool(ctx, &node, fmt.Sprintf("JobSet %s execution has ended (completed or failed)", jobSetName))
	}

	// No need to check the other nodes for this JobSet; they carry the same JobSet name label,
	// so we can return early. Log the decision not to delete at a high verbosity level to avoid
	// polluting logs while still allowing for debugging.
	lg.V(5).Info("Node pool for JobSet is still in use, not deleting", "nodePoolName", nodePoolName, "jobSetName", jobSetName)
	return ctrl.Result{}, nil
}
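
The jobSetCompleted and jobSetFailed predicates referenced in Case 2 are defined elsewhere in the package and are not part of this excerpt. A minimal sketch of what they could look like, assuming they inspect the JobSet's standard status conditions from the sigs.k8s.io/jobset API (which defines the jobset.JobSetCompleted and jobset.JobSetFailed condition types):

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
)

// Sketch only: the real helpers live elsewhere in this package and may differ.
// Assumes js.Status.Conditions follows the standard metav1.Condition convention.
func jobSetCompleted(js *jobset.JobSet) bool {
	for _, c := range js.Status.Conditions {
		if c.Type == string(jobset.JobSetCompleted) && c.Status == metav1.ConditionTrue {
			return true
		}
	}
	return false
}

func jobSetFailed(js *jobset.JobSet) bool {
	for _, c := range js.Status.Conditions {
		if c.Type == string(jobset.JobSetFailed) && c.Status == metav1.ConditionTrue {
			return true
		}
	}
	return false
}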
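
The NOTE near the top refers to a cache filter configured in main.go, which is also outside this excerpt. A hedged sketch of that kind of label-based cache filter, assuming a recent controller-runtime with cache.Options.ByObject (the actual wiring in main.go may differ):

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/labels"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/cache"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// Sketch only: illustrates the label-based cache filter the NOTE refers to.
// With a filter like this, the manager's cache (and hence r.Get) only ever
// sees Nodes carrying the provisioner's manager label, so the label check in
// Reconcile is a defensive double-check rather than the primary gate.
func newManager() (ctrl.Manager, error) {
	return ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
		Cache: cache.Options{
			ByObject: map[client.Object]cache.ByObject{
				&corev1.Node{}: {
					Label: labels.SelectorFromSet(labels.Set{
						cloud.LabelNodepoolManager: cloud.LabelNodepoolManagerTPUPodinator,
					}),
				},
			},
		},
	})
}

Filtering at the cache level keeps the controller's memory footprint and reconcile volume proportional to the Nodes it manages, rather than to every Node in the cluster.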