func pickPodsToUpdate()

in controllers/util/solr_update_util.go [208:311]


func pickPodsToUpdate(cloud *solr.SolrCloud, outOfDatePods OutOfDatePodSegmentation, state NodeReplicaState, maxPodsToUpdate int, logger logr.Logger) (podsToUpdate []corev1.Pod) {
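	// Consider the out-of-date pods in the order in which they are safest to take down, so the safest candidates are picked first.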
	sortNodePodsBySafety(outOfDatePods.Running, state.NodeContents, cloud)

	updateOptions := cloud.Spec.UpdateStrategy.ManagedUpdateOptions
	var maxShardReplicasUnavailableCache map[string]int
	// In case the user wants all shardReplicas to be unavailable at the same time, populate the cache with the total number of replicas per shard.
	if updateOptions.MaxShardReplicasUnavailable != nil && updateOptions.MaxShardReplicasUnavailable.Type == intstr.Int && updateOptions.MaxShardReplicasUnavailable.IntVal <= int32(0) {
		maxShardReplicasUnavailableCache = state.TotalShardReplicas
	} else {
		maxShardReplicasUnavailableCache = make(map[string]int, len(state.TotalShardReplicas))
	}

	for _, pod := range outOfDatePods.ScheduledForDeletion {
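		// Pods already scheduled for deletion are accounted for first, so that new candidates do not push a shard past its unavailability limit.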
		nodeContent, isInClusterState := state.PodContents(cloud, pod.Name)

		// This pod will be deleted, add its information to future down shards
		if isInClusterState && nodeContent.live {
			for shard, additionalReplicaCount := range nodeContent.activeReplicasPerShard {
				state.ShardReplicasNotActive[shard] += additionalReplicaCount
			}
		}
	}

	for _, pod := range outOfDatePods.Running {
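		// Each remaining out-of-date pod is checked, in safety order, against the per-shard unavailability limit before being selected.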
		isSafeToUpdate := true
		nodeContent, isInClusterState := state.PodContents(cloud, pod.Name)
		var reason string
		// Any pod that is not represented in the cluster state is safe to upgrade
		if !isInClusterState || !nodeContent.InClusterState() {
			isSafeToUpdate = true
			reason = "Pod not in represented in the cluster state"
		} else {
			// The overseer is a special case
			if nodeContent.overseerLeader {
				// The overseerLeader can only be upgraded by itself.
				// We want to update it when it is the last out-of-date pod and all managed pods are "live".
				// But we want to make sure it still follows the same replicasDown rules as the other nodes, so that logic is still applied below.
				// This works even if there are other Solr nodes not managed by this SolrCloud resource, because we only check that this is the last
				// pod managed for this SolrCloud that has not been updated.
				if len(outOfDatePods.Running) == 1 && state.AllManagedPodsLive {
					isSafeToUpdate = true
					reason = "Pod is overseer and all other nodes have been updated."
				} else {
					isSafeToUpdate = false
					reason = "Pod is overseer and must wait for all other pods to be updated and live."
				}
			}
			// Only check the replica safety if the pod is still considered safe to update, otherwise the check is redundant
			// If the node is not live, then consider it safe to be updated.
			if isSafeToUpdate {
				if !nodeContent.live {
					reason = "Pod's Solr Node is not live, therefore it is safe to take down."
				} else {
					for shard, additionalReplicaCount := range nodeContent.totalReplicasPerShard {
						// If all of the replicas for a shard on the node are down, then this is safe to kill.
						// Currently this logic lets replicas in recovery continue recovery rather than killing them.
						if additionalReplicaCount == nodeContent.downReplicasPerShard[shard] {
							continue
						}

						notActiveReplicaCount := state.ShardReplicasNotActive[shard]

						// Resolve the maximum number of unavailable replicas allowed for this shard; a percentage value is calculated against the shard's total replica count (and cached per shard)
						maxShardReplicasDown, _ := ResolveMaxShardReplicasUnavailable(updateOptions.MaxShardReplicasUnavailable, shard, state.TotalShardReplicas, maxShardReplicasUnavailableCache)

						// We have to allow killing of Pods that hold more replicas of a shard than the maximum allowed to be down,
						// therefore only enforce the limit if some replicas of that shard are already not active (e.g. being updated).
						// Only the pod's active replicas are added here, since its non-active replicas are already counted in notActiveReplicaCount.
						if notActiveReplicaCount > 0 && notActiveReplicaCount+nodeContent.activeReplicasPerShard[shard] > maxShardReplicasDown {
							reason = fmt.Sprintf("Shard %s already has %d replicas not active, taking down %d more would put it over the maximum allowed down: %d", shard, notActiveReplicaCount, nodeContent.activeReplicasPerShard[shard], maxShardReplicasDown)
							isSafeToUpdate = false
							break
						}
					}

					if reason == "" {
						reason = "Pod's replicas are safe to take down, adhering to the minimum active replicas per shard."
					}
				}
			}
		}
		if isSafeToUpdate {
			// Only add future replicas that will be taken down, if the node is "live".
			// If the node is not "live", then the replicas on that node will have already been counted as "not active".
			if isInClusterState && nodeContent.live {
				for shard, additionalReplicaCount := range nodeContent.activeReplicasPerShard {
					state.ShardReplicasNotActive[shard] += additionalReplicaCount
				}
			}
			logger.Info("Pod selected to be deleted for update.", "pod", pod.Name, "reason", reason)
			podsToUpdate = append(podsToUpdate, pod)

			// Stop once the maximum number of pods to update has been reached, if a limit is provided.
			if maxPodsToUpdate >= 1 && len(podsToUpdate) >= maxPodsToUpdate {
				logger.Info("Pod update selection complete. Maximum number of pods able to be updated reached.", "maxPodsToUpdate", maxPodsToUpdate)
				break
			}
		} else {
			logger.Info("Pod not able to be killed for update.", "pod", pod.Name, "reason", reason)
		}
	}
	return podsToUpdate
}
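
For context, MaxShardReplicasUnavailable is a Kubernetes intstr.IntOrString, so the per-shard limit resolved above can be either an absolute replica count or a percentage string such as "25%". The sketch below is not the operator's ResolveMaxShardReplicasUnavailable implementation; it is a minimal, self-contained illustration of the resolution semantics described in the comments above. The helper name resolveMaxUnavailableSketch, its signature, and the default of one unavailable replica per shard are assumptions made for this example.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/intstr"
)

// resolveMaxUnavailableSketch is a hypothetical stand-in for the operator's
// ResolveMaxShardReplicasUnavailable. It resolves an IntOrString into a
// concrete per-shard limit and caches the result so the calculation only runs
// once per shard during a single pass over the out-of-date pods.
func resolveMaxUnavailableSketch(maxUnavailable *intstr.IntOrString, shard string, totalShardReplicas map[string]int, cache map[string]int) int {
	if resolved, ok := cache[shard]; ok {
		return resolved
	}
	// Assumed default: allow at most one replica of a shard to be unavailable at a time.
	resolved := 1
	if maxUnavailable != nil {
		// An Int value is used as-is; a percentage string is scaled by the shard's
		// total replica count, rounding down.
		if scaled, err := intstr.GetScaledValueFromIntOrPercent(maxUnavailable, totalShardReplicas[shard], false); err == nil && scaled > resolved {
			resolved = scaled
		}
	}
	cache[shard] = resolved
	return resolved
}

func main() {
	totals := map[string]int{"shard1": 8}
	cache := map[string]int{}

	pct := intstr.FromString("25%")
	fmt.Println(resolveMaxUnavailableSketch(&pct, "shard1", totals, cache)) // 2

	abs := intstr.FromInt(3)
	fmt.Println(resolveMaxUnavailableSketch(&abs, "shard2", totals, cache)) // 3
}

The integer <= 0 case handled at the top of pickPodsToUpdate is what pre-fills the cache with each shard's total replica count, which effectively allows every replica of a shard to be taken down at once.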