func checkSchedulingContext()

in pkg/scheduler/health_checker.go [235:315]


// checkSchedulingContext walks every partition returned by
// schedulerContext.GetPartitionMapClone() and builds the list of health check
// results for the scheduling data.
//
// The returned slice always contains seven entries, in this order:
//  1. negative resources on partitions (allocated or total partition resource)
//  2. negative resources on nodes (allocated, available, capacity or occupied)
//  3. total partition resource == sum of the node capacities
//  4. node capacity == allocated + occupied + available
//  5. reservation count / node count per partition (informational, always marked as succeeded)
//  6. allocations reported by checkNodeAllocations
//  7. allocations reported by checkAppAllocations
//
// A partition or node that fails a check is listed once per check, even if
// more than one of its resource values is the cause.
func checkSchedulingContext(schedulerContext *ClusterContext) []dao.HealthCheckInfo {
	// check for negative resources
	var partitionsWithNegResources []string
	var nodesWithNegResources []string
	// total partition resource = sum of node resources
	var totalResourceMismatch []string
	// node total resource = allocated resource + occupied resource + available resource
	var nodeTotalMismatch []string
	// check reservation/node ratio
	var partitionReservationRatio []float32
	// check for orphan allocations
	orphanAllocationsOnNode := make([]*objects.Allocation, 0)
	orphanAllocationsOnApp := make([]*objects.Allocation, 0)

	for _, part := range schedulerContext.GetPartitionMapClone() {
		// report the partition once, whichever of its resources is negative
		if part.GetAllocatedResource().HasNegativeValue() ||
			part.GetTotalPartitionResource().HasNegativeValue() {
			partitionsWithNegResources = append(partitionsWithNegResources, part.Name)
		}

		sumNodeResources := resources.NewResource()
		sumReservation := 0
		for _, node := range part.GetNodes() {
			// read each value once so every check below looks at the same values
			capacity := node.GetCapacity()
			allocated := node.GetAllocatedResource()
			occupied := node.GetOccupiedResource()
			available := node.GetAvailableResource()

			sumNodeResources.AddTo(capacity)
			sumReservation += len(node.GetReservationKeys())

			calculatedTotalNodeRes := resources.Add(allocated, occupied)
			calculatedTotalNodeRes.AddTo(available)
			if !resources.Equals(capacity, calculatedTotalNodeRes) {
				nodeTotalMismatch = append(nodeTotalMismatch, node.NodeID)
			}

			// report the node once, whichever of its resources is negative
			if allocated.HasNegativeValue() || available.HasNegativeValue() ||
				capacity.HasNegativeValue() || occupied.HasNegativeValue() {
				nodesWithNegResources = append(nodesWithNegResources, node.NodeID)
			}
			orphanAllocationsOnNode = append(orphanAllocationsOnNode, checkNodeAllocations(node, part)...)
		}

		// check if there are allocations assigned to an app that are missing from the nodes
		for _, app := range part.GetApplications() {
			orphanAllocationsOnApp = append(orphanAllocationsOnApp, checkAppAllocations(app, part.nodes)...)
		}

		// a partition without nodes would turn the division into 0/0 and put a
		// NaN into the report: use a ratio of 0 for such a partition instead
		var reservationRatio float32
		if nodeCount := part.GetTotalNodeCount(); nodeCount > 0 {
			reservationRatio = float32(sumReservation) / float32(nodeCount)
		}
		partitionReservationRatio = append(partitionReservationRatio, reservationRatio)

		if !resources.EqualsOrEmpty(sumNodeResources, part.GetTotalPartitionResource()) {
			totalResourceMismatch = append(totalResourceMismatch, part.Name)
		}
	}

	var info = make([]dao.HealthCheckInfo, 0)
	info = append(info, CreateCheckInfo(len(partitionsWithNegResources) == 0, "Negative resources",
		"Check for negative resources in the partitions",
		fmt.Sprintf("Partitions with negative resources: %q", partitionsWithNegResources)))
	info = append(info, CreateCheckInfo(len(nodesWithNegResources) == 0, "Negative resources",
		"Check for negative resources in the nodes",
		fmt.Sprintf("Nodes with negative resources: %q", nodesWithNegResources)))
	info = append(info, CreateCheckInfo(len(totalResourceMismatch) == 0, "Consistency of data",
		"Check if total partition resource == sum of the node resources from the partition",
		fmt.Sprintf("Partitions with inconsistent data: %q", totalResourceMismatch)))
	info = append(info, CreateCheckInfo(len(nodeTotalMismatch) == 0, "Consistency of data",
		"Check if node total resource = allocated resource + occupied resource + available resource",
		fmt.Sprintf("Nodes with inconsistent data: %q", nodeTotalMismatch)))
	// mark it as succeeded for a while until we will know what is not considered a normal value anymore
	info = append(info, CreateCheckInfo(true, "Reservation check",
		"Check the reservation nr compared to the number of nodes",
		fmt.Sprintf("Reservation/node nr ratio: %f", partitionReservationRatio)))
	info = append(info, CreateCheckInfo(len(orphanAllocationsOnNode) == 0, "Orphan allocation on node check",
		"Check if there are orphan allocations on the nodes",
		fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode)))
	info = append(info, CreateCheckInfo(len(orphanAllocationsOnApp) == 0, "Orphan allocation on app check",
		"Check if there are orphan allocations on the applications",
		fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp)))
	return info
}