func()

in benchmarks/benchmark/tools/model-load-benchmark/k8sclient/k8sclient.go [150:225]


// DeployAndMonitorPod deletes any existing copy of pod, creates it, and polls
// its status until it is ready, has terminated, or the retry budget runs out.
//
// The pod is created in its own namespace, or in "default" when none is set
// (in which case the namespace is also written back onto pod). The returned
// duration is measured from the moment GetPodNode returns until the pod is
// observed Running with every reported container ready, or Succeeded. The
// created pod is deleted again before the function returns.
//
// On any failure the duration is -1 and the error describes the step that
// failed: deleting the old pod, creating the pod, waiting for placement,
// fetching status, the pod entering the Failed phase, or exhausting
// failureThreshold polling attempts.
func (k *Client) DeployAndMonitorPod(pod *v1.Pod) (time.Duration, error) {
	// Derive the polling cadence from the readiness probes: there is no point
	// in polling before the longest initial delay has elapsed, or more often
	// than the slowest probe runs. The two maxima are tracked independently so
	// that a probe with a long initial delay but a short period still counts.
	maxPeriodSeconds := int32(5)
	maxInitialDelay := int32(0)
	for i := range pod.Spec.Containers {
		probe := pod.Spec.Containers[i].ReadinessProbe
		if probe == nil {
			continue
		}
		maxPeriodSeconds = max(maxPeriodSeconds, probe.PeriodSeconds)
		maxInitialDelay = max(maxInitialDelay, probe.InitialDelaySeconds)
	}

	namespace := pod.GetNamespace()
	if namespace == "" {
		namespace = "default"
		pod.SetNamespace(namespace)
	}

	// Remove any leftover pod from a previous run before creating a new one.
	if err := k.DeletePod(pod); err != nil {
		return -1, fmt.Errorf("failed to delete existing pod: %w", err)
	}
	created, err := k.client.CoreV1().Pods(namespace).Create(context.TODO(), pod, metav1.CreateOptions{})
	if err != nil {
		return -1, fmt.Errorf("failed to create pod: %w", err)
	}
	// Register cleanup as soon as the pod exists so that it is also removed
	// when a later step fails. Cleanup is best-effort; its error is
	// intentionally dropped because the benchmark result is already decided.
	defer func() { _ = k.DeletePod(created) }()

	// Wait for the pod to be placed on a node.
	if err := k.GetPodNode(created); err != nil {
		return -1, fmt.Errorf("failed to deploy pod: %w", err)
	}
	startTime := time.Now()
	time.Sleep(time.Duration(maxInitialDelay) * time.Second)

	// Monitor the pod status with exponential backoff, starting at the probe
	// period.
	probePeriod := time.Duration(maxPeriodSeconds) * time.Second
	retryInterval := probePeriod
	for attempt := 0; attempt <= failureThreshold; attempt++ {
		// Poll in the namespace the pod was created in.
		current, err := k.client.CoreV1().Pods(namespace).Get(context.TODO(), created.Name, metav1.GetOptions{})
		if err != nil {
			return -1, fmt.Errorf("failed to get pod status: %w", err)
		}

		switch current.Status.Phase {
		case v1.PodRunning:
			allContainersReady := true
			for i := range current.Status.ContainerStatuses {
				if !current.Status.ContainerStatuses[i].Ready {
					allContainersReady = false
					break
				}
			}
			if allContainersReady {
				return time.Since(startTime), nil
			}
		case v1.PodSucceeded:
			return time.Since(startTime), nil
		case v1.PodFailed:
			return -1, fmt.Errorf("pod %s failed: %s", current.Name, current.Status.Reason)
		}

		// Exponential backoff with a maximum interval.
		// NOTE(review): the cap only applies once the doubled interval also
		// exceeds the probe period, yet it resets to maxRetryInterval rather
		// than to the larger of the two — confirm this is intended.
		time.Sleep(retryInterval)
		retryInterval *= 2
		if retryInterval > maxRetryInterval && retryInterval > probePeriod {
			retryInterval = maxRetryInterval
		}
	}
	return -1, fmt.Errorf("pod monitoring timeout, not all containers ready")
}