in benchmarks/benchmark/tools/model-load-benchmark/k8sclient/k8sclient.go [150:225]
// DeployAndMonitorPod (re)creates pod and polls it until it is ready,
// returning the time that elapsed between the pod being placed on a node and
// the pod becoming ready.
//
// The pod is created in its own namespace, or in "default" when none is set
// (in which case pod's namespace field is updated to "default"). Before
// creation, DeletePod is called for the same pod, and DeletePod is called
// again (best effort) when monitoring finishes.
//
// The pod counts as ready when its phase is Running and every reported
// container status is ready, or when its phase is Succeeded. Polling starts
// after the largest readiness-probe initial delay of any container, begins at
// the largest readiness-probe period (at least 5s), and doubles after each
// poll, subject to maxRetryInterval. After more than failureThreshold polls
// without the pod becoming ready a timeout error is returned.
//
// On any error the returned duration is -1.
func (k *Client) DeployAndMonitorPod(pod *v1.Pod) (time.Duration, error) {
	// Derive the polling parameters from the containers' readiness probes.
	// Period and initial delay are tracked independently: the container with
	// the longest initial delay is not necessarily the one with the longest
	// probe period.
	maxPeriodSeconds := int32(5)
	maxInitialDelay := int32(0)
	for i := range pod.Spec.Containers {
		probe := pod.Spec.Containers[i].ReadinessProbe
		if probe == nil {
			continue
		}
		maxPeriodSeconds = max(maxPeriodSeconds, probe.PeriodSeconds)
		maxInitialDelay = max(maxInitialDelay, probe.InitialDelaySeconds)
	}

	// Resolve the namespace once; every API call below must use the same one.
	namespace := pod.GetNamespace()
	if namespace == "" {
		namespace = "default"
		pod.SetNamespace(namespace)
	}

	if err := k.DeletePod(pod); err != nil {
		return -1, fmt.Errorf("failed to delete existing pod: %w", err)
	}

	ctx := context.TODO()
	pods := k.client.CoreV1().Pods(namespace)

	// Create the pod.
	created, err := pods.Create(ctx, pod, metav1.CreateOptions{})
	if err != nil {
		return -1, fmt.Errorf("failed to create pod: %w", err)
	}

	// Wait for the pod to be placed on a node; the measurement starts only
	// after that.
	if err := k.GetPodNode(created); err != nil {
		return -1, fmt.Errorf("failed to deploy pod: %w", err)
	}
	startTime := time.Now()
	// Best-effort cleanup: a deletion error here is intentionally dropped so
	// it cannot mask the measurement result.
	defer k.DeletePod(created)

	// No container can report ready before its probe's initial delay.
	time.Sleep(time.Duration(maxInitialDelay) * time.Second)

	// Monitor the pod status with exponential backoff.
	probePeriod := time.Duration(maxPeriodSeconds) * time.Second
	retryInterval := probePeriod
	polls := 0
	for {
		current, err := pods.Get(ctx, created.Name, metav1.GetOptions{})
		if err != nil {
			return -1, fmt.Errorf("failed to get pod status: %w", err)
		}

		switch current.Status.Phase {
		case v1.PodRunning:
			// Running only counts once every container reports ready.
			allContainersReady := true
			for i := range current.Status.ContainerStatuses {
				if !current.Status.ContainerStatuses[i].Ready {
					allContainersReady = false
					break
				}
			}
			if allContainersReady {
				return time.Since(startTime), nil
			}
		case v1.PodSucceeded:
			return time.Since(startTime), nil
		case v1.PodFailed:
			return -1, fmt.Errorf("pod %s failed: %s", current.Name, current.Status.Reason)
		}

		// Give up before sleeping again: a sleep that is not followed by
		// another poll would only delay the timeout error.
		polls++
		if polls > failureThreshold {
			break
		}

		// Exponential backoff with a maximum interval.
		time.Sleep(retryInterval)
		retryInterval *= 2
		if retryInterval > maxRetryInterval && retryInterval > probePeriod {
			retryInterval = maxRetryInterval
		}
	}
	return -1, fmt.Errorf("pod monitoring timeout, not all containers ready")
}