in src/watchdog/src/pkg/watchdog/metric_generator.go [81:160]
// generatePodMetric flattens the parts of a Pod that the watchdog reports on
// into a podMetric.
//
// Defaults applied when the pod does not carry a value:
//   - hostIP is "unscheduled" when pod.Status.HostIP is empty.
//   - phase is "unknown" when pod.Status.Phase is empty; otherwise it is the
//     lower-cased phase string.
//   - initialized, scheduled, ready and containersReady are "unknown" unless a
//     matching condition is present, in which case they hold the lower-cased
//     condition status.
//
// Any condition type other than the four handled here sets isConditionUnknown
// and is logged as a warning.
//
// gpuUsed is the sum over all containers in pod.Spec.Containers of the larger
// of the limit and request values returned by mg.getGpuNumber.
func (mg *metricGenerator) generatePodMetric(pod *v1.Pod) podMetric {
	hostIP := pod.Status.HostIP
	if hostIP == "" {
		hostIP = "unscheduled"
	}

	phase := strings.ToLower(string(pod.Status.Phase))
	if phase == "" {
		phase = "unknown"
	}

	initialized, scheduled, ready, containersReady := "unknown", "unknown", "unknown", "unknown"
	isConditionUnknown := false
	for _, cond := range pod.Status.Conditions {
		status := strings.ToLower(string(cond.Status))
		switch cond.Type {
		case v1.PodReady:
			ready = status
		case v1.PodInitialized:
			initialized = status
		case v1.PodScheduled:
			scheduled = status
		case v1.ContainersReady:
			containersReady = status
		default:
			isConditionUnknown = true
			klog.Warningf("Unknown pod condition type: %v %v", cond, status)
		}
	}

	// A pod counts as bound once it has been assigned a node name.
	bound := pod.Spec.NodeName != ""

	// Indexing a nil label map is safe in Go and yields "".
	labels := pod.ObjectMeta.Labels
	serviceName := labels["app"]
	jobName := labels["jobName"]

	// Left nil (not an empty slice) when the pod reports no container
	// statuses, matching what callers have always received.
	var containerStatuses []containerMetric
	for _, cStatus := range pod.Status.ContainerStatuses {
		cm := containerMetric{name: cStatus.Name, ready: cStatus.Ready}
		switch {
		case cStatus.State.Running != nil:
			cm.status = "running"
		case cStatus.State.Waiting != nil:
			cm.status = "waiting"
		case cStatus.State.Terminated != nil:
			cm.status = "terminated"
		default:
			cm.status = "unknown"
		}
		containerStatuses = append(containerStatuses, cm)
	}

	// Accumulate across containers. Assigning instead of adding here would
	// report only the last container's GPUs, so a GPU container followed by a
	// sidecar would show up as using none.
	var gpuUsed int
	for _, c := range pod.Spec.Containers {
		limit := mg.getGpuNumber(c.Resources.Limits)
		request := mg.getGpuNumber(c.Resources.Requests)
		gpuUsed += int(math.Max(float64(limit), float64(request)))
	}

	return podMetric{
		name:               pod.ObjectMeta.Name,
		namespace:          pod.ObjectMeta.Namespace,
		nodeName:           pod.Spec.NodeName,
		hostIP:             hostIP,
		phase:              phase,
		initialized:        initialized,
		scheduled:          scheduled,
		ready:              ready,
		containersReady:    containersReady,
		isConditionUnknown: isConditionUnknown,
		bound:              bound,
		serviceName:        serviceName,
		jobName:            jobName,
		containers:         containerStatuses,
		gpuUsed:            gpuUsed,
	}
}