# gpudirect-tcpx/tcpx-metrics-server.yaml (59 lines of code) (raw):
---
# DaemonSet that runs one tcpx-metrics-server pod on each node carrying the
# cloud.google.com/gke-accelerator=nvidia-h100-80gb label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: tcpx-metrics-server
  namespace: kube-system
  labels:
    k8s-app: tcpx-metrics-server
spec:
  # The selector must match the labels set on the pod template below.
  selector:
    matchLabels:
      k8s-app: tcpx-metrics-server
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: tcpx-metrics-server
        k8s-app: tcpx-metrics-server
    spec:
      priorityClassName: system-node-critical
      # Hard scheduling requirement: only nodes whose accelerator label is
      # nvidia-h100-80gb are eligible.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: In
                    values:
                      - nvidia-h100-80gb
      # A toleration with operator Exists and no key tolerates every taint.
      tolerations:
        - operator: "Exists"
      # The pod shares the node's network namespace.
      hostNetwork: true
      containers:
        # NOTE(review): ":latest" is a mutable tag, so rollouts are not
        # reproducible; consider pinning a version or digest.
        - image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpx-metrics:latest
          name: tcpx-metrics-server
          # NOTE(review): only a CPU request is declared; there is no memory
          # request and there are no limits. Confirm this is intentional.
          resources:
            requests:
              cpu: 150m
          # NOTE(review): NET_ADMIN is granted on top of hostNetwork;
          # presumably needed to read host network statistics. Confirm it is
          # still required.
          securityContext:
            capabilities:
              add:
                - NET_ADMIN
          env:
            # Pod identity injected through the downward API.
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            # Quoted so the host:port value is always read as a string.
            - name: CLOUD_MONITORING_ENDPOINT
              value: "monitoring.googleapis.com:443"
            # Same value as this container's name field above.
            - name: CONTAINER_NAME
              value: "tcpx-metrics-server"