# gpudirect-tcpx/optmem-max-ds.yaml
# DaemonSet that writes 131072 to /proc/sys/net/core/optmem_max from a
# privileged init container on nodes carrying the nvidia-h100-80gb accelerator
# label, then leaves an unprivileged pause container running.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: optmem-max-ds
  namespace: kube-system
  labels:
    k8s-app: optmem-max-ds
spec:
  selector:
    matchLabels:
      k8s-app: optmem-max-ds
  template:
    metadata:
      labels:
        name: optmem-max-ds
        k8s-app: optmem-max-ds
    spec:
      # Hard scheduling requirement: only nodes whose
      # cloud.google.com/gke-accelerator label is nvidia-h100-80gb.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: In
                    values:
                      - nvidia-h100-80gb
      # NOTE(review): presumably required so the /proc/sys/net write below
      # lands in the node's network namespace rather than a pod-private one;
      # confirm before removing.
      hostNetwork: true
      # A toleration with no key and operator Exists tolerates every taint.
      tolerations:
        - operator: "Exists"
      initContainers:
        # Prints the current optmem_max (set -x echoes each command), then
        # overwrites it with 131072. Needs privileged mode to write to
        # /proc/sys.
        - name: optmem-max
          # NOTE(review): no tag or digest is given, so this resolves to
          # :latest; pin it once the intended version is known.
          image: gke.gcr.io/gke-distroless/bash
          securityContext:
            privileged: true
          command:
            - /bin/bash
            - -c
            - "set -x; cat /proc/sys/net/core/optmem_max; echo 131072 > /proc/sys/net/core/optmem_max"
      containers:
        # Placeholder container with a locked-down security context; the
        # actual work is done by the init container above.
        - name: pause
          image: registry.k8s.io/pause:3.9
          securityContext:
            runAsUser: 2023
            runAsGroup: 2023
            allowPrivilegeEscalation: false
            capabilities:
              # Upper-case "ALL" is the canonical spelling used by the Pod
              # Security Standards.
              drop: ["ALL"]
            seccompProfile:
              type: RuntimeDefault
            readOnlyRootFilesystem: true