helm_chart/HyperPodHelmChart/charts/neuron-device-plugin/templates/k8s-neuron-device-plugin.yaml (82 lines of code) (raw):
# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: neuron-device-plugin-daemonset
namespace: {{ .Values.namespace }}
spec:
selector:
matchLabels:
name: neuron-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
# Uncomment the annotation below if k8s version is 1.13 or lower
# annotations:
# scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: neuron-device-plugin-ds
spec:
serviceAccount: neuron-device-plugin
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: aws.amazon.com/neuron
operator: Exists
effect: NoSchedule
- key: sagemaker.amazonaws.com/node-health-status
operator: Equal
value: Unschedulable
effect: NoSchedule
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: "node.kubernetes.io/instance-type"
operator: In
values:
- inf1.xlarge
- inf1.2xlarge
- inf1.6xlarge
- inf1.24xlarge
- inf2.xlarge
- inf2.8xlarge
- inf2.24xlarge
- inf2.48xlarge
- trn1.2xlarge
- trn1.32xlarge
- trn1n.32xlarge
- ml.inf2.xlarge
- ml.inf2.8xlarge
- ml.inf2.24xlarge
- ml.inf2.48xlarge
- ml.trn1.2xlarge
- ml.trn1.32xlarge
- ml.trn1n.32xlarge
- ml.trn2.48xlarge
containers:
# Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin
- image: public.ecr.aws/neuron/neuron-device-plugin:2.19.16.0
imagePullPolicy: Always
name: neuron-device-plugin
env:
- name: KUBECONFIG
value: /etc/kubernetes/kubelet.conf
- name: NODE_NAME
valueFrom:
fieldRef:
fieldPath: spec.nodeName
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
- name: infa-map
mountPath: /run
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
- name: infa-map
hostPath:
path: /run