helm_chart/HyperPodHelmChart/charts/neuron-device-plugin/templates/k8s-neuron-device-plugin.yaml (82 lines of code) (raw):

# DaemonSet that runs the Neuron device plugin container on every node whose
# instance type appears in the nodeAffinity list below.
# https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: neuron-device-plugin-daemonset
  # Quoted so that a namespace value that looks like a number or boolean
  # still renders as a YAML string.
  namespace: {{ .Values.namespace | quote }}
spec:
  selector:
    matchLabels:
      # Must match the pod template label below.
      name: neuron-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      # Uncomment the annotation below if k8s version is 1.13 or lower
      # annotations:
      #   scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: neuron-device-plugin-ds
    spec:
      # `serviceAccountName` is the current PodSpec field; `serviceAccount` is
      # its deprecated alias.
      # NOTE(review): the ServiceAccount itself is not defined in this file --
      # confirm it is created elsewhere in the chart.
      serviceAccountName: neuron-device-plugin
      tolerations:
        - key: CriticalAddonsOnly
          operator: Exists
        # Allow scheduling onto nodes tainted for Neuron workloads.
        - key: aws.amazon.com/neuron
          operator: Exists
          effect: NoSchedule
        # Keep running on nodes tainted with node-health-status=Unschedulable.
        - key: sagemaker.amazonaws.com/node-health-status
          operator: Equal
          value: Unschedulable
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  # Only schedule onto the instance types listed here; both
                  # plain and `ml.`-prefixed type names are matched.
                  - key: "node.kubernetes.io/instance-type"
                    operator: In
                    values:
                      - inf1.xlarge
                      - inf1.2xlarge
                      - inf1.6xlarge
                      - inf1.24xlarge
                      - inf2.xlarge
                      - inf2.8xlarge
                      - inf2.24xlarge
                      - inf2.48xlarge
                      - trn1.2xlarge
                      - trn1.32xlarge
                      - trn1n.32xlarge
                      - ml.inf2.xlarge
                      - ml.inf2.8xlarge
                      - ml.inf2.24xlarge
                      - ml.inf2.48xlarge
                      - ml.trn1.2xlarge
                      - ml.trn1.32xlarge
                      - ml.trn1n.32xlarge
                      - ml.trn2.48xlarge
      containers:
        # Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin
        - image: public.ecr.aws/neuron/neuron-device-plugin:2.19.16.0
          imagePullPolicy: Always
          name: neuron-device-plugin
          env:
            - name: KUBECONFIG
              value: /etc/kubernetes/kubelet.conf
            # Name of the node this pod is scheduled on (downward API).
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: infa-map
              mountPath: /run
      volumes:
        # Host's kubelet device-plugin directory, mounted at the same path in
        # the container.
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        # Host's /run directory, mounted at /run in the container.
        - name: infa-map
          hostPath:
            path: /run