deploy/overlays/gke-release/node.yaml (148 lines of code) (raw):

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Node-side DaemonSet of the Lustre CSI driver.
#
# On every selected node it:
#   1. makes sure LoadPin enforcement is disabled (init container,
#      reboots the node once if it has to patch the kernel command line),
#   2. installs the Lustre client kernel modules (init container),
#   3. runs the CSI node plugin together with the kubelet registrar sidecar.
kind: DaemonSet
apiVersion: apps/v1
metadata:
  name: lustre-csi-node
spec:
  selector:
    matchLabels:
      k8s-app: lustre-csi-driver
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 10%
  template:
    metadata:
      annotations:
        # NOTE(review): legacy annotation that mirrors
        # securityContext.seccompProfile below - confirm whether any
        # supported cluster version still needs it before removing.
        seccomp.security.alpha.kubernetes.io/pod: "runtime/default"
      labels:
        k8s-app: lustre-csi-driver
    spec:
      # LNet config is cached on the host once the Lustre client is
      # initialized. Using hostNetwork is required to ensure the CSI driver
      # pod maintains a consistent IP that remains tied to the node.
      # See details in b/395004950.
      hostNetwork: true
      securityContext:
        seccompProfile:
          type: RuntimeDefault
      priorityClassName: csi-lustre-node
      # serviceAccountName is the supported field; `serviceAccount` is only
      # a deprecated alias of it.
      serviceAccountName: lustre-csi-node-sa
      # Only schedule onto Linux/amd64 nodes running Container-Optimized OS;
      # both init containers rely on the cos-dkms image.
      nodeSelector:
        kubernetes.io/os: linux
        kubernetes.io/arch: amd64
        cloud.google.com/gke-os-distribution: cos
      initContainers:
        # Step 1: ensure the kernel was booted with a loadpin option. If it
        # was not, append loadpin.enforce=0 to the kernel command line in
        # grub.cfg on the EFI system partition and reboot the node.
        - name: disable-loadpin
          image: gcr.io/cos-cloud/cos-dkms
          securityContext:
            # Needs to mount a host block device and write to /proc/sys.
            privileged: true
          command: ["/bin/sh", "-c"]
          args:
            - |
              # Stop at the first failing command so the node is never
              # rebooted unless grub.cfg was actually patched; the kubelet
              # retries the init container instead.
              set -e
              # Disable LoadPin if it's not already disabled
              if cat /proc/cmdline | grep "loadpin"; then
                echo "LoadPin has already been disabled. Move to kmod installation."
              else
                echo "Sleep 60s until the node is ready"
                sleep 60
                echo "LoadPin is not disabled. Disabling LoadPin now."
                mkdir -p /mnt/disks
                mount /dev/disk/by-label/EFI-SYSTEM /mnt/disks
                sed -i -e 's|module.sig_enforce=0|module.sig_enforce=0 loadpin.enforce=0|g' /mnt/disks/efi/boot/grub.cfg
                umount /mnt/disks
                # Enable sysrq, then trigger an immediate reboot so the new
                # kernel command line takes effect.
                echo 1 > /proc/sys/kernel/sysrq
                echo b > /proc/sysrq-trigger
              fi
          volumeMounts:
            # Host /dev, so /dev/disk/by-label/EFI-SYSTEM is visible.
            - name: dev
              mountPath: /dev
        # Step 2: install and load the Lustre client kernel modules.
        - name: install-lustre-mods
          image: gcr.io/cos-cloud/cos-dkms
          securityContext:
            privileged: true
          command: ["/bin/sh", "-c"]
          args:
            - |
              # Install the Lustre client drivers.
              #
              # --gcs-bucket: Specifies the GCS bucket containing the driver packages ('cos-default').
              # --module-version: Sets the lustre client driver version.
              # --kernelmodulestree: Sets the path to the kernel modules directory on the host ('/host_modules').
              # --lsb-release-path: Specifies the path to the lsb-release file on the host ('/host_etc/lsb-release').
              # --insert-on-install: Inserts the module into the kernel after installation.
              # --module-arg lnet.accept_port=6988: This is crucial for setting the LNET port.
              #                                     Lustre uses LNET for network communication, and this
              #                                     parameter configures the port LNET will use. This is
              #                                     essential for proper communication between Lustre clients
              #                                     and servers. In this case, we're setting it to 6988.
              # TODO(rishitagolla): Set module version to 2.14 when gke picks up cos-117-18613-164-93.
              /usr/bin/cos-dkms install lustre-client-drivers \
                --gcs-bucket=cos-default \
                --module-version=2.16.0 \
                --kernelmodulestree=/host_modules \
                --module-arg=lnet.accept_port=6988 \
                --lsb-release-path=/host_etc/lsb-release \
                --insert-on-install \
                --logtostderr
          volumeMounts:
            - name: host-etc
              mountPath: /host_etc/lsb-release
            - name: host-modules
              mountPath: /host_modules
      containers:
        # CSI node plugin; serves the CSI socket shared through socket-dir.
        - name: lustre-csi-driver
          securityContext:
            privileged: true
            readOnlyRootFilesystem: true
          image: gke.gcr.io/lustre-csi-driver
          imagePullPolicy: Always
          args:
            - "--v=5"
            - "--endpoint=unix:/csi/csi.sock"
            - "--nodeid=$(KUBE_NODE_NAME)"
            - "--node=true"
          resources:
            limits:
              cpu: 200m
              memory: 200Mi
            requests:
              cpu: 5m
              memory: 10Mi
          env:
            # Node name injected through the downward API; used as the CSI
            # node id in the args above.
            - name: KUBE_NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          volumeMounts:
            # Bidirectional propagation: mounts created by the driver under
            # the kubelet directory must be visible on the host.
            - name: kubelet-dir
              mountPath: /var/lib/kubelet
              mountPropagation: "Bidirectional"
            - name: socket-dir
              mountPath: /csi
        # Sidecar that registers the driver socket with the kubelet.
        - name: csi-driver-registrar
          securityContext:
            readOnlyRootFilesystem: true
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                # Canonical spelling of the wildcard is upper-case.
                - ALL
          image: registry.k8s.io/sig-storage/csi-node-driver-registrar
          imagePullPolicy: IfNotPresent
          args:
            - "--v=3"
            - "--csi-address=/csi/csi.sock"
            - "--kubelet-registration-path=$(DRIVER_REG_SOCK_PATH)"
          resources:
            limits:
              cpu: 50m
              memory: 100Mi
            requests:
              cpu: 10m
              memory: 10Mi
          env:
            # Host-side path of the driver socket (socket-dir hostPath plus
            # csi.sock), as passed to --kubelet-registration-path.
            - name: DRIVER_REG_SOCK_PATH
              value: /var/lib/kubelet/plugins/lustre.csi.storage.gke.io/csi.sock
          volumeMounts:
            - name: socket-dir
              mountPath: /csi
            - name: registration-dir
              mountPath: /registration
      volumes:
        - name: registration-dir
          hostPath:
            path: /var/lib/kubelet/plugins_registry/
            type: Directory
        - name: kubelet-dir
          hostPath:
            path: /var/lib/kubelet
            type: Directory
        - name: socket-dir
          hostPath:
            path: /var/lib/kubelet/plugins/lustre.csi.storage.gke.io/
            type: DirectoryOrCreate
        - name: host-etc
          hostPath:
            path: /etc/lsb-release
        - name: host-modules
          hostPath:
            path: /lib/modules
        - name: dev
          hostPath:
            path: /dev
      # https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
      # See "special case". This will tolerate everything. Node component
      # should be scheduled on all nodes.
      tolerations:
        - operator: Exists