# gpudirect-tcpxo/cos-enable-kdump.yaml
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Deploy this DaemonSet to enable kdump on Container-Optimized OS (COS) nodes
# that have an nvidia-h100-80gb accelerator and carry the
# "gke-kdump-enabled=true" label.
#
# WARNING: Enabling kdump requires a node reboot. To avoid disrupting your
# workloads, it is recommended to create a new node pool with the
# "gke-kdump-enabled=true" label in your cluster, deploy this DaemonSet to
# enable kdump in that node pool, and then migrate your workloads to the new
# node pool.
# DaemonSet that turns on kdump on matching COS nodes. The work is done by a
# privileged init container that chroots into the host root filesystem and
# drives /usr/sbin/kdump_helper; a pause container then keeps the pod alive.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: enable-kdump
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: enable-kdump
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: enable-kdump
    spec:
      # Scheduling constraints: the node must expose an H100 80GB accelerator
      # (affinity) AND carry both nodeSelector labels below.
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.google.com/gke-accelerator
                    operator: In
                    values:
                      - nvidia-h100-80gb
      nodeSelector:
        # Opt-in label; note that the key is the unprefixed "gke-kdump-enabled".
        # The value is quoted because label values are strings, not booleans.
        gke-kdump-enabled: "true"
        cloud.google.com/gke-os-distribution: cos
      # A toleration with operator Exists and no key tolerates every taint.
      tolerations:
        - operator: Exists
      volumes:
        # The host root filesystem, mounted into the init container at /host.
        - name: host
          hostPath:
            path: /
      initContainers:
        - name: enable-kdump
          # Pinned to a release tag rather than the implicit ":latest" so the
          # privileged container runs a known, reproducible image.
          image: ubuntu:22.04
          command:
            - /bin/bash
            - -c
            - |
              # Stops the script with status 0 (so the init container is
              # reported as successful) when the host is not COS, judged by
              # the ID= field of the host's /etc/os-release.
              function verify_base_image {
                local id="$(grep "^ID=" /host/etc/os-release)"
                if [[ "${id#*=}" != "cos" ]]; then
                  echo "This kdump feature switch is designed to run on Container-Optimized OS only"
                  exit 0
                fi
              }

              # Prints the host's current kdump status to the container log.
              function check_kdump_feature {
                chroot /host /usr/sbin/kdump_helper show
              }

              # Enables kdump on the host, then reboots the host when the
              # helper reports "kdump enabled: true" but "kdump ready: false".
              function enable_kdump_feature_and_reboot_if_needed {
                # Surface a failing helper instead of ignoring it silently.
                # The script still continues, so the exit status is unchanged.
                if ! chroot /host /usr/sbin/kdump_helper enable; then
                  echo "WARNING: 'kdump_helper enable' exited with a non-zero status" >&2
                fi

                # Query the status once so both fields come from the same
                # snapshot; sed -n only prints the lines it substituted.
                local status
                status="$(chroot /host /usr/sbin/kdump_helper show)"
                local -r is_enabled="$(sed -rn "s/kdump enabled: (.*)/\1/p" <<< "${status}")"
                local -r is_ready="$(sed -rn "s/kdump ready: (.*)/\1/p" <<< "${status}")"

                # NOTE(review): if a node stays "enabled but not ready" after
                # rebooting, this will reboot it again on every pod start -
                # confirm that kdump_helper cannot get stuck in that state.
                if [[ "${is_enabled}" == "true" && "${is_ready}" == "false" ]]; then
                  echo "kdump is enabled. Rebooting for it to take effect."
                  chroot /host systemctl reboot
                fi
              }

              verify_base_image
              check_kdump_feature
              enable_kdump_feature_and_reboot_if_needed
          resources:
            requests:
              memory: 5Mi
              cpu: 5m
          # Privileged mode is needed to chroot into the host and reboot it.
          securityContext:
            privileged: true
          volumeMounts:
            - name: host
              mountPath: /host
      containers:
        # Minimal long-running container so the pod stays up once the init
        # container has completed.
        - name: pause
          image: gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830