# sample_workloads/nccltest/a3/gke/templates/nccl_benchmarks.yaml
# yamllint disable
{{- /* Abort rendering early when the node count was not supplied. */}}
{{- $requiredVar := required ".Values.cluster.nNodes is required, add --set cluster.nNodes=2" .Values.cluster.nNodes }}
{{- /* Render-time timestamp; it is part of every resource name in this file. */}}
{{- $timestamp := now | date "2006-01-02-150405" }}
{{- $headlessServiceName := printf "nccl-benchmarks-%s-%s" $.Release.Name $timestamp }}
# Headless Service (clusterIP: None) whose selector matches the `name` label
# carrying the same value as the Service's own name.
apiVersion: v1
kind: Service
metadata:
  name: {{ $headlessServiceName | quote }}
spec:
  clusterIP: None
  selector:
    name: {{ $headlessServiceName | quote }}
---
{{- /*
  Benchmark Pods: renders one Pod per node (cluster.nNodes). Each Pod has an
  init container that runs the NCCL plugin image's install script, a
  tcpd-daemon sidecar, and the nccl-benchmarks workload container.

  Values read here: cluster (nNodes, nSuperblocks, sbPlacement,
  startSuperblock, gcsBucket), ncclPlugin (image, tag, unreservedCores, envs),
  rxdm (image, tag, flags), ncclBenchmarks (image, tag, benchmark settings)
  and telemetry.gpu.
*/}}
{{- $node_count := .Values.cluster.nNodes | int }}
{{- $jobName := printf "nccl-benchmarks-%s-%s" $.Release.Name $timestamp }}
{{- /*
  Nodes-per-superblock is only consumed when superblock placement is enabled,
  so it is only computed in that case. The superblock count is clamped to at
  least 1 so that a missing or zero cluster.nSuperblocks cannot trigger a
  division by zero.
*/}}
{{- $nodesPerSuperblock := 1 }}
{{- if .Values.cluster.sbPlacement }}
{{- $superblock_count := .Values.cluster.nSuperblocks | int | max 1 }}
{{- $nodesPerSuperblock = divf $node_count $superblock_count | ceil | int }}
{{- end }}
{{- range $node_index := until $node_count }}
apiVersion: v1
kind: Pod
metadata:
  name: "{{ $jobName }}-pod{{ $node_index }}"
  {{- if eq $node_index 0 }}
  # Only the first Pod carries the label matched by the headless Service
  # selector, so the Service name (used as MASTER_ADDR below) points at it.
  labels:
    name: "{{ $jobName }}"
  {{- end }}
spec:
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  hostname: nccl-benchmarks-pod{{ $node_index }}
  # NOTE(review): the subdomain omits the release name, so it differs from the
  # headless Service name (nccl-benchmarks-<release>-<timestamp>) - confirm
  # whether that is intentional.
  subdomain: nccl-benchmarks-{{ $timestamp }}
  serviceAccountName: "default"
  restartPolicy: Never
  {{- if $.Values.cluster.sbPlacement }}
  {{- /* Consecutive groups of $nodesPerSuperblock Pods share one superblock index, counted up from cluster.startSuperblock. */}}
  {{- $superblockChunk := div $node_index $nodesPerSuperblock | int }}
  {{- $superblockIndex := add $.Values.cluster.startSuperblock $superblockChunk | int }}
  nodeSelector:
    superblock: "{{ $superblockIndex }}"
  {{- end }}
  # Schedule only onto nodes that have the GKE accelerator label.
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: cloud.google.com/gke-accelerator
                operator: Exists
  tolerations:
    - operator: "Exists"
      key: nvidia.com/gpu
  volumes:
    # Host NVIDIA libraries, mounted into both containers.
    - name: nvidia-install-dir-host
      hostPath:
        path: /home/kubernetes/bin/nvidia/lib64
    # Host directory mounted at /tmp in both containers.
    - name: tcpd-socket
      hostPath:
        path: /run/tcpx
    # Memory-backed volume mounted at /dev/shm in the workload container.
    - name: shared-memory
      emptyDir:
        medium: "Memory"
        sizeLimit: 200Gi
    # Filled by the init container, then mounted at /usr/local/tcpx.
    - name: tcpx-nccl-plugin-volume
      emptyDir: {}
    # Shared by both containers; the daemon polls it for the
    # workload_terminated marker file.
    - name: workload-terminated-volume
      emptyDir: {}
  initContainers:
    - name: tcpx-nccl-plugin-installer
      image: "{{ $.Values.ncclPlugin.image }}:{{ $.Values.ncclPlugin.tag }}"
      imagePullPolicy: Always
      volumeMounts:
        - name: tcpx-nccl-plugin-volume
          mountPath: /var/lib/tcpx
      resources:
        requests:
          cpu: 150m
      # When telemetry.gpu is set, the install command is extended with
      # --nccl-buildtype=nvtx via a shell line continuation.
      command:
        - /bin/bash
        - -c
        - |
          /scripts/container_entry.sh install --install-nccl
          {{- if $.Values.telemetry.gpu }} \
            --nccl-buildtype=nvtx
          {{- end }}
  containers:
    # Sidecar: starts tcpgpudmarxd in the background with the configured
    # flags, waits for the workload_terminated marker file, then kills the
    # daemon and exits after a 30 second grace period.
    # NOTE(review): the marker is presumably written by the nccl-benchmarks
    # container; if it is never written this container keeps polling - confirm.
    - name: tcpd-daemon
      image: "{{ $.Values.rxdm.image }}:{{ $.Values.rxdm.tag }}"
      imagePullPolicy: Always
      command:
        - "bash"
        - "-c"
        - |
          /tcpgpudmarxd/build/app/tcpgpudmarxd {{- range $.Values.rxdm.flags }} {{.}} {{- end }} &
          while [ ! -e "/usr/share/nccl_benchmarks/workload_terminated" ]; do sleep 10; done
          pkill -e "^"tcpgpudmarxd || true
          sleep 30
      securityContext:
        privileged: true
      volumeMounts:
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia/lib64
        - name: tcpd-socket
          mountPath: /tmp
        - name: workload-terminated-volume
          mountPath: /usr/share/nccl_benchmarks
      env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
    # Workload container; all configuration is passed through environment
    # variables.
    - name: nccl-benchmarks
      image: "{{ $.Values.ncclBenchmarks.image }}:{{ $.Values.ncclBenchmarks.tag }}"
      imagePullPolicy: Always
      securityContext:
        privileged: true
        capabilities:
          add:
            - SYS_ADMIN
            - SYS_PTRACE
            - IPC_LOCK
      env:
        - name: JOB_TIMESTAMP
          value: "{{ $timestamp }}"
        - name: JOB_NAME
          value: "{{ $.Release.Name }}"
        # Name of the headless Service, which selects the first Pod only.
        - name: MASTER_ADDR
          value: "{{ $jobName }}"
        - name: NNODES
          value: "{{ $node_count }}"
        - name: NODE_RANK
          value: "{{ $node_index }}"
        - name: GCS_BUCKET
          value: "{{ $.Values.cluster.gcsBucket }}"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/tcpx/lib64:/usr/local/nvidia/lib64"
        - name: BENCHMARKS_CSV
          value: "{{ $.Values.ncclBenchmarks.benchmarks }}"
        - name: MASKS_CSV
          value: "{{ $.Values.ncclBenchmarks.masks }}"
        - name: MSG_SIZE_BEGIN
          value: "{{ $.Values.ncclBenchmarks.msgSizeBegin }}"
        - name: MSG_SIZE_END
          value: "{{ $.Values.ncclBenchmarks.msgSizeEnd }}"
        - name: GPUS_PER_NODE
          value: "{{ $.Values.ncclBenchmarks.gpusPerNode }}"
        - name: WARMUP_ITERS
          value: "{{ $.Values.ncclBenchmarks.warmupIters }}"
        - name: RUN_ITERS
          value: "{{ $.Values.ncclBenchmarks.runIters }}"
        - name: N_RUNS
          value: "{{ $.Values.ncclBenchmarks.nRuns }}"
        - name: UNRESERVED_CORES
          value: "{{ $.Values.ncclPlugin.unreservedCores }}"
        - name: GPU_TELEMETRY
          value: "{{ $.Values.telemetry.gpu }}"
        # Extra variables taken verbatim from the ncclPlugin.envs map.
        {{- range $key, $value := $.Values.ncclPlugin.envs }}
        - name: "{{ $key }}"
          value: "{{ $value }}"
        {{- end }}
        {{- if $.Values.telemetry.gpu }}
        - name: NCCL_PROXY_NVTX_ENABLE
          value: "1"
        {{- end }}
      volumeMounts:
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia/lib64
        - name: tcpd-socket
          mountPath: /tmp
        - name: shared-memory
          mountPath: /dev/shm
        - name: tcpx-nccl-plugin-volume
          mountPath: /usr/local/tcpx
        - name: workload-terminated-volume
          mountPath: /usr/share/nccl_benchmarks
      resources:
        limits:
          # NOTE(review): fixed at 8 while GPUS_PER_NODE above is configurable
          # - confirm the two are meant to be independent.
          nvidia.com/gpu: !!int 8
---
{{- end }}