sample_workloads/lit-gpt-demo/helm/templates/litgpt.yaml
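{{- /*
Renders one headless Service plus one Pod per node for the lit-gpt demo.
The `required` guards below abort the render with a clear message if a
value is missing. Illustrative dry-run only; the release name, node pool,
interface, and image below are placeholders, not chart defaults:

  helm template litgpt ./sample_workloads/lit-gpt-demo/helm \
    --set cluster.nNodes=2 --set cluster.nodePool=a3-gpu-pool \
    --set network.ncclIfnames=eth0 --set workload.image=IMAGE_URI ...
*/ -}}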
{{- $requiredVar := .Values.cluster.nNodes | required ".Values.cluster.nNodes is required" -}}
{{- $requiredVar := .Values.cluster.nodePool | required ".Values.cluster.nodePool is required" -}}
{{- $requiredVar := .Values.network.ncclIfnames | required ".Values.network.ncclIfnames is required" -}}
{{- $requiredVar := .Values.logging.jobTimestamp | required ".Values.logging.jobTimestamp is required" -}}
{{- $requiredVar := .Values.logging.experimentDir | required ".Values.logging.experimentDir is required" -}}
{{- $requiredVar := .Values.workload.gcsDataBucket | required ".Values.workload.gcsDataBucket is required" -}}
{{- $requiredVar := .Values.workload.dataDir | required ".Values.workload.dataDir is required" -}}
{{- $requiredVar := .Values.workload.image | required ".Values.workload.image is required" -}}
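# Headless Service giving the rank-0 ("pytorch-leader") pod a stable DNS name;
# every pod's MASTER_ADDR (set below) resolves through it, with port 6002
# exposed for the leader.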
apiVersion: v1
kind: Service
metadata:
  name: "pytorch-leader-{{$.Release.Name}}"
spec:
  selector:
    name: "pytorch-leader-{{$.Release.Name}}"
  clusterIP: None
  ports:
  - name: pytorch-leader
    port: 6002
---
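# Render one Pod per node; the loop index becomes the pod's hostname suffix
# and its NODE_RANK.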
{{$node_count := .Values.cluster.nNodes | int}}
# This needs to be updated to allow uneven distribution of nodes to SBs
{{- $root := . -}}
{{range $node_index, $element := until $node_count}}
apiVersion: v1
kind: Pod
metadata:
  name: litgpt-{{$.Release.Name}}-pod{{$node_index}}
{{if eq $node_index 0}}
  labels:
    name: pytorch-leader-{{$.Release.Name}}
{{end}}
spec:
  hostNetwork: true
  dnsPolicy: ClusterFirstWithHostNet
  hostname: litgpt-pod{{$node_index}}
  subdomain: litgpt-{{$.Release.Name}}
  serviceAccountName: "default"
  restartPolicy: Never
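  # Pin pods to GPU nodes in the requested node pool and tolerate the GPU and
  # impending-node-termination taints.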
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: cloud.google.com/gke-accelerator
            operator: Exists
          - key: cloud.google.com/gke-nodepool
            operator: In
            values: [{{$.Values.cluster.nodePool}}]
  tolerations:
  - operator: "Exists"
    key: nvidia.com/gpu
  - operator: "Exists"
    key: cloud.google.com/impending-node-termination
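  # Host and scratch volumes: NVIDIA libraries and the TCPX socket directory
  # come from the host, /dev/shm is backed by an in-memory emptyDir, and
  # workload-terminated-volume carries the shutdown sentinel shared with the
  # RxDM sidecar.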
  volumes:
  - name: nvidia-install-dir-host
    hostPath:
      path: /home/kubernetes/bin/nvidia/lib64
  - name: tcpd-socket
    hostPath:
      path: /run/tcpx
  - name: shared-memory
    emptyDir:
      medium: "Memory"
      sizeLimit: 200Gi
  - name: workload-terminated-volume
    emptyDir: {}
  - name: tcpx-nccl-plugin-volume
    emptyDir: {}
  - name: data-volume
    hostPath:
      path: /home/data
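  # When TCPX is enabled, an init container installs the NCCL plugin into the
  # shared tcpx-nccl-plugin-volume, which the training container mounts at
  # /usr/local/tcpx.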
{{if eq $root.Values.network.useTcpx "yes"}}
  initContainers:
  - name: tcpx-nccl-plugin-installer
    image: {{$root.Values.network.ncclPlugin}}
    imagePullPolicy: Always
    volumeMounts:
    - name: tcpx-nccl-plugin-volume
      mountPath: /var/lib/tcpx
    resources:
      requests:
        cpu: 150m
    command:
    - /bin/sh
    - -c
    - |
      /scripts/container_entry.sh install --install-nccl
{{end}}
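  # When TCPX is enabled, the tcpd-daemon sidecar runs the TCPX receive-datapath
  # manager (RxDM) next to the training container and exits once the workload
  # writes /usr/share/litgpt/workload_terminated.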
  containers:
{{if eq $root.Values.network.useTcpx "yes"}}
  - name: tcpd-daemon
    image: {{$root.Values.network.rxdmContainer}}
    imagePullPolicy: Always
    command:
    - "bash"
    - "-c"
    - |
      /tcpgpudmarxd/build/app/tcpgpudmarxd --gpu_nic_preset a3vm --gpu_shmem_type fd --setup_param "--verbose 128 2 0" &
      while [ ! -e "/usr/share/litgpt/workload_terminated" ]; do sleep 10; done
    securityContext:
      privileged: true
    volumeMounts:
    - name: nvidia-install-dir-host
      mountPath: /usr/local/nvidia/lib64
    - name: tcpd-socket
      mountPath: /tmp
    - name: workload-terminated-volume
      mountPath: /usr/share/litgpt
    env:
    - name: LD_LIBRARY_PATH
      value: /usr/local/nvidia/lib64
{{end}}
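  # Main training container; hyperparameters, data/logging paths, and
  # NCCL/TCPX settings are exported as environment variables for the
  # training entrypoint.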
  - name: litgpt
    image: {{$root.Values.workload.image}}
    imagePullPolicy: Always
    securityContext:
      privileged: true
      capabilities:
        add:
        - SYS_ADMIN
        - SYS_PTRACE
        - IPC_LOCK
    env:
    - name: NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: NODE_IP
      valueFrom:
        fieldRef:
          fieldPath: status.hostIP
    - name: LD_LIBRARY_PATH
      value: "/usr/lib/x86_64-linux-gnu:/usr/local/nvidia/lib64"
    - name: JOB_TIMESTAMP
      value: "{{$root.Values.logging.jobTimestamp}}"
    - name: MASTER_ADDR
      value: "pytorch-leader-{{$.Release.Name}}"
    - name: NCCL_SOCKET_IFNAME
      value: "{{$root.Values.network.ncclIfnames}}"
    - name: NNODES
      value: "{{$node_count}}"
    - name: NODE_RANK
      value: "{{ $node_index }}"
    - name: USE_TCPX
      value: "{{$root.Values.network.useTcpx}}"
    - name: TCPX_FORCE_ACK
      value: "{{$root.Values.network.tcpxForceAck}}"
    - name: DISABLE_PMTU
      value: "{{$root.Values.network.disablePmtu}}"
    - name: CPU_PINNING_MODE
      value: "{{$root.Values.network.cpuPinningMode}}"
    - name: GCS_EXPERIMENT_BUCKET
      value: "{{$root.Values.logging.gcsExperimentBucket}}"
    - name: EXPERIMENT_ROOT_DIR
      value: "{{$root.Values.logging.experimentDir}}"
    - name: GCS_DATA_BUCKET
      value: "{{$root.Values.workload.gcsDataBucket}}"
    - name: DATA_DIR
      value: "{{$root.Values.workload.dataDir}}"
    - name: BATCH_SIZE
      value: "{{$root.Values.workload.batchSize}}"
    - name: MICRO_BATCH_SIZE
      value: "{{$root.Values.workload.microBatchSize}}"
    - name: MODEL_NAME
      value: "{{$root.Values.workload.modelName}}"
    - name: WARMUP_ITERS
      value: "{{$root.Values.workload.warmupIters}}"
    - name: COLLECT_NSYS_PROFILE
      value: "{{$root.Values.logging.collectNsysProfile}}"
    - name: CLUSTER_TYPE
      value: GKE
    - name: NCCL_NVLS_ENABLE
      value: '0'
    - name: NCCL_DEBUG
      value: "{{$root.Values.logging.ncclDebugLevel}}"
    - name: NUMBER_OF_EPOCHS
      value: "{{$root.Values.workload.numberOfEpochs}}"
    - name: STEPS_PER_EPOCH
      value: "{{$root.Values.workload.stepsPerEpoch}}"
    volumeMounts:
    - name: nvidia-install-dir-host
      mountPath: /usr/local/nvidia/lib64
    - name: tcpx-nccl-plugin-volume
      mountPath: /usr/local/tcpx
    - name: tcpd-socket
      mountPath: /tmp
    - name: shared-memory
      mountPath: /dev/shm
    - name: workload-terminated-volume
      mountPath: /usr/share/litgpt
    - name: data-volume
      mountPath: /data
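    # Each pod claims all 8 GPUs on its node (one pod per A3 VM).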
    resources:
      limits:
        nvidia.com/gpu: !!int 8
---
{{end}}