modules/compute/gke-node-pool/gpu-direct-workload/sample-tcpx-workload-job.yaml

# Copyright 2024 "Google LLC"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
# Sample Job that starts two GPU pods from the GPUDirect-TCPX NCCL plugin
# image. Each pod restarts its SSH service and then sleeps indefinitely, so
# the Job never completes on its own and must be deleted when finished.
# NOTE(review): presumably the pods are kept alive so NCCL tests can be
# launched into them manually (exec/ssh) - confirm against the module docs.
apiVersion: batch/v1
kind: Job
metadata:
  name: my-sample-job
spec:
  # Two pods, both running at once, each with its own completion index.
  parallelism: 2
  completions: 2
  completionMode: Indexed
  template:
    spec:
      containers:
        - name: nccl-test
          image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.9
          imagePullPolicy: Always
          command:
            - /bin/sh
            - -c
            # `sleep infinity` never returns, which keeps the container up.
            - |
              service ssh restart;
              sleep infinity;
          env:
            - name: LD_LIBRARY_PATH
              value: /usr/local/nvidia/lib64
          volumeMounts:
            - name: config-volume
              mountPath: /configs
          resources:
            limits:
              nvidia.com/gpu: 8
      volumes:
        - name: config-volume
          configMap:
            name: nccl-configmap
            # 511 decimal == 0777 octal (rwx for user, group and other).
            # Written in decimal because the field is an integer and a
            # leading-zero literal is octal only under YAML 1.1 rules; a
            # YAML 1.2 parser would read 0777 as decimal 777.
            defaultMode: 511
      restartPolicy: Never
  # No retries: combined with restartPolicy Never, a failed pod is not
  # replaced.
  backoffLimit: 0

modules/compute/gke-node-pool/gpu-direct-workload/sample-tcpx-workload-job.yaml (36 lines of code) (raw):