# ssiog/launch/job.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Indexed Kubernetes Job that runs the ssiog training benchmark against
# GCS buckets mounted through the GKE Cloud Storage FUSE CSI driver.
# The name is generated (generateName), so submit it with `kubectl create -f`.
apiVersion: batch/v1
kind: Job
metadata:
  generateName: ssiog-job-training-
spec:
  # Two pods run concurrently; each one owns a distinct completion index.
  parallelism: 2
  completions: 2
  completionMode: Indexed
  template:
    metadata:
      labels:
        # Also used by the podAntiAffinity selector below.
        app: ssiog-job
      annotations:
        # Annotations consumed by the GKE gcsfuse CSI driver.
        # NOTE(review): "0" is expected to lift the sidecar's resource limits;
        # confirm against the GKE Cloud Storage FUSE CSI documentation.
        gke-gcsfuse/volumes: "true"
        gke-gcsfuse/cpu-limit: "0"
        gke-gcsfuse/memory-limit: "0"
        gke-gcsfuse/ephemeral-storage-limit: "0"
    spec:
      # A failed pod is never restarted in place.
      restartPolicy: Never
      serviceAccountName: ssiog-runner-ksa
      hostNetwork: true
      affinity:
        # Never schedule two pods labelled app=ssiog-job on the same node.
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values:
                      - ssiog-job
              topologyKey: "kubernetes.io/hostname"
      containers:
        - name: ssiog-benchmark
          image: us-west1-docker.pkg.dev/gcs-tess/ssiog-training/v0.8.0@sha256:48e36ae4f0920d91951b8fef39e55ddc796a11ed7acfc9b88efd2e8555b9b718
          env:
            # Downward-API values; JOB_NAME and JOB_COMPLETION_INDEX are used
            # by the script below to build per-job / per-pod output paths.
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: JOB_COMPLETION_INDEX
              valueFrom:
                fieldRef:
                  fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
            - name: JOB_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['job-name']
          securityContext:
            privileged: true
          command:
            - /bin/bash
            - -c
            - |
              # Per-job destination inside the metrics bucket mount.
              bucket_dir="/mnt/benchmark-output/test_0-8-0-0/${JOB_NAME}"

              # Directory creation stays best-effort, but failures are logged
              # instead of being silently discarded.
              mkdir -p "${bucket_dir}" \
                || echo "WARNING: could not create ${bucket_dir}" >&2
              mkdir -p /output/ \
                || echo "WARNING: could not create /output/" >&2

              args=(
                --prefix /mnt/benchmark-inputs
                --object-count-limit=16
                --epochs=2
                --background-threads=8
                --sample-size=65536
                --steps=50
                --batch-size=32
                --group-size=1
                --log-metrics=True
                --metrics-file=/output/results-${JOB_COMPLETION_INDEX}.csv
                --log-level=INFO
                --read-order=FullRandom
                --export-metrics=True
                --exporter-type=cloud
                --label=test_0-8-0-0
              )

              # Remember the benchmark's exit status: without this the
              # container would report the status of the trailing `cp`, and a
              # crashed run would be recorded as a successful completion.
              /app/training.py "${args[@]}"
              train_status=$?

              # Still try to upload whatever metrics were written, even when
              # the benchmark failed, so partial results can be inspected.
              echo "Copying the local metrics to bucket..."
              cp -r /output/* "${bucket_dir}/"
              copy_status=$?

              if [ "${train_status}" -ne 0 ]; then
                echo "training.py exited with status ${train_status}" >&2
                exit "${train_status}"
              fi
              exit "${copy_status}"
          volumeMounts:
            - mountPath: /mnt/benchmark-output
              name: gcsfuse-outputs
              readOnly: false
            - mountPath: /mnt/benchmark-inputs
              name: mnt-inputs
              readOnly: true
      volumes:
        # Bucket that receives the per-job metrics CSV files.
        - name: gcsfuse-outputs
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: princer-ssiog-metrics-bkt
        # Bucket holding the benchmark input objects. The volume itself is
        # declared writable; read-only access is applied by the container's
        # volumeMount above.
        - name: mnt-inputs
          csi:
            driver: gcsfuse.csi.storage.gke.io
            readOnly: false
            volumeAttributes:
              bucketName: princer-ssiog-data-bkt
              # NOTE(review): the -1 values are presumably "no expiry / no
              # size cap" for the gcsfuse metadata caches — confirm in the
              # gcsfuse mount-option documentation.
              mountOptions: "debug_fuse,implicit-dirs,metadata-cache:ttl-secs:-1,metadata-cache:stat-cache-max-size-mb:-1,metadata-cache:type-cache-max-size-mb:-1"