# Source: use-cases/model-fine-tuning-pipeline/fine-tuning/pytorch/manifests/fine-tune-l4-dws.yaml (114 lines of code) (raw)
# Headless Service for the fine-tuning Job's pods. The Job's pod template sets
# `subdomain: headless-svc-l4`, and the training command addresses the first
# pod as `finetune-gemma-l4-0.headless-svc-l4`.
apiVersion: v1
kind: Service
metadata:
  name: headless-svc-l4
spec:
  # Selects pods by their job-name label; the value must match the Job name.
  selector:
    job-name: finetune-gemma-l4
  # clusterIP must be None to create a headless service.
  clusterIP: None
---
# Indexed Job that launches `accelerate launch ... fine_tune.py` in two pods,
# each requesting two NVIDIA L4 GPUs. Values of the form V_* look like
# placeholders substituted before the manifest is applied - TODO confirm
# against the deployment tooling.
apiVersion: batch/v1
kind: Job
metadata:
  name: finetune-gemma-l4
spec:
  # Two indexed pods run in parallel; each pod's JOB_COMPLETION_INDEX is passed
  # to accelerate as --machine_rank in the container command.
  completionMode: Indexed
  completions: 2
  parallelism: 2
  # No retries: backoffLimit is 0.
  backoffLimit: 0
  template:
    metadata:
      annotations:
        # Queued-provisioning annotations; `l4-job` presumably names an existing
        # ProvisioningRequest - verify it exists in the target namespace.
        cluster-autoscaler.kubernetes.io/provisioning-class-name: "queued-provisioning.gke.io"
        cluster-autoscaler.kubernetes.io/consume-provisioning-request: "l4-job"
        # GCS FUSE annotations for the CSI ephemeral volume declared under `volumes`.
        gke-gcsfuse/volumes: "true"
        gke-gcsfuse/memory-limit: "35Gi"
      labels:
        app: finetune-job
        ml-platform: fine-tuning
    spec:
      serviceAccountName: V_KSA
      # Must equal the headless Service name so that the pod address
      # finetune-gemma-l4-0.headless-svc-l4 used in the command resolves.
      subdomain: headless-svc-l4
      restartPolicy: OnFailure
      terminationGracePeriodSeconds: 600
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
        - key: "on-demand"
          operator: "Equal"
          value: "true"
          effect: "NoSchedule"
      volumes:
        # Memory-backed emptyDir mounted at /dev/shm in the container.
        - name: dshm
          emptyDir:
            medium: Memory
        # Cloud Storage bucket mounted through the GCS FUSE CSI driver.
        - name: gcs-fuse-csi-ephemeral
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: V_MODEL_BUCKET
              mountOptions: "implicit-dirs"
              gcsfuseLoggingSeverity: warning
      containers:
        - name: gpu-job
          image: V_IMAGE_URL
          imagePullPolicy: Always
          # NOTE(review): privileged mode grants the container broad access to
          # the host - confirm it is actually required for this workload.
          securityContext:
            privileged: true
          ports:
            # Same port that is passed as --main_process_port below.
            - containerPort: 29500
          resources:
            limits:
              nvidia.com/gpu: "2"
            requests:
              nvidia.com/gpu: "2"
          # --num_machines 2 matches completions/parallelism above;
          # --num_processes 4 is presumably 2 machines x 2 GPUs per pod.
          command:
            - bash
            - -c
            - |
              accelerate launch \
              --config_file fsdp_config.yaml \
              --debug \
              --main_process_ip finetune-gemma-l4-0.headless-svc-l4 \
              --main_process_port 29500 \
              --machine_rank ${JOB_COMPLETION_INDEX} \
              --num_processes 4 \
              --num_machines 2 \
              fine_tune.py
          env:
            - name: EXPERIMENT
              value: "V_EXPERIMENT"
            - name: MLFLOW_ENABLE
              value: "V_MLFLOW_ENABLE"
            - name: MLFLOW_TRACKING_URI
              value: "V_MLFLOW_TRACKING_URI"
            - name: MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING
              value: "V_MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"
            - name: TRAINING_DATASET_BUCKET
              value: "V_DATA_BUCKET"
            - name: TRAINING_DATASET_PATH
              value: "V_TRAINING_DATASET_PATH"
            - name: MODEL_NAME
              value: "V_MODEL_NAME"
            - name: NEW_MODEL
              value: "gemma-ft"
            - name: MODEL_PATH
              value: "V_MODEL_PATH"
            - name: EPOCHS
              value: "1"
            - name: TRAIN_BATCH_SIZE
              value: "V_TRAIN_BATCH_SIZE"
            # Read from the `hf-secret` Secret instead of being inlined here.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-secret
                  key: hf_api_token
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
            - name: gcs-fuse-csi-ephemeral
              mountPath: /model-data
              readOnly: false