# use-cases/model-fine-tuning-pipeline/fine-tuning/pytorch/manifests/fine-tune-l4-dws.yaml

---
# Headless Service that gives each indexed Job pod a stable DNS name
# (<job-name>-<index>.headless-svc-l4), used by accelerate for rendezvous.
apiVersion: v1
kind: Service
metadata:
  name: headless-svc-l4
spec:
  clusterIP: None  # clusterIP must be None to create a headless service
  selector:
    job-name: finetune-gemma-l4  # must match Job name
---
# Indexed Job: 2 workers (one per node), each with 2 L4 GPUs, launched
# through GKE Dynamic Workload Scheduler (queued provisioning).
apiVersion: batch/v1
kind: Job
metadata:
  name: finetune-gemma-l4
spec:
  backoffLimit: 0
  completions: 2
  parallelism: 2
  completionMode: Indexed  # exposes JOB_COMPLETION_INDEX, used as machine_rank
  template:
    metadata:
      labels:
        app: finetune-job
        ml-platform: fine-tuning
      annotations:
        gke-gcsfuse/volumes: "true"
        gke-gcsfuse/memory-limit: "35Gi"
        # Pods consume the pre-created ProvisioningRequest named "l4-job"
        # instead of triggering regular autoscaling.
        cluster-autoscaler.kubernetes.io/consume-provisioning-request: l4-job
        cluster-autoscaler.kubernetes.io/provisioning-class-name: "queued-provisioning.gke.io"
    spec:
      containers:
      - name: gpu-job
        imagePullPolicy: Always
        image: V_IMAGE_URL
        ports:
        - containerPort: 29500  # accelerate main process rendezvous port
        securityContext:
          privileged: true
        resources:
          requests:
            nvidia.com/gpu: "2"
          limits:
            nvidia.com/gpu: "2"
        command:
        - bash
        - -c
        - |
          accelerate launch \
          --config_file fsdp_config.yaml \
          --debug \
          --main_process_ip finetune-gemma-l4-0.headless-svc-l4 \
          --main_process_port 29500 \
          --machine_rank ${JOB_COMPLETION_INDEX} \
          --num_processes 4 \
          --num_machines 2 \
          fine_tune.py
        env:
        - name: "EXPERIMENT"
          value: "V_EXPERIMENT"
        - name: "MLFLOW_ENABLE"
          value: "V_MLFLOW_ENABLE"
        - name: "MLFLOW_TRACKING_URI"
          value: "V_MLFLOW_TRACKING_URI"
        - name: "MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"
          value: "V_MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"
        - name: "TRAINING_DATASET_BUCKET"
          value: "V_DATA_BUCKET"
        - name: "TRAINING_DATASET_PATH"
          value: "V_TRAINING_DATASET_PATH"
        - name: MODEL_NAME
          value: "V_MODEL_NAME"
        - name: NEW_MODEL
          value: "gemma-ft"
        - name: MODEL_PATH
          value: "V_MODEL_PATH"
        - name: EPOCHS
          value: "1"
        - name: TRAIN_BATCH_SIZE
          value: "V_TRAIN_BATCH_SIZE"
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        volumeMounts:
        - mountPath: /dev/shm  # tmpfs-backed shared memory for NCCL/dataloaders
          name: dshm
        - name: gcs-fuse-csi-ephemeral
          mountPath: /model-data
          readOnly: false
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      restartPolicy: OnFailure
      serviceAccountName: V_KSA
      subdomain: headless-svc-l4  # must match the headless Service name
      terminationGracePeriodSeconds: 600
      tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
      - key: "on-demand"
        value: "true"
        operator: "Equal"
        effect: "NoSchedule"
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      - name: gcs-fuse-csi-ephemeral
        csi:
          driver: gcsfuse.csi.storage.gke.io
          volumeAttributes:
            bucketName: V_MODEL_BUCKET
            mountOptions: "implicit-dirs"
            gcsfuseLoggingSeverity: warning