# Source: train-cnn/train.yaml

# Google Cloud Batch job definition: runs two containerized scripts, separated
# by barriers, as 8 tasks with one task per VM and one NVIDIA V100 GPU per VM.
# Replace every <...> placeholder before submitting the job.
taskGroups:
  - taskSpec:
      # Runnables execute in the order listed, once per task.
      runnables:
        # Step 1: run the image's localize script.
        # NOTE(review): presumably prepares per-VM state under /tmp for the
        # training step -- confirm against /scripts/localize.sh in the image.
        - container:
            imageUri: 'us-central1-docker.pkg.dev/<your project>/<your repo>/<your image name>'
            # Container.volumes is a list of docker-style bind mounts
            # (host:container), so it is written as a sequence.
            volumes:
              - '/tmp:/tmp'
            commands:
              - /scripts/localize.sh
        # Step 2: wait until every task in the group has reached this point.
        - barrier:
            name: all-vms-ready
        # Step 3: run the training script from the same image. The extra
        # docker option puts the container on the host network.
        # NOTE(review): host networking is presumably needed so workers can
        # reach each other directly -- confirm against /scripts/runtorch.sh.
        - container:
            imageUri: 'us-central1-docker.pkg.dev/<your project>/<your repo>/<your image name>'
            options: '--network host'
            volumes:
              - '/tmp:/tmp'
            commands:
              - /scripts/runtorch.sh
        # Step 4: wait until every task has finished step 3.
        - barrier:
            name: training-complete
      environment:
        variables:
          # Points inside the NFS mount declared under `volumes` below.
          DATA_ROOT: /mnt/disks/dogs-vs-cats/data
      # NFS share (the placeholders refer to a Filestore instance) mounted on
      # each VM at mountPath.
      volumes:
        - nfs:
            server: '<your Filestore IP address>'
            remotePath: '<your Filestore share path, e.g. /share>'
          mountPath: /mnt/disks/dogs-vs-cats
    # 8 tasks in total, at most one task per VM.
    taskCount: 8
    taskCountPerNode: 1
    # Ask Batch to write a file listing the VMs assigned to this task group.
    requireHostsFile: true

allocationPolicy:
  instances:
    - policy:
        bootDisk:
          sizeGb: 250
        machineType: n1-standard-4
        accelerators:
          - type: nvidia-tesla-v100
            count: 1
      # Let Batch install the GPU drivers for the accelerators above.
      installGpuDrivers: true

logsPolicy:
  destination: CLOUD_LOGGING