# qwiklabs/llama2-finetuning-slurm/hpc-slurm-llama2.yaml (260 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# Blueprint: a Slurm cluster with four partitions (n1t4, n2, g2gpu8, a2gpu4),
# an NFS-backed /home, a Cloud Storage bucket mounted at /data_bucket, and
# two monitoring dashboards.
blueprint_name: hpc-slurm-llama

# Deployment-wide variables, referenced below as $(vars.<name>).
# project_id and bucket_model are intentionally left empty and must be filled
# in before deploying.
vars:
  project_id:  ## Set GCP Project ID Here ##
  bucket_model:  ## Set your bucket name prefix here ##
  deployment_name: hpc-slurm-llama2
  region: us-central1
  zone: us-central1-a
  zone_list: [us-central1-a, us-central1-b, us-central1-c]
  # Custom image used by every nodeset, the login node and the controller.
  image_name: llama2-slurm-v6-20241115t205427z
  image_project: qwiklabs-resources
  instance_image_custom: true
  disk_size_gb: 200

# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md

deployment_groups:
  # Group 1: enable the Google Cloud APIs listed in gcp_service_list.
  - group: enable_apis
    modules:
      - id: enable_apis
        source: community/modules/project/service-enablement
        settings:
          gcp_service_list:
            - 'cloudresourcemanager.googleapis.com'
            - 'stackdriver.googleapis.com'
            - 'iam.googleapis.com'
            - 'logging.googleapis.com'
            - 'compute.googleapis.com'

  # Group 2: dashboards, network, file systems and the shared startup script.
  - group: setup
    modules:
      # Monitoring
      - id: hpc_dash
        source: modules/monitoring/dashboard
        settings:
          title: HPC

      # GPU dashboard: starts from the "Empty" base dashboard and adds two
      # line-chart widgets, each given as a JSON document.
      - id: gpu_dash
        source: modules/monitoring/dashboard
        settings:
          title: GPU
          base_dashboard: Empty
          widgets:
            # Widget 1: agent.googleapis.com/gpu/memory/bytes_used on
            # gce_instance resources, aligned with ALIGN_MEAN.
            - |
              {
                "title": "GPU Memory Utilization",
                "xyChart": {
                  "dataSets": [
                    {
                      "timeSeriesQuery": {
                        "timeSeriesFilter": {
                          "filter": "metric.type=\"agent.googleapis.com/gpu/memory/bytes_used\" resource.type=\"gce_instance\"",
                          "aggregation": {
                            "perSeriesAligner": "ALIGN_MEAN",
                            "crossSeriesReducer": "REDUCE_NONE",
                            "groupByFields": []
                          }
                        }
                      },
                      "plotType": "LINE",
                      "targetAxis": "Y1",
                      "minAlignmentPeriod": "60s"
                    }
                  ],
                  "chartOptions": {
                    "mode": "COLOR",
                    "displayHorizontal": false
                  },
                  "thresholds": [],
                  "yAxis": {
                    "scale": "LINEAR"
                  }
                }
              }
            # Widget 2: Prometheus-style query averaging gpu_utilization over
            # the dashboard interval.
            - |
              {
                "title": "GPU Utilization",
                "xyChart": {
                  "dataSets": [
                    {
                      "timeSeriesQuery": {
                        "prometheusQuery": "avg_over_time(agent_googleapis_com:gpu_utilization{monitored_resource=\"gce_instance\"}[${__interval}])"
                      },
                      "plotType": "LINE",
                      "targetAxis": "Y1"
                    }
                  ],
                  "chartOptions": {
                    "mode": "COLOR",
                    "displayHorizontal": false
                  },
                  "thresholds": [],
                  "yAxis": {
                    "scale": "LINEAR"
                  }
                }
              }

      # Network
      - id: network1
        source: modules/network/vpc

      # File systems
      # NFS server exporting /home, attached to network1.
      - id: homefs
        source: community/modules/file-system/nfs-server
        use: [network1]
        settings:
          local_mounts: [/home]
          disk_size: 2560
          instance_image:
            project: 'cloud-hpc-image-public'
            family: 'hpc-rocky-linux-8'

      # Cloud Storage bucket named from $(vars.bucket_model) plus a random
      # suffix, mounted at /data_bucket.
      - id: data_bucket
        source: community/modules/file-system/cloud-storage-bucket
        settings:
          name_prefix: $(vars.bucket_model)
          random_suffix: true
          force_destroy: true
          local_mount: /data_bucket
          # NOTE(review): file_mode=766 has no leading zero while
          # dir_mode=0777 does, and grants execute to the owner only —
          # confirm this is the intended mode.
          mount_options: 'defaults,_netdev,implicit_dirs,allow_other,dir_mode=0777,file_mode=766'

      # Local module under ./files that consumes data_bucket.
      - id: move_files
        source: ./files
        use: [data_bucket]

      # Startup script used by the controller and login node (see the
      # slurm_controller settings). It initialises conda, removes the legacy
      # Stackdriver monitoring agent, installs the Ops Agent, and installs
      # Hugging Face transformers from git.
      - id: startup_script
        source: modules/scripts/startup-script
        settings:
          # The Ops Agent is installed by the shell runner below instead.
          install_cloud_ops_agent: false
          runners:
            - type: shell
              destination: startup-script.sh
              content: |
                #!/bin/bash
                CONDA_BASE=/opt/conda
                source $CONDA_BASE/bin/activate base
                conda init --system

                # UnInstall Stackdriver Agent

                sudo systemctl stop stackdriver-agent.service
                sudo systemctl disable stackdriver-agent.service
                curl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh
                sudo dpkg --configure -a
                sudo bash add-monitoring-agent-repo.sh --uninstall
                sudo bash add-monitoring-agent-repo.sh --remove-repo

                # Install ops-agent

                curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
                sudo bash add-google-cloud-ops-agent-repo.sh --also-install
                sudo service google-cloud-ops-agent start

                pip install git+https://github.com/huggingface/transformers

  # Group 3: Slurm nodesets, partitions, login node and controller.
  - group: cluster
    modules:
      # n1-standard-96 nodes, each with 4 x nvidia-tesla-t4; placement
      # policy explicitly disabled.
      - id: n1t4_nodeset
        source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
        use: [network1]
        settings:
          zones: $(vars.zone_list)
          node_count_dynamic_max: 1
          bandwidth_tier: gvnic_enabled
          disk_size_gb: $(vars.disk_size_gb)
          enable_public_ips: true
          enable_placement: false
          advanced_machine_features:
            threads_per_core: 1
          machine_type: n1-standard-96
          guest_accelerator:
            - type: nvidia-tesla-t4
              count: 4
          on_host_maintenance: TERMINATE
          instance_image:
            name: $(vars.image_name)
            project: $(vars.image_project)

      # NOTE(review): n1t4_partition and n2_partition both set
      # is_default: true — confirm which partition is meant to be the default.
      - id: n1t4_partition
        source: community/modules/compute/schedmd-slurm-gcp-v6-partition
        use: [n1t4_nodeset]
        settings:
          partition_name: n1t4
          is_default: true
          exclusive: false

      # n2-standard-4 nodes with no accelerator configured.
      - id: n2_nodeset
        source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
        use: [network1]
        settings:
          zones: $(vars.zone_list)
          node_count_dynamic_max: 1
          bandwidth_tier: gvnic_enabled
          disk_size_gb: $(vars.disk_size_gb)
          enable_public_ips: true
          advanced_machine_features:
            threads_per_core: 1
          machine_type: n2-standard-4
          on_host_maintenance: TERMINATE
          instance_image:
            name: $(vars.image_name)
            project: $(vars.image_project)

      # NOTE(review): also marked is_default: true, like n1t4_partition.
      - id: n2_partition
        source: community/modules/compute/schedmd-slurm-gcp-v6-partition
        use: [n2_nodeset]
        settings:
          partition_name: n2
          is_default: true

      # g2-standard-96 nodes, exposed as partition g2gpu8.
      - id: g2_nodeset
        source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
        use: [network1]
        settings:
          zones: $(vars.zone_list)
          node_count_dynamic_max: 1
          bandwidth_tier: gvnic_enabled
          disk_size_gb: $(vars.disk_size_gb)
          enable_public_ips: true
          advanced_machine_features:
            threads_per_core: 1
          machine_type: g2-standard-96
          on_host_maintenance: TERMINATE
          instance_image:
            name: $(vars.image_name)
            project: $(vars.image_project)

      - id: g2_partition
        source: community/modules/compute/schedmd-slurm-gcp-v6-partition
        use: [g2_nodeset]
        settings:
          partition_name: g2gpu8
          is_default: false

      # a2-highgpu-4g nodes, exposed as partition a2gpu4.
      - id: a2_nodeset
        source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
        use: [network1]
        settings:
          zones: $(vars.zone_list)
          node_count_dynamic_max: 1
          bandwidth_tier: gvnic_enabled
          disk_size_gb: $(vars.disk_size_gb)
          enable_public_ips: true
          advanced_machine_features:
            threads_per_core: 1
          machine_type: a2-highgpu-4g
          on_host_maintenance: TERMINATE
          instance_image:
            name: $(vars.image_name)
            project: $(vars.image_project)

      - id: a2_partition
        source: community/modules/compute/schedmd-slurm-gcp-v6-partition
        use: [a2_nodeset]
        settings:
          partition_name: a2gpu4
          is_default: false

      # Login node on the custom image, with a public IP.
      - id: slurm_login
        source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
        use: [network1]
        settings:
          name_prefix: login
          machine_type: n2-standard-4
          enable_login_public_ips: true
          instance_image:
            name: $(vars.image_name)
            project: $(vars.image_project)

      # Controller: ties together the network, all four partitions, the login
      # node and both file systems. The same startup script is applied to the
      # controller and the login node, each with a 21600 timeout.
      - id: slurm_controller
        source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
        use:
          - network1
          - n1t4_partition
          - n2_partition
          - g2_partition
          - a2_partition
          - slurm_login
          - homefs
          - data_bucket
        settings:
          enable_controller_public_ips: true
          controller_startup_script: $(startup_script.startup_script)
          controller_startup_scripts_timeout: 21600
          login_startup_script: $(startup_script.startup_script)
          login_startup_scripts_timeout: 21600
          instance_image:
            name: $(vars.image_name)
            project: $(vars.image_project)