# llama2-finetuning-slurm/hpc-slurm-llama2.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

blueprint_name: hpc-slurm-llama

vars:
  project_id:  ## Set GCP Project ID Here ##
  bucket_model:  ## Set your bucket name prefix here ##
  deployment_name: hpc-slurm-llama2
  region: us-central1
  zone: us-central1-a
  zone_list: [us-central1-a, us-central1-b, us-central1-c]
  new_image_family: llama2-slurm-v6
  instance_image_custom: true
  disk_size_gb: 200

# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md

deployment_groups:
- group: enable_apis
  modules:
  - id: enable_apis
    source: community/modules/project/service-enablement
    settings:
      gcp_service_list: [
        "cloudresourcemanager.googleapis.com",
        "stackdriver.googleapis.com",
        "iam.googleapis.com",
        "logging.googleapis.com",
        "compute.googleapis.com"
      ]

- group: setup
  modules:

  ## Monitoring
  - id: hpc_dash
    source: modules/monitoring/dashboard
    settings:
      title: HPC

  - id: gpu_dash
    source: modules/monitoring/dashboard
    settings:
      title: GPU
      base_dashboard: Empty
      widgets:
      - |
        {
          "title": "GPU Memory Utilization",
          "xyChart": {
            "dataSets": [
              {
                "timeSeriesQuery": {
                  "timeSeriesFilter": {
                    "filter": "metric.type=\"agent.googleapis.com/gpu/memory/bytes_used\" resource.type=\"gce_instance\"",
                    "aggregation": {
                      "perSeriesAligner": "ALIGN_MEAN",
                      "crossSeriesReducer": "REDUCE_NONE",
                      "groupByFields": []
                    }
                  }
                },
                "plotType": "LINE",
                "targetAxis": "Y1",
                "minAlignmentPeriod": "60s"
              }
            ],
            "chartOptions": {
              "mode": "COLOR",
              "displayHorizontal": false
            },
            "thresholds": [],
            "yAxis": {
              "scale": "LINEAR"
            }
          }
        }
      - |
        {
          "title": "GPU Utilization",
          "xyChart": {
            "dataSets": [
              {
                "timeSeriesQuery": {
                  "prometheusQuery": "avg_over_time(agent_googleapis_com:gpu_utilization{monitored_resource=\"gce_instance\"}[${__interval}])"
                },
                "plotType": "LINE",
                "targetAxis": "Y1"
              }
            ],
            "chartOptions": {
              "mode": "COLOR",
              "displayHorizontal": false
            },
            "thresholds": [],
            "yAxis": {
              "scale": "LINEAR"
            }
          }
        }

  ## Network
  - id: network1
    source: modules/network/vpc

  ## Filesystems
  - id: homefs
    source: community/modules/file-system/nfs-server
    use: [network1]
    settings:
      local_mounts: [/home]
      disk_size: 2560
      instance_image:
        project: "cloud-hpc-image-public"
        family: "hpc-rocky-linux-8"

  - id: data_bucket
    source: community/modules/file-system/cloud-storage-bucket
    settings:
      name_prefix: $(vars.bucket_model)
      random_suffix: true
      force_destroy: true
      local_mount: /data_bucket
      mount_options: defaults,_netdev,implicit_dirs,allow_other,dir_mode=0777,file_mode=766

  - id: move_files
    source: ./files
    use: [data_bucket]
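
  ## NOTE (added example): the data_bucket module above mounts the bucket at
  ## /data_bucket on every node via gcsfuse, and move_files copies the local
  ## ./files directory into it at deploy time. If you need to stage extra assets
  ## (for example a fine-tuning dataset) after deployment, you can copy them in
  ## directly. The bucket gets a random suffix, so the name below is only a
  ## placeholder; list buckets first to find the real one:
  ##
  ##   gcloud storage buckets list --project <project_id>
  ##   gcloud storage cp ./my-dataset.json gs://<bucket_model>-<suffix>/datasets/
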
  ## Install Scripts
  - id: packer_script  # configure conda environment for llama
    source: modules/scripts/startup-script
    settings:
      runners:
      - type: shell
        destination: install-ml-libraries.sh
        content: |
          #!/bin/bash
          # this script is designed to execute on Slurm images published by SchedMD that:
          # - are based on Debian 11 distribution of Linux
          # - have NVIDIA Drivers v530 pre-installed
          # - have CUDA Toolkit 12.1 pre-installed.

          set -e -o pipefail

          CONDA_BASE=/opt/conda

          # skip installation if conda is already present on the image
          if [ -d $CONDA_BASE ]; then
              exit 0
          fi

          DL_DIR=$(mktemp -d)
          cd $DL_DIR
          curl -O https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-x86_64.sh
          HOME=$DL_DIR bash Miniconda3-py310_23.3.1-0-Linux-x86_64.sh -b -p $CONDA_BASE
          cd -
          rm -rf $DL_DIR
          unset DL_DIR

          tee /tmp/llama2_env.yml << EOLL
          name: llama2
          channels:
          - conda-forge
          - nvidia
          - nvidia/label/cuda-12.4.0
          dependencies:
          - appdirs
          - loralib
          - black
          - black-jupyter
          - py7zr
          - scipy
          - optimum
          - datasets
          - accelerate
          - peft
          - fairscale
          - fire
          - sentencepiece
          - transformers
          - huggingface_hub
          - git
          - pip
          - pip:
            - bitsandbytes
            - nvidia-cudnn-cu12
            - dataclasses
            - nvidia-nccl-cu12
            - trl
            - torch
            - torchaudio
            - torchvision
            - nvitop
          EOLL

          source $CONDA_BASE/bin/activate base
          conda env create -n llama2 --file /tmp/llama2_env.yml

  - id: startup_script
    source: modules/scripts/startup-script
    settings:
      install_cloud_ops_agent: false
      runners:
      - type: shell
        destination: startup-script.sh
        content: |
          #!/bin/bash
          CONDA_BASE=/opt/conda
          source $CONDA_BASE/bin/activate base
          conda init --system

          # uninstall the legacy Stackdriver monitoring agent
          sudo systemctl stop stackdriver-agent.service
          sudo systemctl disable stackdriver-agent.service
          curl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh
          sudo dpkg --configure -a
          sudo bash add-monitoring-agent-repo.sh --uninstall
          sudo bash add-monitoring-agent-repo.sh --remove-repo

          # install the Ops Agent
          curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
          sudo bash add-google-cloud-ops-agent-repo.sh --also-install
          sudo service google-cloud-ops-agent start

- group: packer
  modules:
  - id: custom-image
    source: modules/packer/custom-image
    kind: packer
    use:
    - network1
    - packer_script
    settings:
      source_image_project_id: [schedmd-slurm-public]
      source_image_family: slurm-gcp-6-6-debian-11
      disk_size: $(vars.disk_size_gb)
      image_family: $(vars.new_image_family)
      machine_type: c2-standard-8  # building this image does not require a GPU-enabled VM
      state_timeout: 30m
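
# NOTE (added example): the packer group above bakes the conda environment into a
# custom image published under the family set in vars.new_image_family; every
# nodeset, login, and controller module in the cluster group below boots from
# that family. After the packer group has been deployed, you can confirm the
# image exists with something like the command below (project ID is a
# placeholder):
#
#   gcloud compute images list --project <project_id> --filter="family=llama2-slurm-v6"
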
- group: cluster
  modules:
  - id: n1t4_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network1]
    settings:
      zones: $(vars.zone_list)
      node_count_dynamic_max: 1
      bandwidth_tier: gvnic_enabled
      disk_size_gb: $(vars.disk_size_gb)
      enable_public_ips: true
      enable_placement: false
      advanced_machine_features:
        threads_per_core: 1
      machine_type: n1-standard-96
      guest_accelerator:
      - type: nvidia-tesla-t4
        count: 4
      on_host_maintenance: TERMINATE
      instance_image:
        family: $(vars.new_image_family)
        project: $(vars.project_id)

  - id: n1t4_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use: [n1t4_nodeset]
    settings:
      partition_name: n1t4
      is_default: true
      exclusive: false

  - id: n2_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network1]
    settings:
      zones: $(vars.zone_list)
      node_count_dynamic_max: 1
      bandwidth_tier: gvnic_enabled
      disk_size_gb: $(vars.disk_size_gb)
      enable_public_ips: true
      advanced_machine_features:
        threads_per_core: 1
      machine_type: n2-standard-4
      on_host_maintenance: TERMINATE
      instance_image:
        family: $(vars.new_image_family)
        project: $(vars.project_id)

  - id: n2_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use: [n2_nodeset]
    settings:
      partition_name: n2
      is_default: true

  - id: g2_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network1]
    settings:
      zones: $(vars.zone_list)
      node_count_dynamic_max: 1
      bandwidth_tier: gvnic_enabled
      disk_size_gb: $(vars.disk_size_gb)
      enable_public_ips: true
      advanced_machine_features:
        threads_per_core: 1
      machine_type: g2-standard-96
      on_host_maintenance: TERMINATE
      instance_image:
        family: $(vars.new_image_family)
        project: $(vars.project_id)

  - id: g2_partition
    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
    use: [g2_nodeset]
    settings:
      partition_name: g2gpu8
      is_default: false

  - id: slurm_login
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
    use: [network1]
    settings:
      name_prefix: login
      machine_type: n2-standard-4
      enable_login_public_ips: true
      instance_image:
        family: $(vars.new_image_family)
        project: $(vars.project_id)

  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
    use:
    - network1
    - n1t4_partition
    - n2_partition
    - g2_partition
    - slurm_login
    - homefs
    - data_bucket
    settings:
      enable_controller_public_ips: true
      controller_startup_script: $(startup_script.startup_script)
      controller_startup_scripts_timeout: 21600
      login_startup_script: $(startup_script.startup_script)
      login_startup_scripts_timeout: 21600
      instance_image:
        family: $(vars.new_image_family)
        project: $(vars.project_id)
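
# NOTE (added example): this blueprint is consumed by the Cluster Toolkit
# (formerly HPC Toolkit) ghpc/gcluster binary. A typical workflow, sketched here
# only as a guide (exact binary name and flags depend on your toolkit version),
# is to create the deployment folder from the blueprint and then deploy each
# group in order (enable_apis, setup, packer, cluster):
#
#   ./ghpc create hpc-slurm-llama2.yaml --vars project_id=<your-project>
#   ./ghpc deploy hpc-slurm-llama2
#
# Once the cluster group is up, SSH to the login node and submit work to the
# T4 GPU partition; the job script name below is a placeholder:
#
#   sbatch --partition=n1t4 --gres=gpu:4 finetune-llama2.sh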