# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
blueprint_name: hpc-slurm-llama
vars:
project_id: ## Set GCP Project ID Here ##
bucket_model: ## Set your bucket name prefix here ##
deployment_name: hpc-slurm-llama2
region: us-central1
zone: us-central1-a
zone_list: [us-central1-a, us-central1-b, us-central1-c]
new_image_family: llama2-slurm-v6
instance_image_custom: true
disk_size_gb: 200
# Documentation for each of the modules used below can be found at
# https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md
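# A minimal sketch of how a blueprint like this is typically deployed with the
# Cluster Toolkit CLI (commands assumed from the toolkit docs; the binary may
# be named ghpc or gcluster depending on the release you build):
#   ./ghpc create hpc-slurm-llama2.yaml --vars project_id=<your-project>
#   ./ghpc deploy hpc-slurm-llama2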
deployment_groups:
- group: enable_apis
modules:
- id: enable_apis
source: community/modules/project/service-enablement
settings:
gcp_service_list: [
"cloudresourcemanager.googleapis.com",
"stackdriver.googleapis.com",
"iam.googleapis.com",
"logging.googleapis.com",
"compute.googleapis.com"
]
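      # Illustrative (not part of the blueprint): once this group has run, the
      # enabled services can be sanity-checked with
      #   gcloud services list --enabled --project <your-project>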
- group: setup
modules:
## Monitoring
- id: hpc_dash
source: modules/monitoring/dashboard
settings:
title: HPC
- id: gpu_dash
source: modules/monitoring/dashboard
settings:
title: GPU
base_dashboard: Empty
widgets:
- |
{
"title": "GPU Memory Utilization",
"xyChart": {
"dataSets": [
{
"timeSeriesQuery": {
"timeSeriesFilter": {
"filter": "metric.type=\"agent.googleapis.com/gpu/memory/bytes_used\" resource.type=\"gce_instance\"",
"aggregation": {
"perSeriesAligner": "ALIGN_MEAN",
"crossSeriesReducer": "REDUCE_NONE",
"groupByFields": []
}
}
},
"plotType": "LINE",
"targetAxis": "Y1",
"minAlignmentPeriod": "60s"
}
],
"chartOptions": {
"mode": "COLOR",
"displayHorizontal": false
},
"thresholds": [],
"yAxis": {
"scale": "LINEAR"
}
}
}
- |
{
"title": "GPU Utilization",
"xyChart": {
"dataSets": [
{
"timeSeriesQuery": {
"prometheusQuery": "avg_over_time(agent_googleapis_com:gpu_utilization{monitored_resource=\"gce_instance\"}[${__interval}])"
},
"plotType": "LINE",
"targetAxis": "Y1"
}
],
"chartOptions": {
"mode": "COLOR",
"displayHorizontal": false
},
"thresholds": [],
"yAxis": {
"scale": "LINEAR"
}
}
}
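      # Both widgets read GPU metrics that the Ops Agent (installed by the
      # startup script later in this file) publishes under
      # agent.googleapis.com/gpu/*. Illustrative check after deployment:
      #   gcloud monitoring dashboards list --project <your-project>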
  ## Network
- id: network1
source: modules/network/vpc
## Filesystems
- id: homefs
source: community/modules/file-system/nfs-server
use: [network1]
settings:
local_mounts: [/home]
disk_size: 2560
instance_image:
project: "cloud-hpc-image-public"
family: "hpc-rocky-linux-8"
- id: data_bucket
source: community/modules/file-system/cloud-storage-bucket
settings:
name_prefix: $(vars.bucket_model)
random_suffix: true
force_destroy: true
local_mount: /data_bucket
      mount_options: defaults,_netdev,implicit_dirs,allow_other,dir_mode=0777,file_mode=0766
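      # The bucket is mounted on cluster nodes at /data_bucket via Cloud
      # Storage FUSE. Illustrative upload of training data from a workstation
      # (the deployed bucket name carries a random suffix, so list it first):
      #   gcloud storage ls --project <your-project>
      #   gcloud storage cp -r ./my-dataset gs://<full-bucket-name>/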
  - id: move_files  # stages the tutorial's local ./files content into the bucket
source: ./files
use: [data_bucket]
## Install Scripts
- id: packer_script
    # Configures the conda environment used for Llama 2 fine-tuning
source: modules/scripts/startup-script
settings:
runners:
- type: shell
destination: install-ml-libraries.sh
content: |
#!/bin/bash
          # This script is designed to run on Slurm images published by SchedMD that:
          # - are based on the Debian 11 distribution of Linux
          # - have NVIDIA drivers v530 pre-installed
          # - have CUDA Toolkit 12.1 pre-installed
set -e -o pipefail
CONDA_BASE=/opt/conda
if [ -d $CONDA_BASE ]; then
exit 0
fi
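          # Note: the backslash in "\$(" below keeps ghpc from treating it as a
          # blueprint \$(...) expression; the rendered script still runs mktemp.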
DL_DIR=\$(mktemp -d)
cd $DL_DIR
curl -O https://repo.anaconda.com/miniconda/Miniconda3-py310_23.3.1-0-Linux-x86_64.sh
HOME=$DL_DIR bash Miniconda3-py310_23.3.1-0-Linux-x86_64.sh -b -p $CONDA_BASE
cd -
rm -rf $DL_DIR
unset DL_DIR
tee /tmp/llama2_env.yml << EOLL
name: llama2
channels:
- conda-forge
- nvidia
- nvidia/label/cuda-12.4.0
dependencies:
- appdirs
- loralib
- black
- black-jupyter
- py7zr
- scipy
- optimum
- datasets
- accelerate
- peft
- fairscale
- fire
- sentencepiece
- transformers
- huggingface_hub
- git
- pip
- pip:
- bitsandbytes
- nvidia-cudnn-cu12
- dataclasses
- nvidia-nccl-cu12
- trl
- torch
- torchaudio
- torchvision
- nvitop
EOLL
source $CONDA_BASE/bin/activate base
conda env create -n llama2 --file /tmp/llama2_env.yml
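    # A quick way to confirm the baked environment once the cluster is up
    # (illustrative; run from the login node, partition names defined below):
    #   srun -p n1t4 --gpus-per-node=4 bash -c \
    #     'source /opt/conda/bin/activate llama2 && python -c "import torch; print(torch.cuda.is_available())"'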
- id: startup_script
source: modules/scripts/startup-script
settings:
      install_cloud_ops_agent: false  # the runner below removes the legacy agent and installs the Ops Agent itself
runners:
- type: shell
destination: startup-script.sh
content: |
#!/bin/bash
CONDA_BASE=/opt/conda
source $CONDA_BASE/bin/activate base
conda init --system
          # Uninstall the legacy Stackdriver monitoring agent (superseded by the Ops Agent)
sudo systemctl stop stackdriver-agent.service
sudo systemctl disable stackdriver-agent.service
curl -sSO https://dl.google.com/cloudagents/add-monitoring-agent-repo.sh
sudo dpkg --configure -a
sudo bash add-monitoring-agent-repo.sh --uninstall
sudo bash add-monitoring-agent-repo.sh --remove-repo
# Install ops-agent
curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
sudo bash add-google-cloud-ops-agent-repo.sh --also-install
sudo service google-cloud-ops-agent start
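          # Illustrative post-boot check (run manually over SSH if needed):
          #   sudo systemctl status google-cloud-ops-agent --no-pager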
- group: packer
modules:
- id: custom-image
source: modules/packer/custom-image
kind: packer
use:
- network1
- packer_script
settings:
source_image_project_id: [schedmd-slurm-public]
source_image_family: slurm-gcp-6-6-debian-11
disk_size: $(vars.disk_size_gb)
image_family: $(vars.new_image_family)
machine_type: c2-standard-8 # building this image does not require a GPU-enabled VM
state_timeout: 30m
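# The packer group above bakes an image in the new_image_family from the
# SchedMD base image plus the conda runner; every node in the cluster group
# below boots from that family. Illustrative verification of the build
# (assumed gcloud invocation):
#   gcloud compute images list --project <your-project> --filter="family=llama2-slurm-v6"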
- group: cluster
modules:
- id: n1t4_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network1]
settings:
zones: $(vars.zone_list)
node_count_dynamic_max: 1
bandwidth_tier: gvnic_enabled
disk_size_gb: $(vars.disk_size_gb)
enable_public_ips: true
      enable_placement: false  # nodes may land in any zone from zone_list; compact placement requires a single zone
advanced_machine_features:
threads_per_core: 1
machine_type: n1-standard-96
guest_accelerator:
- type: nvidia-tesla-t4
count: 4
on_host_maintenance: TERMINATE
instance_image:
family: $(vars.new_image_family)
project: $(vars.project_id)
- id: n1t4_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [n1t4_nodeset]
settings:
partition_name: n1t4
is_default: true
exclusive: false
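      # Illustrative smoke test for this default GPU partition from the login
      # node (standard Slurm flags; assumes the deployment is up):
      #   srun -p n1t4 --gpus-per-node=4 nvidia-smi -L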
- id: n2_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network1]
settings:
zones: $(vars.zone_list)
node_count_dynamic_max: 1
bandwidth_tier: gvnic_enabled
disk_size_gb: $(vars.disk_size_gb)
enable_public_ips: true
advanced_machine_features:
threads_per_core: 1
machine_type: n2-standard-4
on_host_maintenance: TERMINATE
instance_image:
family: $(vars.new_image_family)
project: $(vars.project_id)
- id: n2_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [n2_nodeset]
settings:
partition_name: n2
      is_default: false  # n1t4 (above) is already the cluster's default partition; Slurm allows only one
- id: g2_nodeset
source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
use: [network1]
settings:
zones: $(vars.zone_list)
node_count_dynamic_max: 1
bandwidth_tier: gvnic_enabled
disk_size_gb: $(vars.disk_size_gb)
enable_public_ips: true
advanced_machine_features:
threads_per_core: 1
machine_type: g2-standard-96
on_host_maintenance: TERMINATE
instance_image:
family: $(vars.new_image_family)
project: $(vars.project_id)
- id: g2_partition
source: community/modules/compute/schedmd-slurm-gcp-v6-partition
use: [g2_nodeset]
settings:
partition_name: g2gpu8
is_default: false
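      # g2-standard-96 VMs come with 8 NVIDIA L4 GPUs attached, hence the
      # g2gpu8 partition name; G2 machine types need no guest_accelerator
      # block. Illustrative submission (finetune.sh is a hypothetical user
      # script):
      #   sbatch -p g2gpu8 --gpus-per-node=8 finetune.sh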
- id: slurm_login
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
use: [network1]
settings:
name_prefix: login
machine_type: n2-standard-4
enable_login_public_ips: true
instance_image:
family: $(vars.new_image_family)
project: $(vars.project_id)
- id: slurm_controller
source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
use:
- network1
- n1t4_partition
- n2_partition
- g2_partition
- slurm_login
- homefs
- data_bucket
settings:
enable_controller_public_ips: true
controller_startup_script: $(startup_script.startup_script)
controller_startup_scripts_timeout: 21600
login_startup_script: $(startup_script.startup_script)
login_startup_scripts_timeout: 21600
instance_image:
family: $(vars.new_image_family)
project: $(vars.project_id)
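# Once deployed, an illustrative way to reach the login node (the instance
# name is derived from name_prefix above and varies per deployment):
#   gcloud compute instances list --filter="name~login"
#   gcloud compute ssh <login-instance-name> --zone us-central1-a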