mcloud_train.yaml:

# Ultravox training configuration
name: ultravox-train
image: mosaicml/composer:latest
compute:
  gpus: 8
  cluster: r15z1p1
integrations:
  - integration_type: git_repo
    git_repo: fixie-ai/ultravox
    git_branch: $UV_BRANCH
    pip_install: poetry==1.7.1
scheduling:
  max_duration: 6 # 6 hours max to avoid hanging jobs
command: >-
  cd ultravox &&
  poetry install --no-dev &&
  poetry run python -m ultravox.training.helpers.prefetch_weights $TRAIN_ARGS &&
  poetry run torchrun --nproc_per_node=8 -m ultravox.training.train $TRAIN_ARGS
env_variables:
  MLFLOW_TRACKING_URI: databricks
  UV_BRANCH: main
  TRAIN_ARGS: --config_path ultravox/training/configs/release_config.yaml
  HF_HUB_DOWNLOAD_TIMEOUT: "300" # Set timeout to 300 seconds (5 minutes)
  HF_HUB_ENABLE_HF_TRANSFER: 1
  # RAM Efficient Loading: only the first process loads the pretrained model checkpoint while
  # all other processes have empty weights. Only applicable for Transformers. Forcibly sets
  # `sync_module_states` to `True`.
  FSDP_CPU_RAM_EFFICIENT_LOADING: 1
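
The env_variables block is what the rest of the file interpolates: the git integration checks out $UV_BRANCH and the training command receives $TRAIN_ARGS, so pointing a run at a different branch or training config only requires editing these values rather than the command string. A minimal sketch of such an override, where my-feature-branch and my_experiment.yaml are hypothetical names and not part of the repository:

env_variables:
  MLFLOW_TRACKING_URI: databricks
  UV_BRANCH: my-feature-branch  # hypothetical branch of fixie-ai/ultravox
  TRAIN_ARGS: --config_path ultravox/training/configs/my_experiment.yaml  # hypothetical config file
  HF_HUB_DOWNLOAD_TIMEOUT: "300"
  HF_HUB_ENABLE_HF_TRANSFER: 1
  FSDP_CPU_RAM_EFFICIENT_LOADING: 1

Keeping the branch and training arguments in env_variables rather than hard-coding them in the command makes the same YAML reusable across experiments when submitting it as a MosaicML Cloud run.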