launcher/nemo/launchers.py (33 lines of code) (raw):

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import shlex
from pathlib import Path
from typing import List

import nemo_launcher.utils.job_utils as job_utils
from nemo_launcher.core.launchers import AutoLauncher, K8SLauncher, Launcher

from .slurm_launcher import SMSlurmLauncher


class SMAutoLauncher(AutoLauncher):
    """
    AutoLauncher object for Sagemaker
    """

    @staticmethod
    def get_launchers():
        """Returns supported launchers as a dictionary from launcher name to launcher class"""
        # The classes referenced here are defined further down in this module;
        # the names are only resolved when this method is called.
        return {
            "bcm": SMSlurmLauncher,
            "k8s": SMK8SLauncher,
            "sm_jobs": SMJobsLauncher,
        }


class SMK8SLauncher(K8SLauncher):
    """
    Launcher for SM training jobs using K8s.
    """

    def _make_submission_file_text(self, command_groups: List[List[str]]) -> str:
        """
        Generate the script to launch the Helm chart.
        A very simple bash script is generated which runs `helm install` for the
        Helm chart that was generated.

        The release name is the job name with underscores replaced by hyphens, and
        the chart directory is ``<job folder>/k8s_template``. ``command_groups`` is
        not used when building the script.

        :param List[List[str]] command_groups: Command groups to launch with
        :return: submission script file's text
        :rtype: str
        """
        paths = job_utils.JobPaths(folder=self.folder, job_name=self.job_name)
        helm_charts = paths.folder / "k8s_template"
        job_name = self.job_name.replace("_", "-")

        # Every interpolated value is shell-quoted so that a folder or name
        # containing spaces or shell metacharacters cannot break the generated
        # script. shlex.quote returns ordinary values unchanged, so the output
        # for typical inputs is identical to the unquoted form.
        extra_helm_args = ""
        namespace = self.parameters.get("namespace")
        if namespace:
            extra_helm_args += f" --namespace {shlex.quote(str(namespace))}"

        # Apply a timeout of 15min in case images take a long time to bring up
        # or pre-install hooks take a while
        return (
            "#!/bin/bash\n"
            f"helm install --timeout=15m --wait {extra_helm_args} "
            f"{shlex.quote(job_name)} {shlex.quote(str(helm_charts))}\n"
        )


class SMJobsLauncher(Launcher):
    """
    Launcher that writes the command groups to a bash script and runs it with `bash`.
    """

    def _make_submission_file_text(self, command_groups: List[List[str]]) -> str:
        """
        Given the command groups, generate submission script file's text.

        Command groups is a list of command groups, and each command group is a
        list of command strings. The script starts with a bash shebang followed by
        a blank line; the commands of each group are then written one per line,
        in order.

        :param List[List[str]] command_groups: Command groups to launch with
        :return: submission script file's text
        :rtype: str
        """
        lines = ["#!/bin/bash", ""]
        lines.extend("\n".join(command_group) for command_group in command_groups)
        return "\n".join(lines)

    def _submit_command(self, submission_file_path: Path) -> str:
        """
        Run the submission script with `bash`.

        :param Path submission_file_path: path of the script to execute
        :return: always an empty string
        :rtype: str
        """
        command_list = ["bash", submission_file_path]
        # run
        # NOTE(review): the original comment here read "explicit errors";
        # presumably CommandFunction reports failures itself - confirm.
        job_utils.CommandFunction(command_list, ret_stdout=False, verbose=False)()
        return ""