in launcher/nemo/stages.py [0:0]
def generate_default_k8s_value_template(self, template_root, cluster_parameters, stage_cfg_path=None):
    """
    Build the k8s values shared by every device type and training method.

    Loads ``values.yaml`` from ``template_root`` and overlays, in order, the
    cluster settings, the git settings and the stage settings.

    Args:
        template_root: Directory-like object that is joined with
            ``"values.yaml"`` via ``/`` and opened.
        cluster_parameters: Mapping of cluster options. ``"container_image"``
            is required; every other key read here is optional.
        stage_cfg_path: Forwarded unchanged to ``self.get_script_args_str``.

    Returns:
        The loaded values template with the overrides applied.

    Raises:
        ValueError: If ``cfg.git.repo_url_or_path`` is set and does not start
            with ``"http"`` or ``"codecommit::"``.
    """
    with open(template_root / "values.yaml") as value_file:
        values_template = OmegaConf.load(value_file)

    values_template.image.trainingImage = cluster_parameters["container_image"]
    job_name = self.stage_cfg.run.name
    training_cfg = values_template.trainingConfig
    training_cfg.jobName = job_name

    # Cluster configs
    training_cfg.numEFADevices = self.num_efa_devices
    if "pullPolicy" in cluster_parameters:
        values_template.image.pullPolicy = cluster_parameters["pullPolicy"]

    # (cluster_parameters key, trainingConfig field): copied whenever the key
    # exists, even when its value is None.
    copied_when_present = (
        ("env_vars", "envVars"),
        ("restartPolicy", "restartPolicy"),
        ("cleanPodPolicy", "cleanPodPolicy"),
        ("persistent_volume_claims", "persistentVolumeClaims"),
        ("volumes", "volumes"),
    )
    for source_key, target_field in copied_when_present:
        if source_key in cluster_parameters:
            setattr(training_cfg, target_field, cluster_parameters[source_key])

    # (cluster_parameters key, trainingConfig field): copied only when the
    # value is not None, so the template's own value is kept otherwise.
    copied_when_not_none = (
        ("namespace", "namespace"),
        ("annotations", "annotations"),
        ("priority_class_name", "priorityClassName"),
        ("service_account_name", "serviceAccountName"),
        ("custom_labels", "customLabels"),
        ("label_selector", "labelSelector"),
    )
    for source_key, target_field in copied_when_not_none:
        if cluster_parameters.get(source_key, None) is not None:
            setattr(training_cfg, target_field, cluster_parameters[source_key])

    training_cfg.compile = OmegaConf.select(self.cfg, "recipes.run.compile", default=0)

    # Launcher-level defaults go in first; an explicit ``git`` section in the
    # config overrides them below.
    if self._default_repo is not None:
        training_cfg.git.repo_url_or_path = self._default_repo
    if self._default_branch is not None:
        training_cfg.git.branch = self._default_branch

    # Git configs
    if self.cfg.get("git", None) is not None:
        git_overrides = self.cfg.git

        if git_overrides.get("repo_url_or_path", None) is not None:
            repo_url_or_path = str(git_overrides.repo_url_or_path)
            # Anything that is not an http(s) or codecommit URL is treated as a
            # local path, which is only supported on slurm based clusters.
            if not repo_url_or_path.startswith(("http", "codecommit::")):
                raise ValueError("local git repo path is only supported for slurm based cluster")
            if git_overrides.get("token", None) is not None:
                repo_url_or_path = self.insert_git_token(repo_url_or_path, git_overrides.token)
            training_cfg.git.repo_url_or_path = repo_url_or_path

        # These fields are copied through under the same name when set.
        for git_field in ("branch", "commit", "update_adapter"):
            if git_overrides.get(git_field, None) is not None:
                git_value = getattr(git_overrides, git_field)
                setattr(training_cfg.git, git_field, git_value)

    # Stage-level settings
    training_cfg.device = self.device
    training_cfg.scriptArgs = self.get_script_args_str(stage_cfg_path)
    training_cfg.pre_script = self.stage_cfg.get("pre_script", [])
    training_cfg.post_script = self.stage_cfg.get("post_script", [])

    return values_template