in main.py [0:0]
def preprocess_config(cfg) -> Tuple[bool, bool]:
    """
    Pre-process the configuration passed to the job.

    Normalizes the cluster type naming to align with NeMo, fills in defaults
    for optional keys, and routes between a custom training script
    (``training_cfg``) and a SageMaker recipe (``recipes``), mutating *cfg*
    in place.

    Parameters
    ----------
    cfg : omegaconf.DictConfig
        Job configuration; modified in place.

    Returns
    -------
    Tuple
        boolean: configuration has a custom script
        boolean: is it a SageMaker recipe
    """
    with omegaconf.open_dict(cfg):
        cfg.launcher_scripts_path = LAUNCHER_SCRIPT_PATH

    # Override the cluster type to align with NeMo
    if cfg.get("cluster_type") is None:
        assert cfg.get("cluster") is not None
        cluster_type = cfg.cluster.cluster_type
    else:
        cluster_type = cfg.cluster_type
    with omegaconf.open_dict(cfg):
        # NeMo expects "bcm" for Slurm-based clusters
        cfg.cluster_type = "bcm" if cluster_type == "slurm" else cluster_type

    # Default optional W&B keys to None so downstream reads never KeyError
    if cfg.get("wandb_api_key_file") is None:
        with omegaconf.open_dict(cfg):
            cfg.wandb_api_key_file = None
    if cfg.get("wandb_api_bcp_secret_key") is None:
        with omegaconf.open_dict(cfg):
            cfg.wandb_api_bcp_secret_key = None

    if cfg.get("training_cfg") is not None:
        assert cfg.get("stages") is None, "training_cfg and stages should not set together"
        stage_cfg = cfg.get("training_cfg")
        assert stage_cfg.get("run") is not None, "run config should be set"
        run_config = stage_cfg.get("run")
        if run_config.get("ntasks_per_node") is not None:
            ntasks_per_node = run_config.get("ntasks_per_node")
        else:
            # Derive the task count from the instance's accelerators,
            # falling back to 8 when the instance type is unknown.
            instance_type = get_instance_type(cfg)
            num_devices = (
                get_num_accelerator_devices(instance_type)
                if instance_type is not None
                else None
            )
            if num_devices is not None:
                ntasks_per_node = num_devices * get_num_cores_per_accelerator(instance_type)
            else:
                ntasks_per_node = 8
        # To align with https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/23.11/launcher_scripts/nemo_launcher/core/stages.py#L721
        with omegaconf.open_dict(stage_cfg):
            stage_cfg.trainer = {"devices": ntasks_per_node}
        with omegaconf.open_dict(run_config):
            run_config.ntasks_per_node = ntasks_per_node
            run_config.results_dir = f"{cfg.base_results_dir}/{run_config.name}"
        # To align with https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/23.11/launcher_scripts/nemo_launcher/core/stages.py#L313C54-L313C72
        with omegaconf.open_dict(cfg):
            cfg.training = {"model": {"ub_tp_comm_overlap": False}}
        # if not in a unit-test environment de-dupe consecutive runs by appending random hash to end of job name
        if "pytest" not in sys.modules and "name" in cfg.training_cfg.run:
            cfg.training_cfg.run.name = valid_run_name(cfg.training_cfg.run.get("name", None))
        return True, False

    # Use .get for consistency with the optional-key handling above
    if cfg.get("recipes"):
        model_type = cfg.recipes.run.get("model_type", None)
        # if not in a unit-test environment de-dupe consecutive runs by appending random hash to end of job name
        if "pytest" not in sys.modules and "name" in cfg.recipes.run:
            cfg.recipes.run.name = valid_run_name(cfg.recipes.run.get("name", None))
        with omegaconf.open_dict(cfg):
            cfg.training = cfg.recipes  # Point cfg.training to cfg.recipes to avoid conflict in nemo stages
        # model_type may be absent (defaults to None); guard before the
        # substring test — `"hf" in None` raised TypeError in the original.
        if model_type is not None and "hf" in model_type:
            return False, True
    return False, False