in main.py [0:0]
def preprocess_config(cfg) -> Tuple[bool, bool]:
    """
    Pre-process the configuration passed to the job.

    Normalizes the cluster type naming to align with NeMo, fills in defaults
    for optional keys, and routes between a custom training script
    (``training_cfg``) and a SageMaker recipe (``recipes``), mutating *cfg*
    in place.

    Parameters
    ----------
    cfg : omegaconf.DictConfig
        Job configuration; modified in place.

    Returns
    -------
    Tuple
        boolean: configuration has a custom script
        boolean: is it a SageMaker recipe
    """
    with omegaconf.open_dict(cfg):
        cfg.launcher_scripts_path = LAUNCHER_SCRIPT_PATH

    # Override the cluster type to align with NeMo
    if cfg.get("cluster_type") is None:
        assert cfg.get("cluster") is not None
        cluster_type = cfg.cluster.cluster_type
    else:
        cluster_type = cfg.cluster_type
    with omegaconf.open_dict(cfg):
        # NeMo expects "bcm" for Slurm-based clusters
        cfg.cluster_type = "bcm" if cluster_type == "slurm" else cluster_type

    # Default optional W&B keys to None so downstream reads never KeyError
    if cfg.get("wandb_api_key_file") is None:
        with omegaconf.open_dict(cfg):
            cfg.wandb_api_key_file = None
    if cfg.get("wandb_api_bcp_secret_key") is None:
        with omegaconf.open_dict(cfg):
            cfg.wandb_api_bcp_secret_key = None

    if cfg.get("training_cfg") is not None:
        assert cfg.get("stages") is None, "training_cfg and stages should not set together"
        stage_cfg = cfg.get("training_cfg")
        assert stage_cfg.get("run") is not None, "run config should be set"
        run_config = stage_cfg.get("run")
        if run_config.get("ntasks_per_node") is not None:
            ntasks_per_node = run_config.get("ntasks_per_node")
        else:
            # Derive the task count from the instance's accelerators,
            # falling back to 8 when the instance type is unknown.
            instance_type = get_instance_type(cfg)
            num_devices = (
                get_num_accelerator_devices(instance_type)
                if instance_type is not None
                else None
            )
            if num_devices is not None:
                ntasks_per_node = num_devices * get_num_cores_per_accelerator(instance_type)
            else:
                ntasks_per_node = 8
        # To align with https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/23.11/launcher_scripts/nemo_launcher/core/stages.py#L721
        with omegaconf.open_dict(stage_cfg):
            stage_cfg.trainer = {"devices": ntasks_per_node}
        with omegaconf.open_dict(run_config):
            run_config.ntasks_per_node = ntasks_per_node
            run_config.results_dir = f"{cfg.base_results_dir}/{run_config.name}"
        # To align with https://github.com/NVIDIA/NeMo-Framework-Launcher/blob/23.11/launcher_scripts/nemo_launcher/core/stages.py#L313C54-L313C72
        with omegaconf.open_dict(cfg):
            cfg.training = {"model": {"ub_tp_comm_overlap": False}}
        # if not in a unit-test environment de-dupe consecutive runs by appending random hash to end of job name
        if "pytest" not in sys.modules and "name" in cfg.training_cfg.run:
            cfg.training_cfg.run.name = valid_run_name(cfg.training_cfg.run.get("name", None))
        return True, False

    # Use .get for consistency with the optional-key handling above
    if cfg.get("recipes"):
        model_type = cfg.recipes.run.get("model_type", None)
        # if not in a unit-test environment de-dupe consecutive runs by appending random hash to end of job name
        if "pytest" not in sys.modules and "name" in cfg.recipes.run:
            cfg.recipes.run.name = valid_run_name(cfg.recipes.run.get("name", None))
        with omegaconf.open_dict(cfg):
            cfg.training = cfg.recipes  # Point cfg.training to cfg.recipes to avoid conflict in nemo stages
        # model_type may be absent (defaults to None); guard before the
        # substring test — `"hf" in None` raised TypeError in the original.
        if model_type is not None and "hf" in model_type:
            return False, True
    return False, False