def prepare_deepspeed_cmd

def prepare_deepspeed_cmd_env()

in src/accelerate/utils/launch.py [0:0]
140 lines of code
51 McCabe index (conditional complexity)

def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> tuple[list[str], dict[str, str]]:
    """
    Prepares and returns the command list and an environment with the correct DeepSpeed environment variables.
    """
    # get free port and update configurations
    if args.main_process_port == 0:
        args.main_process_port = get_free_port()

    elif args.main_process_port is None:
        args.main_process_port = 29500

    num_processes = args.num_processes
    num_machines = args.num_machines
    main_process_ip = args.main_process_ip
    main_process_port = args.main_process_port
    cmd = None

    # make sure launcher is not None
    if args.deepspeed_multinode_launcher is None:
        # set to default pdsh
        args.deepspeed_multinode_launcher = DEEPSPEED_MULTINODE_LAUNCHERS[0]

    if num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
        cmd = ["deepspeed"]
        cmd.extend(["--hostfile", str(args.deepspeed_hostfile)])
        if args.deepspeed_multinode_launcher == "nossh":
            if compare_versions("deepspeed", "<", "0.14.5"):
                raise ValueError("nossh launcher requires DeepSpeed >= 0.14.5")
            cmd.extend(["--node_rank", str(args.machine_rank), "--no_ssh"])
        else:
            cmd.extend(["--no_local_rank", "--launcher", str(args.deepspeed_multinode_launcher)])
        if args.deepspeed_exclusion_filter is not None:
            cmd.extend(
                [
                    "--exclude",
                    str(args.deepspeed_exclusion_filter),
                ]
            )
        elif args.deepspeed_inclusion_filter is not None:
            cmd.extend(
                [
                    "--include",
                    str(args.deepspeed_inclusion_filter),
                ]
            )
        else:
            cmd.extend(["--num_gpus", str(args.num_processes // args.num_machines)])
        if main_process_ip:
            cmd.extend(["--master_addr", str(main_process_ip)])
        cmd.extend(["--master_port", str(main_process_port)])
        if args.module and args.no_python:
            raise ValueError("--module and --no_python cannot be used together")
        elif args.module:
            cmd.append("--module")
        elif args.no_python:
            cmd.append("--no_python")
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)
    elif num_machines > 1 and args.deepspeed_multinode_launcher == DEEPSPEED_MULTINODE_LAUNCHERS[1]:
        args.nproc_per_node = str(num_processes // num_machines)
        args.nnodes = str(num_machines)
        args.node_rank = int(args.machine_rank)
        if getattr(args, "same_network", False):
            args.master_addr = str(main_process_ip)
            args.master_port = str(main_process_port)
        else:
            args.rdzv_endpoint = f"{main_process_ip}:{main_process_port}"
    else:
        args.nproc_per_node = str(num_processes)
        if main_process_port is not None:
            args.master_port = str(main_process_port)

    # only need to check port availability in main process, in case we have to start multiple launchers on the same machine
    # for some reasons like splitting log files.
    need_port_check = num_machines <= 1 or int(args.machine_rank) == 0
    if need_port_check and is_port_in_use(main_process_port):
        if num_machines <= 1:
            args.standalone = True
            warnings.warn(
                f"Port `{main_process_port}` is already in use. "
                "Accelerate will attempt to launch in a standalone-like mode by finding an open port automatically for this session. "
                "If this current attempt fails, or for more control in future runs, please specify a different port "
                "(e.g., `--main_process_port <your_chosen_port>`) or use `--main_process_port 0` for automatic selection "
                "in your launch command or Accelerate config file."
            )
        else:
            raise ConnectionError(
                f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. "
                "Please specify a different port (such as using the `--main_process_port` flag or specifying a different `main_process_port` in your config file)"
                " and rerun your script. To automatically use the next open port (on a single node), you can set this to `0`."
            )

    if args.module and args.no_python:
        raise ValueError("--module and --no_python cannot be used together")
    elif args.module:
        args.module = True
    elif args.no_python:
        args.no_python = True

    current_env = os.environ.copy()
    if args.debug:
        current_env["ACCELERATE_DEBUG_MODE"] = "true"
    gpu_ids = getattr(args, "gpu_ids", "all")
    if gpu_ids != "all" and args.gpu_ids is not None:
        if is_xpu_available():
            current_env["ZE_AFFINITY_MASK"] = gpu_ids
        elif is_mlu_available():
            current_env["MLU_VISIBLE_DEVICES"] = gpu_ids
        elif is_sdaa_available():
            current_env["SDAA_VISIBLE_DEVICES"] = gpu_ids
        elif is_musa_available():
            current_env["MUSA_VISIBLE_DEVICES"] = gpu_ids
        elif is_npu_available():
            current_env["ASCEND_RT_VISIBLE_DEVICES"] = gpu_ids
        elif is_hpu_available():
            current_env["HABANA_VISIBLE_MODULES"] = gpu_ids
        else:
            current_env["CUDA_VISIBLE_DEVICES"] = gpu_ids
    try:
        mixed_precision = PrecisionType(args.mixed_precision.lower())
    except ValueError:
        raise ValueError(
            f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
        )

    current_env["PYTHONPATH"] = env_var_path_add("PYTHONPATH", os.path.abspath("."))
    current_env["ACCELERATE_MIXED_PRECISION"] = str(mixed_precision)
    if args.mixed_precision.lower() == "fp8":
        if not is_fp8_available():
            raise RuntimeError(
                "FP8 is not available on this machine. Please ensure that either Transformer Engine, MSAMP or torchao is installed."
            )
        current_env = setup_fp8_env(args, current_env)
    current_env["ACCELERATE_CONFIG_DS_FIELDS"] = str(args.deepspeed_fields_from_accelerate_config).lower()
    current_env["ACCELERATE_USE_DEEPSPEED"] = "true"
    if args.zero_stage is not None:
        current_env["ACCELERATE_DEEPSPEED_ZERO_STAGE"] = str(args.zero_stage)
    if args.gradient_accumulation_steps is not None:
        current_env["ACCELERATE_GRADIENT_ACCUMULATION_STEPS"] = str(args.gradient_accumulation_steps)
    if args.gradient_clipping is not None:
        current_env["ACCELERATE_GRADIENT_CLIPPING"] = str(args.gradient_clipping).lower()
    if args.offload_optimizer_device is not None:
        current_env["ACCELERATE_DEEPSPEED_OFFLOAD_OPTIMIZER_DEVICE"] = str(args.offload_optimizer_device).lower()
    if args.offload_param_device is not None:
        current_env["ACCELERATE_DEEPSPEED_OFFLOAD_PARAM_DEVICE"] = str(args.offload_param_device).lower()
    if args.zero3_init_flag is not None:
        current_env["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = str(args.zero3_init_flag).lower()
    if args.zero3_save_16bit_model is not None:
        current_env["ACCELERATE_DEEPSPEED_ZERO3_SAVE_16BIT_MODEL"] = str(args.zero3_save_16bit_model).lower()
    if args.deepspeed_config_file is not None:
        current_env["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file)
    if args.enable_cpu_affinity:
        current_env["ACCELERATE_CPU_AFFINITY"] = "1"
    if args.deepspeed_moe_layer_cls_names is not None:
        current_env["ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES"] = str(args.deepspeed_moe_layer_cls_names)
    return cmd, current_env