in src/accelerate/utils/launch.py
def prepare_deepspeed_cmd_env(args: argparse.Namespace) -> tuple[list[str], dict[str, str]]:
"""
Prepares and returns the command list and an environment with the correct DeepSpeed environment variables.
"""
    # resolve the main process port: `0` means "pick a free port now", `None` falls back to the default 29500
if args.main_process_port == 0:
args.main_process_port = get_free_port()
elif args.main_process_port is None:
args.main_process_port = 29500
num_processes = args.num_processes
num_machines = args.num_machines
main_process_ip = args.main_process_ip
main_process_port = args.main_process_port
cmd = None
    # default to the first entry in DEEPSPEED_MULTINODE_LAUNCHERS ("pdsh") when no launcher was specified
    if args.deepspeed_multinode_launcher is None:
        args.deepspeed_multinode_launcher = DEEPSPEED_MULTINODE_LAUNCHERS[0]
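    # Three launch paths follow: multi-node with a DeepSpeed-native launcher builds a `deepspeed` CLI command;
    # multi-node with the "standalone" launcher (DEEPSPEED_MULTINODE_LAUNCHERS[1]) and single-node runs both leave
    # `cmd` as None and configure `args` for a torchrun-style entry point instead.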
if num_machines > 1 and args.deepspeed_multinode_launcher != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
cmd = ["deepspeed"]
cmd.extend(["--hostfile", str(args.deepspeed_hostfile)])
if args.deepspeed_multinode_launcher == "nossh":
if compare_versions("deepspeed", "<", "0.14.5"):
raise ValueError("nossh launcher requires DeepSpeed >= 0.14.5")
cmd.extend(["--node_rank", str(args.machine_rank), "--no_ssh"])
else:
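            # `--no_local_rank` stops the DeepSpeed launcher from appending `--local_rank` to the script arguments;
            # the launched processes read LOCAL_RANK from the environment instead.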
cmd.extend(["--no_local_rank", "--launcher", str(args.deepspeed_multinode_launcher)])
        if args.deepspeed_exclusion_filter is not None:
            cmd.extend(["--exclude", str(args.deepspeed_exclusion_filter)])
        elif args.deepspeed_inclusion_filter is not None:
            cmd.extend(["--include", str(args.deepspeed_inclusion_filter)])
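        # Only one DeepSpeed host filter is ever forwarded; with no filter, fall back to an explicit per-node GPU
        # count.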
else:
cmd.extend(["--num_gpus", str(args.num_processes // args.num_machines)])
if main_process_ip:
cmd.extend(["--master_addr", str(main_process_ip)])
cmd.extend(["--master_port", str(main_process_port)])
if args.module and args.no_python:
raise ValueError("--module and --no_python cannot be used together")
elif args.module:
cmd.append("--module")
elif args.no_python:
cmd.append("--no_python")
cmd.append(args.training_script)
cmd.extend(args.training_script_args)
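    # The "standalone" launcher runs multi-node training through a torchrun-style entry point, so the distributed
    # arguments are set on `args` rather than assembled into a command.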
elif num_machines > 1 and args.deepspeed_multinode_launcher == DEEPSPEED_MULTINODE_LAUNCHERS[1]:
args.nproc_per_node = str(num_processes // num_machines)
args.nnodes = str(num_machines)
args.node_rank = int(args.machine_rank)
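        # On a shared network the master address/port can be pinned directly; otherwise a rendezvous endpoint lets
        # the nodes discover each other.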
if getattr(args, "same_network", False):
args.master_addr = str(main_process_ip)
args.master_port = str(main_process_port)
else:
args.rdzv_endpoint = f"{main_process_ip}:{main_process_port}"
else:
args.nproc_per_node = str(num_processes)
if main_process_port is not None:
args.master_port = str(main_process_port)
        # Only the main process (machine rank 0) needs to check port availability: several launchers may deliberately
        # be started on the same machine (e.g., to split log files per launcher), and the extra ones must not fail
        # because the port is already bound.
need_port_check = num_machines <= 1 or int(args.machine_rank) == 0
if need_port_check and is_port_in_use(main_process_port):
if num_machines <= 1:
args.standalone = True
warnings.warn(
f"Port `{main_process_port}` is already in use. "
"Accelerate will attempt to launch in a standalone-like mode by finding an open port automatically for this session. "
"If this current attempt fails, or for more control in future runs, please specify a different port "
"(e.g., `--main_process_port <your_chosen_port>`) or use `--main_process_port 0` for automatic selection "
"in your launch command or Accelerate config file."
)
else:
raise ConnectionError(
f"Tried to launch distributed communication on port `{main_process_port}`, but another process is utilizing it. "
"Please specify a different port (such as using the `--main_process_port` flag or specifying a different `main_process_port` in your config file)"
" and rerun your script. To automatically use the next open port (on a single node), you can set this to `0`."
)
    if args.module and args.no_python:
        raise ValueError("--module and --no_python cannot be used together")
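    # Build the environment for the launched processes: start from a copy of the current environment and layer the
    # Accelerate/DeepSpeed settings on top.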
current_env = os.environ.copy()
if args.debug:
current_env["ACCELERATE_DEBUG_MODE"] = "true"
gpu_ids = getattr(args, "gpu_ids", "all")
if gpu_ids != "all" and args.gpu_ids is not None:
if is_xpu_available():
current_env["ZE_AFFINITY_MASK"] = gpu_ids
elif is_mlu_available():
current_env["MLU_VISIBLE_DEVICES"] = gpu_ids
elif is_sdaa_available():
current_env["SDAA_VISIBLE_DEVICES"] = gpu_ids
elif is_musa_available():
current_env["MUSA_VISIBLE_DEVICES"] = gpu_ids
elif is_npu_available():
current_env["ASCEND_RT_VISIBLE_DEVICES"] = gpu_ids
elif is_hpu_available():
current_env["HABANA_VISIBLE_MODULES"] = gpu_ids
else:
current_env["CUDA_VISIBLE_DEVICES"] = gpu_ids
try:
mixed_precision = PrecisionType(args.mixed_precision.lower())
except ValueError:
raise ValueError(
f"Unknown mixed_precision mode: {args.mixed_precision.lower()}. Choose between {PrecisionType.list()}."
)
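    # Make the current working directory importable in the launched processes so the training script's local modules
    # resolve.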
current_env["PYTHONPATH"] = env_var_path_add("PYTHONPATH", os.path.abspath("."))
current_env["ACCELERATE_MIXED_PRECISION"] = str(mixed_precision)
if args.mixed_precision.lower() == "fp8":
if not is_fp8_available():
raise RuntimeError(
"FP8 is not available on this machine. Please ensure that either Transformer Engine, MSAMP or torchao is installed."
)
current_env = setup_fp8_env(args, current_env)
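    # Forward the DeepSpeed plugin configuration as ACCELERATE_* variables, to be read back when the launched
    # processes build their DeepSpeed plugin.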
current_env["ACCELERATE_CONFIG_DS_FIELDS"] = str(args.deepspeed_fields_from_accelerate_config).lower()
current_env["ACCELERATE_USE_DEEPSPEED"] = "true"
if args.zero_stage is not None:
current_env["ACCELERATE_DEEPSPEED_ZERO_STAGE"] = str(args.zero_stage)
if args.gradient_accumulation_steps is not None:
current_env["ACCELERATE_GRADIENT_ACCUMULATION_STEPS"] = str(args.gradient_accumulation_steps)
if args.gradient_clipping is not None:
current_env["ACCELERATE_GRADIENT_CLIPPING"] = str(args.gradient_clipping).lower()
if args.offload_optimizer_device is not None:
current_env["ACCELERATE_DEEPSPEED_OFFLOAD_OPTIMIZER_DEVICE"] = str(args.offload_optimizer_device).lower()
if args.offload_param_device is not None:
current_env["ACCELERATE_DEEPSPEED_OFFLOAD_PARAM_DEVICE"] = str(args.offload_param_device).lower()
if args.zero3_init_flag is not None:
current_env["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = str(args.zero3_init_flag).lower()
if args.zero3_save_16bit_model is not None:
current_env["ACCELERATE_DEEPSPEED_ZERO3_SAVE_16BIT_MODEL"] = str(args.zero3_save_16bit_model).lower()
if args.deepspeed_config_file is not None:
current_env["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = str(args.deepspeed_config_file)
if args.enable_cpu_affinity:
current_env["ACCELERATE_CPU_AFFINITY"] = "1"
if args.deepspeed_moe_layer_cls_names is not None:
current_env["ACCELERATE_DEEPSPEED_MOE_LAYER_CLS_NAMES"] = str(args.deepspeed_moe_layer_cls_names)
return cmd, current_env
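
# Usage sketch (illustrative only; `args` is assumed to be the fully-populated namespace produced by the
# `accelerate launch` argument parser, carrying every field read above):
#
#     cmd, current_env = prepare_deepspeed_cmd_env(args)
#     if cmd is not None:
#         # Multi-node, DeepSpeed-native launcher: run the assembled command.
#         subprocess.run(cmd, env=current_env, check=True)
#     else:
#         # Single-node or "standalone" path: `args` was mutated in place for a
#         # torchrun-style entry point that consumes `current_env`.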