def launch_command_parser()

in src/accelerate/commands/launch.py [0:0]


def launch_command_parser(subparsers=None):
    description = "Launch a python script in a distributed scenario. Arguments can be passed in with either hyphens (`--num-processes=2`) or underscores (`--num_processes=2`)"
    if subparsers is not None:
        parser = subparsers.add_parser(
            "launch", description=description, add_help=False, allow_abbrev=False, formatter_class=CustomHelpFormatter
        )
    else:
        parser = CustomArgumentParser(
            "Accelerate launch command",
            description=description,
            add_help=False,
            allow_abbrev=False,
            formatter_class=CustomHelpFormatter,
        )

    parser.add_argument("-h", "--help", action="help", help="Show this help message and exit.")

    parser.add_argument(
        "--config_file",
        default=None,
        help="The config file to use for the default values in the launching script.",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        action="store_true",
        help="Silence subprocess errors from the launch stack trace and only show the relevant tracebacks. (Only applicable to DeepSpeed and single-process configurations)",
    )
    # Hardware selection arguments
    hardware_args = parser.add_argument_group(
        "Hardware Selection Arguments", "Arguments for selecting the hardware to be used."
    )
    hardware_args.add_argument(
        "--cpu", default=False, action="store_true", help="Whether or not to force the training on the CPU."
    )
    hardware_args.add_argument(
        "--multi_gpu",
        default=False,
        action="store_true",
        help="Whether or not this should launch a distributed GPU training.",
    )
    hardware_args.add_argument(
        "--tpu", default=False, action="store_true", help="Whether or not this should launch a TPU training."
    )
    hardware_args.add_argument(
        "--ipex",
        default=False,
        action="store_true",
        help="Whether or not this should launch a Intel PyTorch Extension (IPEX) training.",
    )

    # Resource selection arguments
    resource_args = parser.add_argument_group(
        "Resource Selection Arguments", "Arguments for fine-tuning how available hardware should be used."
    )
    resource_args.add_argument(
        "--mixed_precision",
        type=str,
        choices=["no", "fp16", "bf16", "fp8"],
        help="Whether or not to use mixed precision training. "
        "Choose between FP16 and BF16 (bfloat16) training. "
        "BF16 training is only supported on Nvidia Ampere GPUs and PyTorch 1.10 or later.",
    )
    resource_args.add_argument(
        "--num_processes", type=int, default=None, help="The total number of processes to be launched in parallel."
    )
    resource_args.add_argument(
        "--num_machines", type=int, default=None, help="The total number of machines used in this training."
    )
    resource_args.add_argument(
        "--num_cpu_threads_per_process",
        type=int,
        default=None,
        help="The number of CPU threads per process. Can be tuned for optimal performance.",
    )
    resource_args.add_argument(
        "--enable_cpu_affinity",
        default=False,
        action="store_true",
        help="Whether or not CPU affinity and balancing should be enabled. Currently only supported on NVIDIA hardware.",
    )
    # Dynamo arguments
    resource_args.add_argument(
        "--dynamo_backend",
        type=str,
        choices=["no"] + [b.lower() for b in DYNAMO_BACKENDS],
        help="Choose a backend to optimize your training with dynamo, see more at "
        "https://github.com/pytorch/torchdynamo.",
    )
    resource_args.add_argument(
        "--dynamo_mode",
        type=str,
        default="default",
        choices=TORCH_DYNAMO_MODES,
        help="Choose a mode to optimize your training with dynamo.",
    )
    resource_args.add_argument(
        "--dynamo_use_fullgraph",
        default=False,
        action="store_true",
        help="Whether to use full graph mode for dynamo or it is ok to break model into several subgraphs",
    )
    resource_args.add_argument(
        "--dynamo_use_dynamic",
        default=False,
        action="store_true",
        help="Whether to enable dynamic shape tracing.",
    )
    resource_args.add_argument(
        "--dynamo_use_regional_compilation",
        default=False,
        action="store_true",
        help="Whether to enable regional compilation.",
    )
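    # Illustrative dynamo launch (backend/mode values are examples, not defaults):
    #   accelerate launch --dynamo_backend inductor --dynamo_mode max-autotune train.py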

    # Training Paradigm arguments
    paradigm_args = parser.add_argument_group(
        "Training Paradigm Arguments", "Arguments for selecting which training paradigm to be used."
    )
    paradigm_args.add_argument(
        "--use_deepspeed",
        default=False,
        action="store_true",
        help="Whether to use deepspeed.",
    )
    paradigm_args.add_argument(
        "--use_fsdp",
        default=False,
        action="store_true",
        help="Whether to use fsdp.",
    )
    paradigm_args.add_argument(
        "--use_megatron_lm",
        default=False,
        action="store_true",
        help="Whether to use Megatron-LM.",
    )

    paradigm_args.add_argument(
        "--use_xpu",
        default=None,
        action="store_true",
        help="Whether to use IPEX plugin to speed up training on XPU specifically. This argument is deprecated and ignored, will be removed in Accelerate v1.20.",
    )

    # distributed GPU training arguments
    distributed_args = parser.add_argument_group("Distributed GPUs", "Arguments related to distributed GPU training.")
    distributed_args.add_argument(
        "--gpu_ids",
        default=None,
        help="What GPUs (by id) should be used for training on this machine as a comma-separated list",
    )
    distributed_args.add_argument(
        "--same_network",
        default=False,
        action="store_true",
        help="Whether all machines used for multinode training exist on the same local network.",
    )
    distributed_args.add_argument(
        "--machine_rank", type=int, default=None, help="The rank of the machine on which this script is launched."
    )
    distributed_args.add_argument(
        "--main_process_ip", type=str, default=None, help="The IP address of the machine of rank 0."
    )
    distributed_args.add_argument(
        "--main_process_port",
        type=int,
        default=None,
        help="The port to use to communicate with the machine of rank 0.",
    )
    distributed_args.add_argument(
        "-t",
        "--tee",
        default="0",
        type=str,
        help="Tee std streams into a log file and also to console.",
    )
    distributed_args.add_argument(
        "--log_dir",
        type=str,
        default=None,
        help=(
            "Base directory to use for log files when using torchrun/torch.distributed.run as launcher. "
            "Use with --tee to redirect std streams info log files."
        ),
    )
    distributed_args.add_argument(
        "--role",
        type=str,
        default="default",
        help="User-defined role for the workers.",
    )
    # Rendezvous related arguments
    distributed_args.add_argument(
        "--rdzv_backend",
        type=str,
        default="static",
        help="The rendezvous method to use, such as 'static' (the default) or 'c10d'",
    )
    distributed_args.add_argument(
        "--rdzv_conf",
        type=str,
        default="",
        help="Additional rendezvous configuration (<key1>=<value1>,<key2>=<value2>,...).",
    )
    distributed_args.add_argument(
        "--max_restarts",
        type=int,
        default=0,
        help="Maximum number of worker group restarts before failing.",
    )
    distributed_args.add_argument(
        "--monitor_interval",
        type=float,
        default=0.1,
        help="Interval, in seconds, to monitor the state of workers.",
    )
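    # Illustrative two-node launch built from the arguments above (addresses and counts are examples):
    #   accelerate launch --multi_gpu --num_machines 2 --num_processes 16 --machine_rank 0 \
    #       --main_process_ip 10.0.0.1 --main_process_port 29500 train.py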
    parser.add_argument(
        "-m",
        "--module",
        action="store_true",
        help="Change each process to interpret the launch script as a Python module, executing with the same behavior as 'python -m'.",
    )
    parser.add_argument(
        "--no_python",
        action="store_true",
        help="Skip prepending the training script with 'python' - just execute it directly. Useful when the script is not a Python script.",
    )

    # TPU arguments
    tpu_args = parser.add_argument_group("TPU", "Arguments related to TPU.")
    tpu_args.add_argument(
        "--tpu_cluster",
        action="store_true",
        dest="tpu_use_cluster",
        help="Whether to use a GCP TPU pod for training.",
    )
    tpu_args.add_argument(
        "--no_tpu_cluster",
        action="store_false",
        dest="tpu_use_cluster",
        help="Should not be passed explicitly, this is for internal use only.",
    )
    tpu_args.add_argument(
        "--tpu_use_sudo",
        action="store_true",
        help="Whether to use `sudo` when running the TPU training script in each pod.",
    )
    tpu_args.add_argument(
        "--vm",
        type=str,
        action="append",
        help=(
            "List of single Compute VM instance names. "
            "If not provided we assume usage of instance groups. For TPU pods."
        ),
    )
    tpu_args.add_argument(
        "--env",
        type=str,
        action="append",
        help="List of environment variables to set on the Compute VM instances. For TPU pods.",
    )
    tpu_args.add_argument(
        "--main_training_function",
        type=str,
        default=None,
        help="The name of the main function to be executed in your script (only for TPU training).",
    )
    tpu_args.add_argument(
        "--downcast_bf16",
        action="store_true",
        help="Whether when using bf16 precision on TPUs if both float and double tensors are cast to bfloat16 or if double tensors remain as float32.",
    )
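    # Illustrative TPU pod launch (script and function names are examples):
    #   accelerate launch --tpu --tpu_cluster --main_training_function main train.py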

    # DeepSpeed arguments
    deepspeed_args = parser.add_argument_group("DeepSpeed Arguments", "Arguments related to DeepSpeed.")
    deepspeed_args.add_argument(
        "--deepspeed_config_file",
        default=None,
        type=str,
        help="DeepSpeed config file.",
    )
    deepspeed_args.add_argument(
        "--zero_stage",
        default=None,
        type=int,
        help="DeepSpeed's ZeRO optimization stage (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to `2`.",
    )
    deepspeed_args.add_argument(
        "--offload_optimizer_device",
        default=None,
        type=str,
        help="Decides where (none|cpu|nvme) to offload optimizer states (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to 'none'.",
    )
    deepspeed_args.add_argument(
        "--offload_param_device",
        default=None,
        type=str,
        help="Decides where (none|cpu|nvme) to offload parameters (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to 'none'.",
    )
    deepspeed_args.add_argument(
        "--offload_optimizer_nvme_path",
        default=None,
        type=str,
        help="Decides Nvme Path to offload optimizer states (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to 'none'.",
    )
    deepspeed_args.add_argument(
        "--offload_param_nvme_path",
        default=None,
        type=str,
        help="Decides Nvme Path to offload parameters (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to 'none'.",
    )
    deepspeed_args.add_argument(
        "--gradient_accumulation_steps",
        default=None,
        type=int,
        help="No of gradient_accumulation_steps used in your training script (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to `1`.",
    )
    deepspeed_args.add_argument(
        "--gradient_clipping",
        default=None,
        type=float,
        help="gradient clipping value used in your training script (useful only when `use_deepspeed` flag is passed). "
        "If unspecified, will default to `1.0`.",
    )
    deepspeed_args.add_argument(
        "--zero3_init_flag",
        default=None,
        type=str,
        help="Decides Whether (true|false) to enable `deepspeed.zero.Init` for constructing massive models. "
        "Only applicable with DeepSpeed ZeRO Stage-3. If unspecified, will default to `true`.",
    )
    deepspeed_args.add_argument(
        "--zero3_save_16bit_model",
        default=None,
        type=str,
        help="Decides Whether (true|false) to save 16-bit model weights when using ZeRO Stage-3. "
        "Only applicable with DeepSpeed ZeRO Stage-3. If unspecified, will default to `false`.",
    )
    deepspeed_args.add_argument(
        "--deepspeed_hostfile",
        default=None,
        type=str,
        help="DeepSpeed hostfile for configuring multi-node compute resources.",
    )
    deepspeed_args.add_argument(
        "--deepspeed_exclusion_filter",
        default=None,
        type=str,
        help="DeepSpeed exclusion filter string when using mutli-node setup.",
    )
    deepspeed_args.add_argument(
        "--deepspeed_inclusion_filter",
        default=None,
        type=str,
        help="DeepSpeed inclusion filter string when using mutli-node setup.",
    )
    deepspeed_args.add_argument(
        "--deepspeed_multinode_launcher",
        default=None,
        type=str,
        help="DeepSpeed multi-node launcher to use, e.g. `pdsh`, `standard`, `openmpi`, `mvapich`, `mpich`, `slurm`, `nossh` (requires DeepSpeed >= 0.14.5). If unspecified, will default to `pdsh`.",
    )
    deepspeed_args.add_argument(
        "--deepspeed_moe_layer_cls_names",
        default=None,
        type=str,
        help="comma-separated list of transformer MoE layer class names (case-sensitive) to wrap ,e.g, `MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock` ..."
        " (useful only when `use_deepspeed` flag is passed).",
    )
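    # Illustrative DeepSpeed launch (stage and offload values are examples):
    #   accelerate launch --use_deepspeed --zero_stage 2 --offload_optimizer_device cpu \
    #       --gradient_accumulation_steps 4 train.py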

    # fsdp arguments
    fsdp_args = parser.add_argument_group("FSDP Arguments", "Arguments related to Fully Shared Data Parallelism.")
    fsdp_args.add_argument(
        "--fsdp_version",
        type=str,
        default="1",
        choices=["1", "2"],
        help="FSDP version to use. (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_offload_params",
        default="false",
        type=str,
        help="Decides Whether (true|false) to offload parameters and gradients to CPU. (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_min_num_params",
        type=int,
        default=1e8,
        help="FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `use_fsdp` flag is passed).",
    )
    # We enable this for backwards compatibility, throw a warning if this is set in `FullyShardedDataParallelPlugin`
    fsdp_args.add_argument(
        "--fsdp_sharding_strategy",
        type=str,
        default="FULL_SHARD",
        help="FSDP's sharding strategy. (useful only when `use_fsdp` flag is passed and `fsdp_version=1`).",
    )
    fsdp_args.add_argument(
        "--fsdp_reshard_after_forward",
        type=str,
        default="true",
        help="FSDP's Reshard After Forward Strategy. (useful only when `use_fsdp` flag is passed). Supports either boolean (FSDP2) or `FULL_SHARD | SHARD_GRAD_OP | NO_RESHARD` (FSDP1).",
    )
    fsdp_args.add_argument(
        "--fsdp_auto_wrap_policy",
        type=str,
        default=None,
        help="FSDP's auto wrap policy. (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_transformer_layer_cls_to_wrap",
        default=None,
        type=str,
        help="Transformer layer class name (case-sensitive) to wrap ,e.g, `BertLayer`, `GPTJBlock`, `T5Block` .... "
        "(useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_backward_prefetch",
        default=None,
        type=str,
        help="FSDP's backward prefetch policy. (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_state_dict_type",
        default=None,
        type=str,
        help="FSDP's state dict type. (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_forward_prefetch",
        default="false",
        type=str,
        help="If True, then FSDP explicitly prefetches the next upcoming "
        "all-gather while executing in the forward pass (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_use_orig_params",
        default="true",
        type=str,
        help="If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres."
        " (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_cpu_ram_efficient_loading",
        default="true",
        type=str,
        help="If True, only the first process loads the pretrained model checkoint while all other processes have empty weights. "
        "Only applicable for 🤗 Transformers. When using this, `--fsdp_sync_module_states` needs to True. "
        "(useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_sync_module_states",
        default="true",
        type=str,
        help="If True, each individually wrapped FSDP unit will broadcast module parameters from rank 0."
        " (useful only when `use_fsdp` flag is passed).",
    )
    fsdp_args.add_argument(
        "--fsdp_activation_checkpointing",
        default="false",
        type=str,
        help="Decides Whether (true|false) intermediate activations are freed during the forward pass, and a checkpoint is left as a placeholder. (useful only when `use_fsdp` flag is passed).",
    )
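    # Illustrative FSDP launch (policy and layer class names are examples):
    #   accelerate launch --use_fsdp --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP \
    #       --fsdp_transformer_layer_cls_to_wrap BertLayer train.py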

    # megatron_lm args
    megatron_lm_args = parser.add_argument_group("Megatron-LM Arguments", "Arguments related to Megatron-LM.")
    megatron_lm_args.add_argument(
        "--megatron_lm_tp_degree",
        type=int,
        default=1,
        help="Megatron-LM's Tensor Parallelism (TP) degree. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_pp_degree",
        type=int,
        default=1,
        help="Megatron-LM's Pipeline Parallelism (PP) degree. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_num_micro_batches",
        type=int,
        default=None,
        help="Megatron-LM's number of micro batches when PP degree > 1. (useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_sequence_parallelism",
        default=None,
        type=str,
        help="Decides Whether (true|false) to enable Sequence Parallelism when TP degree > 1. "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_recompute_activations",
        default=None,
        type=str,
        help="Decides Whether (true|false) to enable Selective Activation Recomputation. "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_use_distributed_optimizer",
        default=None,
        type=str,
        help="Decides Whether (true|false) to use distributed optimizer "
        "which shards optimizer state and gradients across Data Pralellel (DP) ranks. "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
    megatron_lm_args.add_argument(
        "--megatron_lm_gradient_clipping",
        default=1.0,
        type=float,
        help="Megatron-LM's gradient clipping value based on global L2 Norm (0 to disable). "
        "(useful only when `use_megatron_lm` flag is passed).",
    )
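    # Illustrative Megatron-LM launch (parallelism degrees are examples):
    #   accelerate launch --use_megatron_lm --megatron_lm_tp_degree 2 --megatron_lm_pp_degree 2 train.py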

    # FP8 arguments
    fp8_args = parser.add_argument_group(
        "FP8 Arguments", "Arguments related to FP8 training (requires `--mixed_precision=fp8`)"
    )
    fp8_args.add_argument(
        "--fp8_backend",
        type=str,
        choices=["te", "msamp"],
        help="Choose a backend to train with FP8 (te: TransformerEngine, msamp: MS-AMP)",
    )
    fp8_args.add_argument(
        "--fp8_use_autocast_during_eval",
        default=False,
        action="store_true",
        help="Whether to use FP8 autocast during eval mode (useful only when `--fp8_backend=te` is passed). Generally better metrics are found when this is not passed.",
    )
    fp8_args.add_argument(
        "--fp8_margin",
        type=int,
        default=0,
        help="The margin to use for the gradient scaling (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_interval",
        type=int,
        default=1,
        help="The interval to use for how often the scaling factor is recomputed (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_format",
        type=str,
        default="HYBRID",
        choices=["HYBRID", "E4M3", "E5M2"],
        help="The format to use for the FP8 recipe (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_amax_history_len",
        type=int,
        default=1024,
        help="The length of the history to use for the scaling factor computation (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_amax_compute_algo",
        type=str,
        default="most_recent",
        choices=["max", "most_recent"],
        help="The algorithm to use for the scaling factor computation. (useful only when `--fp8_backend=te` is passed).",
    )
    fp8_args.add_argument(
        "--fp8_override_linear_precision",
        type=lambda x: tuple(map(str_to_bool, x.split(","))),
        default=(False, False, False),
        help="Whether or not to execute `fprop`, `dgrad`, and `wgrad` GEMMS in higher precision. Should be passed in a comma-separated string of booleans (useful only when `--fp8_backend=te` is passed).",
    )
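    # e.g. `--fp8_override_linear_precision=false,false,true` is parsed by the lambda above
    # into the tuple (False, False, True) for (fprop, dgrad, wgrad).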
    fp8_args.add_argument(
        "--fp8_opt_level",
        type=str,
        default="O2",
        choices=["O1", "O2"],
        help="What level of 8-bit collective communication should be used with MS-AMP (useful only when `--fp8_backend=msamp` is passed).",
    )
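    # Illustrative FP8 launch (assumes a supported FP8 backend such as TransformerEngine is installed):
    #   accelerate launch --mixed_precision fp8 --fp8_backend te train.py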

    # AWS arguments
    aws_args = parser.add_argument_group("AWS Arguments", "Arguments related to AWS.")
    aws_args.add_argument(
        "--aws_access_key_id",
        type=str,
        default=None,
        help="The AWS_ACCESS_KEY_ID used to launch the Amazon SageMaker training job",
    )
    aws_args.add_argument(
        "--aws_secret_access_key",
        type=str,
        default=None,
        help="The AWS_SECRET_ACCESS_KEY used to launch the Amazon SageMaker training job.",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Whether to print out the torch.distributed stack trace when something fails.",
    )
    parser.add_argument(
        "training_script",
        type=str,
        help=(
            "The full path to the script to be launched in parallel, followed by all the arguments for the training "
            "script."
        ),
    )

    # MPI arguments
    mpirun_args = parser.add_argument_group("MPI Arguments", "Arguments related to mpirun for Multi-CPU")
    mpirun_args.add_argument(
        "--mpirun_hostfile",
        type=str,
        default=None,
        help="Location for a hostfile for using Accelerate to launch a multi-CPU training job with mpirun. This will "
        "get passed to the MPI --hostfile or -f parameter, depending on which MPI program is installed.",
    )
    mpirun_args.add_argument(
        "--mpirun_ccl",
        type=int,
        default=1,
        help="The number of oneCCL worker threads when using Accelerate to launch multi-CPU training with mpirun.",
    )

    # Other arguments of the training scripts
    parser.add_argument("training_script_args", nargs=argparse.REMAINDER, help="Arguments of the training script.")

    if subparsers is not None:
        parser.set_defaults(func=launch_command)
    return parser
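
A minimal sketch of exercising this parser programmatically (normally the `accelerate launch` CLI builds and consumes it for you); the flag values and the `train.py` name below are illustrative placeholders, not defaults from the source:

    from accelerate.commands.launch import launch_command_parser

    # Standalone parser (no subparsers passed), i.e. the `else` branch above.
    parser = launch_command_parser()
    args = parser.parse_args(
        ["--num_processes", "2", "--mixed_precision", "bf16", "train.py", "--lr", "3e-4"]
    )
    # Everything after the training script lands in `training_script_args` (argparse.REMAINDER).
    print(args.num_processes, args.mixed_precision, args.training_script, args.training_script_args)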