in horovod/runner/launch.py
def parse_args():
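"""Parse horovodrun command line arguments, apply any --config-file overrides, and return the resulting argparse Namespace."""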
override_args = set()
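# Records which tunable arguments were explicitly set on the command line via the
# make_override_* actions below; passed to config_parser.set_args_from_config()
# when merging values from --config-file.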
parser = argparse.ArgumentParser(description='Horovod Runner')
parser.add_argument('-v', '--version', action='version', version=horovod.__version__,
help='Shows Horovod version.')
np_arg = parser.add_argument('-np', '--num-proc', action='store', dest='np',
type=int, required=not lsf.LSFUtils.using_lsf(),
help='Total number of training processes. In elastic mode, '
'number of processes required before training can start.')
parser.add_argument('-cb', '--check-build', action=make_check_build_action(np_arg), nargs=0,
help='Shows which frameworks and libraries have been built into Horovod.')
parser.add_argument('--disable-cache', action='store_true',
dest='disable_cache',
help='If this flag is not set, horovodrun will perform '
'the initialization checks only once every 60 '
'minutes -- provided the checks pass successfully. '
'Otherwise, all the checks will run every time '
'horovodrun is called.')
parser.add_argument('--start-timeout', action='store',
dest='start_timeout', type=int,
help='Horovodrun has to perform all the checks and '
'start the processes before the specified '
'timeout. The default value is 30 seconds. '
'Alternatively, the environment variable '
'HOROVOD_START_TIMEOUT can also be used to '
'specify the initialization timeout.')
parser.add_argument('--network-interface', action='store', dest='nics',
help='Network interfaces that can be used for communication, separated by '
'commas. If not specified, Horovod will find the common NICs among all '
'the workers and use those; for example, --network-interface "eth0,eth1".')
parser.add_argument('--output-filename', action='store',
help='For Gloo, writes stdout / stderr of all processes to a filename of the form '
'<output_filename>/rank.<rank>/<stdout | stderr>. The <rank> will be padded with 0 '
'characters to ensure lexicographical order. For MPI, delegates its behavior to mpirun.')
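# Illustrative layout (path hypothetical): with --output-filename logs, rank 2's
# stdout would end up in something like logs/rank.2/stdout, with the rank
# zero-padded as needed so the per-rank directories sort lexicographically.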
parser.add_argument('--verbose', action='store_true',
dest='verbose',
help='If this flag is set, extra messages will '
'be printed.')
parser.add_argument('command', nargs=argparse.REMAINDER,
help='Command to be executed.')
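# Everything remaining after the recognized options is treated as the training
# command, e.g. (illustrative): horovodrun -np 4 -H host1:2,host2:2 python train.py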
parser.add_argument('--config-file', action='store', dest='config_file',
help='Path to YAML file containing runtime parameter configuration for Horovod. '
'Note that this will override any command line arguments provided before '
'this argument, and will be overridden by any arguments that come after it.')
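# Precedence example (cfg.yaml is a hypothetical path): in
#   horovodrun -np 4 --cycle-time-ms 10 --config-file cfg.yaml python train.py
# the config file appears after --cycle-time-ms, so a cycle time defined in the
# file would override the flag; any flags given after --config-file would in
# turn override the file.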
group_ssh = parser.add_argument_group('SSH arguments')
group_ssh.add_argument('-p', '--ssh-port', action='store', dest='ssh_port',
type=int, help='SSH port on all the hosts.')
group_ssh.add_argument('-i', '--ssh-identity-file', action='store', dest='ssh_identity_file',
help='File on the driver from which the identity (private key) is read.')
group_params = parser.add_argument_group('tuneable parameter arguments')
group_params.add_argument('--fusion-threshold-mb', action=make_override_action(override_args), type=int,
help='Fusion buffer threshold in MB. This is the maximum amount of '
'tensor data that can be fused together into a single batch '
'during allreduce / allgather. Setting 0 disables tensor fusion. '
'(default: 64)')
group_params.add_argument('--cycle-time-ms', action=make_override_action(override_args), type=float,
help='Cycle time in ms. This is the delay between each tensor fusion '
'cycle. The larger the cycle time, the more batching, but the '
'greater the latency between each allreduce / allgather operation. '
'(default: 5)')
group_params.add_argument('--cache-capacity', action=make_override_action(override_args), type=int,
help='Maximum number of tensor names that will be cached to reduce amount '
'of coordination required between workers before performing allreduce / '
'allgather. (default: 1024)')
group_hierarchical_allreduce = group_params.add_mutually_exclusive_group()
group_hierarchical_allreduce.add_argument('--hierarchical-allreduce',
action=make_override_true_action(override_args),
help='Perform hierarchical allreduce between workers instead of '
'ring allreduce. Hierarchical allreduce performs a local '
'allreduce / gather within a host, then a parallel cross allreduce '
'between equal local ranks across workers, and finally a '
'local gather.')
group_hierarchical_allreduce.add_argument('--no-hierarchical-allreduce', dest='hierarchical_allreduce',
action=make_override_false_action(override_args),
help='Explicitly disable hierarchical allreduce to prevent autotuning '
'from adjusting it.')
group_hierarchical_allgather = group_params.add_mutually_exclusive_group()
group_hierarchical_allgather.add_argument('--hierarchical-allgather',
action=make_override_true_action(override_args),
help='Perform hierarchical allgather between workers instead of '
'ring allgather. See hierarchical allreduce for algorithm details.')
group_hierarchical_allgather.add_argument('--no-hierarchical-allgather', dest='hierarchical_allgather',
action=make_override_false_action(override_args),
help='Explicitly disable hierarchical allgather to prevent autotuning '
'from adjusting it.')
group_autotune = parser.add_argument_group('autotune arguments')
group_autotune_enabled = group_autotune.add_mutually_exclusive_group()
group_autotune_enabled.add_argument('--autotune', action=make_override_true_action(override_args),
help='Perform autotuning to select parameter argument values that maximize '
'throughput for allreduce / allgather. Any parameter explicitly set will '
'be held constant during tuning.')
group_autotune_enabled.add_argument('--no-autotune', dest='autotune',
action=make_override_false_action(override_args), help=argparse.SUPPRESS)
group_autotune.add_argument('--autotune-log-file', action=make_override_action(override_args),
help='Comma-separated log of trials containing each hyperparameter and the '
'score of the trial. The last row will always contain the best value '
'found.')
group_autotune.add_argument('--autotune-warmup-samples', action=make_override_action(override_args),
type=int, default=3,
help='Number of samples to discard before beginning the optimization process '
'during autotuning. Performance during the first few batches can be '
'affected by initialization and cache warmups. (default: %(default)s)')
group_autotune.add_argument('--autotune-steps-per-sample', action=make_override_action(override_args),
type=int, default=10,
help='Number of steps (approximate) to record before observing a sample. The sample '
'score is defined to be the median score over all batches within the sample. The '
'more batches per sample, the less variance in sample scores, but the longer '
'autotuning will take. (default: %(default)s)')
group_autotune.add_argument('--autotune-bayes-opt-max-samples', action=make_override_action(override_args),
type=int, default=20,
help='Maximum number of samples to collect for each Bayesian optimization process. '
'(default: %(default)s)')
group_autotune.add_argument('--autotune-gaussian-process-noise', action=make_override_action(override_args),
type=float, default=0.8,
help='Regularization value [0, 1] applied to account for noise in samples. '
'(default: %(default)s)')
group_elastic = parser.add_argument_group('elastic arguments')
group_elastic.add_argument('--min-np', action='store', dest='min_np', type=int,
help='Minimum number of processes running for training to continue. If number of '
'available processes dips below this threshold, then training will wait for '
'more instances to become available. Defaults to --num-proc.')
group_elastic.add_argument('--max-np', action='store', dest='max_np', type=int,
help='Maximum number of training processes, beyond which no additional '
'processes will be created. If not specified, then will be unbounded.')
group_elastic.add_argument('--slots-per-host', action='store', dest='slots', type=int,
help='Number of slots for processes per host. Normally 1 slot per GPU per host. '
'If slots are provided by the output of the host discovery script, then '
'that value will override this parameter.')
group_elastic.add_argument('--elastic-timeout', action='store', dest='elastic_timeout', type=int,
help='Timeout for elastic initialization after re-scaling the cluster. '
'The default value is 600 seconds. Alternatively, '
'the environment variable HOROVOD_ELASTIC_TIMEOUT '
'can also be used to specify this timeout.')
group_elastic.add_argument('--reset-limit', action='store', dest='reset_limit', type=int,
help='Maximum number of times that the training job can scale up or down '
'the number of workers after which the job is terminated. (default: None)')
group_timeline = parser.add_argument_group('timeline arguments')
group_timeline.add_argument('--timeline-filename', action=make_override_action(override_args),
help='JSON file containing timeline of Horovod events used for debugging '
'performance. If this is provided, timeline events will be recorded, '
'which can have a negative impact on training performance.')
group_timeline_cycles = group_timeline.add_mutually_exclusive_group()
group_timeline_cycles.add_argument('--timeline-mark-cycles', action=make_override_true_action(override_args),
help='Mark cycles on the timeline. Only enabled if the timeline filename '
'is provided.')
group_timeline_cycles.add_argument('--no-timeline-mark-cycles', dest='timeline_mark_cycles',
action=make_override_false_action(override_args), help=argparse.SUPPRESS)
group_stall_check = parser.add_argument_group('stall check arguments')
group_stall_check_enabled = group_stall_check.add_mutually_exclusive_group()
group_stall_check_enabled.add_argument('--no-stall-check', action=make_override_true_action(override_args),
help='Disable the stall check. The stall check will log a warning when '
'workers have stalled waiting for other ranks to submit tensors.')
group_stall_check_enabled.add_argument('--stall-check', dest='no_stall_check',
action=make_override_false_action(override_args), help=argparse.SUPPRESS)
group_stall_check.add_argument('--stall-check-warning-time-seconds', action=make_override_action(override_args),
type=int, default=60,
help='Seconds until the stall warning is logged to stderr. (default: %(default)s)')
group_stall_check.add_argument('--stall-check-shutdown-time-seconds', action=make_override_action(override_args),
type=int, default=0,
help='Seconds until Horovod is shutdown due to stall. Shutdown will only take '
'place if this value is greater than the warning time. (default: %(default)s)')
group_library_options = parser.add_argument_group('library arguments')
group_mpi_threads_disable = group_library_options.add_mutually_exclusive_group()
group_mpi_threads_disable.add_argument('--mpi-threads-disable', action=make_override_true_action(override_args),
help='Disable MPI threading support. Only applies when running in MPI '
'mode. In some cases, multi-threaded MPI can slow down other '
'components, but is necessary if you wish to run mpi4py on top '
'of Horovod.')
group_mpi_threads_disable.add_argument('--no-mpi-threads-disable', dest='mpi_threads_disable',
action=make_override_false_action(override_args), help=argparse.SUPPRESS)
group_library_options.add_argument('--mpi-args', action='store', dest='mpi_args',
help='Extra MPI arguments to pass to mpirun. '
'They need to be passed with the equal sign to avoid parsing issues. '
'e.g. --mpi-args="--map-by ppr:6:node"')
group_library_options.add_argument('--tcp', action='store_true', dest='tcp_flag',
help='If this flag is set, only TCP is used for communication.')
group_library_options.add_argument('--binding-args', action='store', dest='binding_args',
help='Process binding arguments. Default is socket for Spectrum MPI '
'and no binding for other cases. e.g. --binding-args="--rankfile myrankfile"')
group_library_options.add_argument('--num-nccl-streams', action=make_override_action(override_args),
type=int, default=1,
help='Number of NCCL streams. Only applies when running with NCCL support. '
'(default: %(default)s)')
group_library_options.add_argument('--ccl-bgt-affinity', action=make_override_action(override_args),
type=int, default=0,
help='CCL background thread affinity. Only applies when running with CCL '
'support. (default: %(default)s)')
group_library_options.add_argument('--gloo-timeout-seconds', action=make_override_action(override_args),
type=int, default=30,
help='Timeout in seconds for Gloo operations to complete. '
'(default: %(default)s)')
group_logging = parser.add_argument_group('logging arguments')
group_logging.add_argument('--log-level', action=make_override_action(override_args),
choices=config_parser.LOG_LEVELS,
help='Minimum level to log to stderr from the Horovod backend. (default: WARNING).')
group_logging_timestamp = group_logging.add_mutually_exclusive_group()
group_logging_timestamp.add_argument('--log-hide-timestamp', action=make_override_true_action(override_args),
help='Hide the timestamp from Horovod log messages.')
group_logging_timestamp.add_argument('--no-log-hide-timestamp', dest='log_hide_timestamp',
action=make_override_false_action(override_args), help=argparse.SUPPRESS)
group_hosts_parent = parser.add_argument_group('host arguments')
group_hosts = group_hosts_parent.add_mutually_exclusive_group()
group_hosts.add_argument('-H', '--hosts', action='store', dest='hosts',
help='List of host names and the number of available slots '
'for running processes on each, of the form: <hostname>:<slots> '
'(e.g.: host1:2,host2:4,host3:1 indicating 2 processes can run on host1, '
'4 on host2, and 1 on host3). If not specified, defaults to using '
'localhost:<np>')
group_hosts.add_argument('-hostfile', '--hostfile', action='store', dest='hostfile',
help='Path to a host file containing the list of host names and the number of '
'available slots. Each line of the file must be of the form: '
'<hostname> slots=<slots>')
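# Example hostfile contents (hostnames hypothetical):
#   worker-1 slots=4
#   worker-2 slots=4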
group_hosts.add_argument('--host-discovery-script', action=make_override_action(override_args),
help='Used for elastic training (autoscaling and fault tolerance). '
'An executable script that will print to stdout every available host (one per '
'newline character) that can be used to run worker processes. Optionally '
'specifies the number of slots on the same line as the hostname as: "hostname:slots". '
'Providing a discovery script enables elastic training (see elastic arguments). '
'The job will fail immediately if execution of the script returns a non-zero exit '
'code on the first call. Subsequent calls will be retried until timeout.')
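# Example discovery script output (hostnames hypothetical), one host per line,
# optionally suffixed with the slot count:
#   worker-1:4
#   worker-2:4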
group_controller_parent = parser.add_argument_group('controller arguments')
group_controller = group_controller_parent.add_mutually_exclusive_group()
group_controller.add_argument('--gloo', action='store_true', dest='use_gloo',
help='Run Horovod using the Gloo controller. This will '
'be the default if Horovod was not built with MPI support.')
group_controller.add_argument('--mpi', action='store_true', dest='use_mpi',
help='Run Horovod using the MPI controller. This will '
'be the default if Horovod was built with MPI support.')
group_controller.add_argument('--jsrun', action='store_true', dest='use_jsrun',
help='Launch Horovod processes with jsrun and use the MPI controller. '
'This will be the default if jsrun is installed and Horovod '
'was built with MPI support.')
args = parser.parse_args()
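# Merge values from the optional YAML config file into args; override_args tells
# the config parser which arguments were set explicitly on the command line and
# is used when deciding which config values to apply.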
if args.config_file:
with open(args.config_file, 'r') as f:
config = yaml.load(f, Loader=yaml.FullLoader)
config_parser.set_args_from_config(args, config, override_args)
config_parser.validate_config_args(args)
args.run_func = None
if args.check_build:
check_build(args.verbose)
return args