in horovod/runner/launch.py
def parse_args():
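"""Parse horovodrun command line arguments, apply any --config-file overrides, and return the resulting argparse Namespace."""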
override_args = set()
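# Records which tunable arguments were explicitly set on the command line via the
# make_override_* actions below; passed to config_parser.set_args_from_config()
# when merging values from --config-file.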
parser = argparse.ArgumentParser(description='Horovod Runner')
parser.add_argument('-v', '--version', action='version', version=horovod.__version__,
help='Shows Horovod version.')
np_arg = parser.add_argument('-np', '--num-proc', action='store', dest='np',
type=int, required=not lsf.LSFUtils.using_lsf(),
help='Total number of training processes. In elastic mode, '
'number of processes required before training can start.')
parser.add_argument('-cb', '--check-build', action=make_check_build_action(np_arg), nargs=0,
help='Shows which frameworks and libraries have been built into Horovod.')
parser.add_argument('--disable-cache', action='store_true',
dest='disable_cache',
help='If this flag is not set, horovodrun will perform '
'the initialization checks only once every 60 '
'minutes -- provided the checks pass successfully. '
'Otherwise, all the checks will run every time '
'horovodrun is called.')
parser.add_argument('--start-timeout', action='store',
dest='start_timeout', type=int,
help='Horovodrun has to perform all the checks and '
'start the processes before the specified '
'timeout. The default value is 30 seconds. '
'Alternatively, the environment variable '
'HOROVOD_START_TIMEOUT can also be used to '
'specify the initialization timeout.')
parser.add_argument('--network-interface', action='store', dest='nics',
help='Network interfaces that can be used for communication, separated by '
'commas. If not specified, Horovod will find the common NICs among all '
'the workers and use those; for example, --network-interface "eth0,eth1".')
parser.add_argument('--output-filename', action='store',
help='For Gloo, writes stdout / stderr of all processes to a filename of the form '
'<output_filename>/rank.<rank>/<stdout | stderr>. The <rank> will be padded with 0 '
'characters to ensure lexicographical order. For MPI, delegates its behavior to mpirun.')
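# Illustrative layout (path hypothetical): with --output-filename logs, rank 2's
# stdout would end up in something like logs/rank.2/stdout, with the rank
# zero-padded as needed so the per-rank directories sort lexicographically.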
parser.add_argument('--verbose', action='store_true',
dest='verbose',
help='If this flag is set, extra messages will '
'be printed.')
parser.add_argument('command', nargs=argparse.REMAINDER,
help='Command to be executed.')
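# Everything remaining after the recognized options is treated as the training
# command, e.g. (illustrative): horovodrun -np 4 -H host1:2,host2:2 python train.py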
parser.add_argument('--config-file', action='store', dest='config_file',
help='Path to YAML file containing runtime parameter configuration for Horovod. '
'Note that this will override any command line arguments provided before '
'this argument, and will be overridden by any arguments that come after it.')
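# Precedence example (cfg.yaml is a hypothetical path): in
#   horovodrun -np 4 --cycle-time-ms 10 --config-file cfg.yaml python train.py
# the config file appears after --cycle-time-ms, so a cycle time defined in the
# file would override the flag; any flags given after --config-file would in
# turn override the file.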
group_ssh = parser.add_argument_group('SSH arguments')
group_ssh.add_argument('-p', '--ssh-port', action='store', dest='ssh_port',
type=int, help='SSH port on all the hosts.')
group_ssh.add_argument('-i', '--ssh-identity-file', action='store', dest='ssh_identity_file',
help='File on the driver from which the identity (private key) is read.')
group_params = parser.add_argument_group('tuneable parameter arguments')
group_params.add_argument('--fusion-threshold-mb', action=make_override_action(override_args), type=int,
help='Fusion buffer threshold in MB. This is the maximum amount of '
'tensor data that can be fused together into a single batch '
'during allreduce / allgather. Setting 0 disables tensor fusion. '
'(default: 64)')
group_params.add_argument('--cycle-time-ms', action=make_override_action(override_args), type=float,
help='Cycle time in ms. This is the delay between each tensor fusion '
'cycle. The larger the cycle time, the more batching, but the '
'greater the latency between each allreduce / allgather operation. '
'(default: 5)')
group_params.add_argument('--cache-capacity', action=make_override_action(override_args), type=int,
help='Maximum number of tensor names that will be cached to reduce amount '
'of coordination required between workers before performing allreduce / '
'allgather. (default: 1024)')
group_hierarchical_allreduce = group_params.add_mutually_exclusive_group()
group_hierarchical_allreduce.add_argument('--hierarchical-allreduce',
action=make_override_true_action(override_args),
help='Perform hierarchical allreduce between workers instead of '
'ring allreduce. Hierarchical allreduce performs a local '
'allreduce / gather within a host, then a parallel cross allreduce '
'between equal local ranks across workers, and finally a '
'local gather.')
group_hierarchical_allreduce.add_argument('--no-hierarchical-allreduce', dest='hierarchical_allreduce',
action=make_override_false_action(override_args),
help='Explicitly disable hierarchical allreduce to prevent autotuning '
'from adjusting it.')
group_hierarchical_allgather = group_params.add_mutually_exclusive_group()
group_hierarchical_allgather.add_argument('--hierarchical-allgather',
action=make_override_true_action(override_args),
help='Perform hierarchical allgather between workers instead of '
'ring allgather. See hierarchical allreduce for algorithm details.')
group_hierarchical_allgather.add_argument('--no-hierarchical-allgather', dest='hierarchical_allgather',
action=make_override_false_action(override_args),
help='Explicitly disable hierarchical allgather to prevent autotuning '
'from adjusting it.')
group_autotune = parser.add_argument_group('autotune arguments')
group_autotune_enabled = group_autotune.add_mutually_exclusive_group()
group_autotune_enabled.add_argument('--autotune', action=make_override_true_action(override_args),
help='Perform autotuning to select parameter argument values that maximize '
'throughput for allreduce / allgather. Any parameter explicitly set will '
'be held constant during tuning.')
group_autotune_enabled.add_argument('--no-autotune', dest='autotune',
action=make_override_false_action(override_args), help=argparse.SUPPRESS)
group_autotune.add_argument('--autotune-log-file', action=make_override_action(override_args),
help='Comma-separated log of trials containing each hyperparameter and the '
'score of the trial. The last row will always contain the best value '
'found.')
group_autotune.add_argument('--autotune-warmup-samples', action=make_override_action(override_args),
type=int, default=3,
help='Number of samples to discard before beginning the optimization process '
'during autotuning. Performance during the first few batches can be '
'affected by initialization and cache warmups. (default: %(default)s)')
group_autotune.add_argument('--autotune-steps-per-sample', action=make_override_action(override_args),
type=int, default=10,
help='Number of steps (approximate) to record before observing a sample. The sample '
'score is defined to be the median score over all batches within the sample. The '
'more batches per sample, the less variance in sample scores, but the longer '
'autotuning will take. (default: %(default)s)')
group_autotune.add_argument('--autotune-bayes-opt-max-samples', action=make_override_action(override_args),
type=int, default=20,
help='Maximum number of samples to collect for each Bayesian optimization process. '
'(default: %(default)s)')
group_autotune.add_argument('--autotune-gaussian-process-noise', action=make_override_action(override_args),
type=float, default=0.8,
help='Regularization value [0, 1] applied to account for noise in samples. '
'(default: %(default)s)')
group_elastic = parser.add_argument_group('elastic arguments')
group_elastic.add_argument('--min-np', action='store', dest='min_np', type=int,
help='Minimum number of processes running for training to continue. If number of '
'available processes dips below this threshold, then training will wait for '
'more instances to become available. Defaults to --num-proc.')
group_elastic.add_argument('--max-np', action='store', dest='max_np', type=int,
help='Maximum number of training processes, beyond which no additional '
'processes will be created. If not specified, then will be unbounded.')
group_elastic.add_argument('--slots-per-host', action='store', dest='slots', type=int,
help='Number of slots for processes per host. Normally 1 slot per GPU per host. '
'If slots are provided by the output of the host discovery script, then '
'that value will override this parameter.')
group_elastic.add_argument('--elastic-timeout', action='store', dest='elastic_timeout', type=int,
help='Timeout for elastic initialization after re-scaling the cluster. '
'The default value is 600 seconds. Alternatively, '
'the environment variable HOROVOD_ELASTIC_TIMEOUT '
'can also be used to specify this timeout.')
group_elastic.add_argument('--reset-limit', action='store', dest='reset_limit', type=int,
help='Maximum number of times that the training job can scale up or down '
'the number of workers after which the job is terminated. (default: None)')
group_timeline = parser.add_argument_group('timeline arguments')
group_timeline.add_argument('--timeline-filename', action=make_override_action(override_args),
help='JSON file containing timeline of Horovod events used for debugging '
'performance. If this is provided, timeline events will be recorded, '
'which can have a negative impact on training performance.')
group_timeline_cycles = group_timeline.add_mutually_exclusive_group()
group_timeline_cycles.add_argument('--timeline-mark-cycles', action=make_override_true_action(override_args),
help='Mark cycles on the timeline. Only enabled if the timeline filename '
'is provided.')
group_timeline_cycles.add_argument('--no-timeline-mark-cycles', dest='timeline_mark_cycles',
action=make_override_false_action(override_args), help=argparse.SUPPRESS)
group_stall_check = parser.add_argument_group('stall check arguments')
group_stall_check_enabled = group_stall_check.add_mutually_exclusive_group()
group_stall_check_enabled.add_argument('--no-stall-check', action=make_override_true_action(override_args),
help='Disable the stall check. The stall check will log a warning when '
'workers have stalled waiting for other ranks to submit tensors.')
group_stall_check_enabled.add_argument('--stall-check', dest='no_stall_check',
action=make_override_false_action(override_args), help=argparse.SUPPRESS)
group_stall_check.add_argument('--stall-check-warning-time-seconds', action=make_override_action(override_args),
type=int, default=60,
help='Seconds until the stall warning is logged to stderr. (default: %(default)s)')
group_stall_check.add_argument('--stall-check-shutdown-time-seconds', action=make_override_action(override_args),
type=int, default=0,
help='Seconds until Horovod is shutdown due to stall. Shutdown will only take '
'place if this value is greater than the warning time. (default: %(default)s)')
group_library_options = parser.add_argument_group('library arguments')
group_mpi_threads_disable = group_library_options.add_mutually_exclusive_group()
group_mpi_threads_disable.add_argument('--mpi-threads-disable', action=make_override_true_action(override_args),
help='Disable MPI threading support. Only applies when running in MPI '
'mode. In some cases, multi-threaded MPI can slow down other '
'components, but is necessary if you wish to run mpi4py on top '
'of Horovod.')
group_mpi_threads_disable.add_argument('--no-mpi-threads-disable', dest='mpi_threads_disable',
action=make_override_false_action(override_args), help=argparse.SUPPRESS)
group_library_options.add_argument('--mpi-args', action='store', dest='mpi_args',
help='Extra MPI arguments to pass to mpirun. '
'They need to be passed with the equal sign to avoid parsing issues. '
'e.g. --mpi-args="--map-by ppr:6:node"')
group_library_options.add_argument('--tcp', action='store_true', dest='tcp_flag',
help='If this flag is set, only TCP is used for communication.')
group_library_options.add_argument('--binding-args', action='store', dest='binding_args',
help='Process binding arguments. Default is socket for Spectrum MPI '
'and no binding for other cases. e.g. --binding-args="--rankfile myrankfile"')
group_library_options.add_argument('--num-nccl-streams', action=make_override_action(override_args),
type=int, default=1,
help='Number of NCCL streams. Only applies when running with NCCL support. '
'(default: %(default)s)')
group_library_options.add_argument('--ccl-bgt-affinity', action=make_override_action(override_args),
type=int, default=0,
help='CCL background thread affinity. Only applies when running with CCL '
'support. (default: %(default)s)')
group_library_options.add_argument('--gloo-timeout-seconds', action=make_override_action(override_args),
type=int, default=30,
help='Timeout in seconds for Gloo operations to complete. '
'(default: %(default)s)')
group_logging = parser.add_argument_group('logging arguments')
group_logging.add_argument('--log-level', action=make_override_action(override_args),
choices=config_parser.LOG_LEVELS,
help='Minimum level to log to stderr from the Horovod backend. (default: WARNING).')
group_logging_timestamp = group_logging.add_mutually_exclusive_group()
group_logging_timestamp.add_argument('--log-hide-timestamp', action=make_override_true_action(override_args),
help='Hide the timestamp from Horovod log messages.')
group_logging_timestamp.add_argument('--no-log-hide-timestamp', dest='log_hide_timestamp',
action=make_override_false_action(override_args), help=argparse.SUPPRESS)
group_hosts_parent = parser.add_argument_group('host arguments')
group_hosts = group_hosts_parent.add_mutually_exclusive_group()
group_hosts.add_argument('-H', '--hosts', action='store', dest='hosts',
help='List of host names and the number of available slots '
'for running processes on each, of the form: <hostname>:<slots> '
'(e.g.: host1:2,host2:4,host3:1 indicating 2 processes can run on host1, '
'4 on host2, and 1 on host3). If not specified, defaults to using '
'localhost:<np>')
group_hosts.add_argument('-hostfile', '--hostfile', action='store', dest='hostfile',
help='Path to a host file containing the list of host names and the number of '
'available slots. Each line of the file must be of the form: '
'<hostname> slots=<slots>')
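# Example hostfile contents (hostnames hypothetical):
#   worker-1 slots=4
#   worker-2 slots=4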
group_hosts.add_argument('--host-discovery-script', action=make_override_action(override_args),
help='Used for elastic training (autoscaling and fault tolerance). '
'An executable script that will print to stdout every available host (one per '
'newline character) that can be used to run worker processes. Optionally '
'specifies the number of slots on the same line as the hostname as: "hostname:slots". '
'Providing a discovery script enables elastic training (see elastic arguments). '
'The job will fail immediately if execution of the script returns a non-zero exit '
'code on the first call. Subsequent calls will be retried until timeout.')
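# Example discovery script output (hostnames hypothetical), one host per line,
# optionally suffixed with the slot count:
#   worker-1:4
#   worker-2:4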
group_controller_parent = parser.add_argument_group('controller arguments')
group_controller = group_controller_parent.add_mutually_exclusive_group()
group_controller.add_argument('--gloo', action='store_true', dest='use_gloo',
help='Run Horovod using the Gloo controller. This will '
'be the default if Horovod was not built with MPI support.')
group_controller.add_argument('--mpi', action='store_true', dest='use_mpi',
help='Run Horovod using the MPI controller. This will '
'be the default if Horovod was built with MPI support.')
group_controller.add_argument('--jsrun', action='store_true', dest='use_jsrun',
help='Launch Horovod processes with jsrun and use the MPI controller. '
'This will be the default if jsrun is installed and Horovod '
'was built with MPI support.')
args = parser.parse_args()
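# Merge values from the optional YAML config file into args; override_args tells
# the config parser which arguments were set explicitly on the command line and
# is used when deciding which config values to apply.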
if args.config_file:
with open(args.config_file, 'r') as f:
config = yaml.load(f, Loader=yaml.FullLoader)
config_parser.set_args_from_config(args, config, override_args)
config_parser.validate_config_args(args)
args.run_func = None
if args.check_build:
check_build(args.verbose)
return args