def run()

in horovod/runner/__init__.py [0:0]


def run(
    func,
    args=(),
    kwargs=None,
    np=1,
    min_np=None,
    max_np=None,
    slots=None,
    reset_limit=None,
    hosts=None,
    hostfile=None,
    start_timeout=None,
    ssh_port=None,
    ssh_identity_file=None,
    disable_cache=None,
    output_filename=None,
    verbose=None,
    use_gloo=None,
    use_mpi=None,
    mpi_args=None,
    network_interface=None,
):
    """
    Launch a Horovod job that runs `func` in every process and collect the return values.

    The call packs its arguments into a `_HorovodArgs` object and hands that object to
    `_run` from the sibling `launch` module; whatever `_run` returns is returned unchanged.

    :param func: The function to be run in the Horovod job processes. Its return value is
                 collected as the return value of the corresponding Horovod process.
                 This function must be compatible with pickle.
    :param args: Positional arguments to pass to `func`.
    :param kwargs: Keyword arguments to pass to `func`. `None` is treated as no keyword
                   arguments.
    :param np: Number of Horovod processes.
    :param min_np: Minimum number of processes running for training to continue. If the number
                   of available processes dips below this threshold, training will wait for
                   more instances to become available. Defaults to np.
    :param max_np: Maximum number of training processes, beyond which no additional processes
                   will be created. If not specified, the number is unbounded.
    :param slots: Number of slots for processes per host. Normally 1 slot per GPU per host.
                  If slots are provided by the output of the host discovery script, then that
                  value will override this parameter.
    :param reset_limit: Maximum number of times the training job can scale the number of
                        workers up or down before the job is terminated. A reset event occurs
                        when workers are added to or removed from the job after the initial
                        registration. A reset_limit of 0 means the job cannot change
                        membership after its initial set of workers; a reset_limit of 1 means
                        it can resize at most once, etc.
    :param hosts: List of host names and the number of available slots for running processes
                  on each, of the form: <hostname>:<slots>
                  (e.g.: host1:2,host2:4,host3:1 indicating 2 processes can run on host1,
                  4 on host2, and 1 on host3). If not specified, defaults to using
                  localhost:<np>. Mutually exclusive with `hostfile`.
    :param hostfile: Path to a host file containing the list of host names and the number of
                     available slots. Each line of the file must be of the form:
                     <hostname> slots=<slots>
                     Mutually exclusive with `hosts`.
    :param start_timeout: Horovodrun has to perform all the checks and start the processes
                          before the specified timeout. The default value is 30 seconds.
                          Alternatively, the environment variable HOROVOD_START_TIMEOUT can
                          be used to specify the initialization timeout.
    :param ssh_port: SSH port on all the hosts.
    :param ssh_identity_file: SSH identity (private key) file.
    :param disable_cache: If the flag is not set, horovodrun will perform the initialization
                          checks only once every 60 minutes -- if the checks successfully
                          pass. Otherwise, all the checks will run every time horovodrun is
                          called.
    :param output_filename: For Gloo, writes stdout / stderr of all processes to a filename of
                            the form <output_filename>/rank.<rank>/<stdout | stderr>. The
                            <rank> will be padded with 0 characters to ensure lexicographical
                            order. For MPI, delegates its behavior to mpirun.
    :param verbose: If this flag is set, extra messages will be printed.
    :param use_gloo: Run Horovod using the Gloo controller. This will be the default if
                     Horovod was not built with MPI support. Mutually exclusive with `use_mpi`.
    :param use_mpi: Run Horovod using the MPI controller. This will be the default if Horovod
                    was built with MPI support. Mutually exclusive with `use_gloo`.
    :param mpi_args: Extra arguments for the MPI controller. This is only used when use_mpi
                     is True.
    :param network_interface: Network interfaces to use for communication, separated by
                              commas. If not specified, Horovod will find the common NICs
                              among all the workers and use those; example: eth0,eth1.
    :return: A list containing the values returned by all Horovod processes.
             The index in the list corresponds to the rank of each Horovod process.
    :raises ValueError: If both `hosts` and `hostfile` are given, or if both `use_gloo` and
                        `use_mpi` are truthy.
    """
    # Imported at call time rather than at module import time.
    from .launch import _run

    call_kwargs = {} if kwargs is None else kwargs

    # Zero-argument closure over the user function and its arguments; this is
    # what gets stored on the launcher arguments as the callable to execute.
    def wrapped_func():
        return func(*args, **call_kwargs)

    # The host list may come from exactly one source.
    if not (hosts is None or hostfile is None):
        raise ValueError('Argument hosts and hostfile only allow one provided.')

    # Only one controller may be requested explicitly.
    if use_gloo and use_mpi:
        raise ValueError('Argument use_gloo and use_mpi only allow one set True.')

    hargs = _HorovodArgs()

    # (attribute on the launcher arguments, value) pairs, applied in order.
    # Note that `network_interface` is stored under the attribute name `nics`.
    settings = (
        ('np', np),
        ('min_np', min_np),
        ('max_np', max_np),
        ('slots', slots),
        ('reset_limit', reset_limit),
        ('hosts', hosts),
        ('hostfile', hostfile),
        ('start_timeout', start_timeout),
        ('ssh_port', ssh_port),
        ('ssh_identity_file', ssh_identity_file),
        ('mpi_args', mpi_args),
        ('disable_cache', disable_cache),
        ('output_filename', output_filename),
        ('verbose', verbose),
        ('use_gloo', use_gloo),
        ('use_mpi', use_mpi),
        ('nics', network_interface),
        ('run_func', wrapped_func),
    )
    for attr_name, attr_value in settings:
        setattr(hargs, attr_name, attr_value)

    return _run(hargs)