def generate_jsrun_rankfile()

in horovod/runner/js_run.py [0:0]


def generate_jsrun_rankfile(settings, path=None):
    """
    Generates rankfile to use with jsrun.
    It splits the cores among the processes, which leads to best performance according to experiments.

    Args:
        settings: Settings for running jsrun.
                  Note: settings.num_proc and settings.hosts must not be None.
        path: Optional path of the rankfile.
              Note: this file will be overwritten.
    """
    cpu_per_gpu = (lsf.LSFUtils.get_num_cores() * lsf.LSFUtils.get_num_threads()) // lsf.LSFUtils.get_num_gpus()
    host_list = (x.split(':') for x in settings.hosts.split(','))

    # Verify and truncate host list if necessary
    validated_list = []
    remaining_slots = settings.num_proc
    for host, slots in host_list:
        slots = int(slots)
        if slots > lsf.LSFUtils.get_num_gpus():
            raise ValueError('Invalid host input, slot count for host \'{host}:{slots}\' is greater '
                             'than number of GPUs per host \'{gpus}\'.'.format(
                host=host, slots=slots, gpus=lsf.LSFUtils.get_num_gpus()))
        needed_slots = min(slots, remaining_slots)
        validated_list.append((host, needed_slots))
        remaining_slots -= needed_slots
        if remaining_slots == 0:
            break
    if remaining_slots != 0:
        raise ValueError('Not enough slots on the hosts to fulfill the {slots} requested.'.format(
            slots=settings.num_proc))

    # Generate rankfile
    path = tempfile.mktemp() if path is None else path
    with open(path, 'w') as tmp:
        tmp.write('overlapping_rs: allow\n')
        tmp.write('cpu_index_using: logical\n')
        rank = 0
        for host, slots in validated_list:
            cpu_val = 0
            tmp.write('\n')
            for s in range(slots):
                tmp.write('rank: {rank}: {{ hostname: {host}; cpu: {{{scpu}-{ecpu}}} ; gpu: * ; mem: * }}\n'.format(
                    rank=rank,
                    host=host,
                    scpu=cpu_val,
                    ecpu=cpu_val + cpu_per_gpu - 1
                ))
                rank += 1
                cpu_val += cpu_per_gpu
    return path