in horovod/torch/mpi_ops.py [0:0]
def _allreduce_async(tensor, output, name, op, prescale_factor, postscale_factor):
    # Set the divisor for reduced gradients to average when necessary
    if op == Average:
        if rocm_built():
            # For ROCm, perform averaging at framework level
            divisor = size()
            op = Sum
        else:
            divisor = 1
    elif op == Adasum:
        if tensor.device.type != 'cpu' and gpu_available('torch'):
            if nccl_built():
                if not is_homogeneous():
                    raise NotImplementedError('Running GPU Adasum on heterogeneous cluster is not supported yet.')
                elif not num_rank_is_power_2(int(size() / local_size())):
                    raise NotImplementedError('Running GPU Adasum with non-power of 2 nodes is not supported yet.')
                if rocm_built():
                    # For ROCm, perform averaging at framework level
                    divisor = local_size()
                else:
                    divisor = 1
            else:
                warnings.warn('Adasum reduction does not currently support GPU reduction using MPI. Tensors are '
                              'copied to CPU memory instead. To use Adasum for GPU reduction, please compile Horovod '
                              'with HOROVOD_GPU_OPERATIONS=NCCL.')
                divisor = 1
        else:
            if not num_rank_is_power_2(size()):
                raise NotImplementedError('Running Adasum with non-power of 2 ranks is not supported yet.')
            divisor = 1
    else:
        divisor = 1
    function = _check_function(_allreduce_function_factory, tensor)
    try:
        handle = getattr(mpi_lib, function)(tensor, output, divisor,
                                            name.encode() if name is not None else _NULL, op,
                                            prescale_factor, postscale_factor)
    except RuntimeError as e:
        raise HorovodInternalError(e)
    _handle_map[handle] = (tensor, output)
    return handle
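
For context, a minimal usage sketch (not part of this file): the private helper above is normally reached through the public horovod.torch allreduce API, assuming a Horovod build that exposes hvd.Average and hvd.Adasum as reduction ops.

    # Minimal usage sketch, assuming the public horovod.torch API
    # (hvd.init, hvd.allreduce, hvd.Average, hvd.Adasum); these calls
    # eventually dispatch to _allreduce_async above.
    import torch
    import horovod.torch as hvd

    hvd.init()
    t = torch.ones(4)

    # Average: on ROCm builds the backend performs a Sum and the framework
    # divides by size() (the divisor set above); otherwise the backend
    # averages directly.
    avg = hvd.allreduce(t, op=hvd.Average)

    # Adasum: requires a power-of-two number of ranks (or nodes when running
    # over NCCL), as enforced by the NotImplementedError checks above.
    ada = hvd.allreduce(t, op=hvd.Adasum)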