in horovod/tensorflow/__init__.py
def DistributedOptimizer(optimizer, name=None, use_locking=False, device_dense='',
device_sparse='', compression=Compression.none,
sparse_as_dense=False, backward_passes_per_step=1,
op=Average, gradient_predivide_factor=1.0):
"""Construct a new DistributedOptimizer, which uses another optimizer
under the hood for computing single-process gradient values and
applying gradient updates after the gradient values have been combined
across all the Horovod ranks.

Args:
optimizer:
Optimizer to use for computing gradients and applying updates.
name:
Optional name prefix for the operations created when applying
gradients. Defaults to "Distributed" followed by the provided
optimizer type.
use_locking:
Whether to use locking when updating variables.
See Optimizer.__init__ for more info.
device_dense:
Device to be used for dense tensors. Uses GPU by default
if Horovod was built with HOROVOD_GPU_OPERATIONS.
device_sparse:
Device to be used for sparse tensors. Uses GPU by default
if Horovod was built with HOROVOD_GPU_OPERATIONS.
compression:
Compression algorithm used during allreduce to reduce the amount
of data sent during each parameter update step. Defaults to
not using compression.
sparse_as_dense:
Treat all sparse gradients as dense tensors. This can help improve
performance and memory utilization if the original sparse gradient
has high density. Defaults to false.
backward_passes_per_step:
Number of backward passes to perform before calling hvd.allreduce.
This allows accumulating updates over multiple mini-batches before
reducing and applying them. Values greater than 1 are currently only
supported when wrapping a legacy optimizer with op == Adasum.
op:
The reduction operation to use when combining gradients across
different ranks.
gradient_predivide_factor:
If op == Average, gradient_predivide_factor splits the averaging
before and after the sum: gradients are scaled by
1.0 / gradient_predivide_factor before the allreduce sum and by
gradient_predivide_factor / size after it, so the overall result is
still an average. For example, with gradient_predivide_factor=2.0 on
4 ranks, gradients are scaled by 0.5 before the sum and by
2.0 / 4 = 0.5 after it. Not yet supported on ROCm builds.
"""
if gradient_predivide_factor != 1.0:
if rocm_built():
raise ValueError('gradient_predivide_factor not supported yet with ROCm')
if op != Average:
raise ValueError('gradient_predivide_factor not supported with op != Average')
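# Dispatch on the wrapped optimizer's type: legacy TensorFlow optimizers are
# wrapped directly (with a dedicated wrapper for Adasum), while Keras optimizers
# are delegated to the horovod.tensorflow.keras wrapper.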
if isinstance(optimizer, _LegacyOptimizer):
if op == Adasum:
return _DistributedAdasumOptimizer(optimizer, name, use_locking, device_dense,
device_sparse, compression, backward_passes_per_step)
else:
if backward_passes_per_step > 1:
raise ValueError('backward_passes_per_step>1 is not supported yet with '
'op != Adasum')
return _DistributedOptimizer(optimizer, name, use_locking, device_dense,
device_sparse, compression, sparse_as_dense, op,
gradient_predivide_factor)
elif isinstance(optimizer, tf.keras.optimizers.Optimizer):
if op == Adasum:
raise ValueError('op == Adasum is not supported yet with Keras')
if backward_passes_per_step > 1:
raise ValueError('backward_passes_per_step > 1 is not supported yet with Keras')
import horovod.tensorflow.keras as hvd_k
return hvd_k.DistributedOptimizer(optimizer, name, device_dense, device_sparse,
compression, sparse_as_dense, gradient_predivide_factor)
else:
raise ValueError('Provided optimizer doesn\'t inherit from either legacy '
'TensorFlow or Keras optimizer: %s' % optimizer)
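# A minimal usage sketch (illustrative only, not executed as part of this module):
# wrapping a legacy TensorFlow optimizer for data-parallel training. The `loss`
# tensor and the learning-rate scaling by hvd.size() below are hypothetical
# placeholders, and Horovod must be initialized with hvd.init() first.
#
#     import tensorflow as tf
#     import horovod.tensorflow as hvd
#
#     hvd.init()
#     opt = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001 * hvd.size())
#     opt = hvd.DistributedOptimizer(opt, op=hvd.Average)
#     train_op = opt.minimize(loss)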