def get_optimizers()

in keras/backend/mxnet_backend.py [0:0]


def get_optimizers():
    # Imported lazily, inside the function, so that loading this backend does not
    # pull in keras.optimizers at module load time (avoids a circular import).
    import importlib
    optimizers = importlib.import_module('keras.optimizers')

    class MXOptimizer(optimizers.Optimizer, mx.optimizer.Optimizer):
        """Custom MXNet Optimizer wrapping Keras Optimizer.
        This is required because we cannot use Keras optimizer directly as MXNet backend does not
        support symbolic optimizers.
        """

        def __init__(self, lr, decay):
            super(MXOptimizer, self).__init__()
            self.lr = variable(lr)
            self.decay = variable(decay)

        def _get_lr(self, _):
            # Keras-style time-based decay: lr / (1 + decay * num_update), with lr
            # and decay read back from the backend variables as Python scalars.
            return self.lr.tensor.asscalar() / (1. + self.decay.tensor.asscalar() * self.num_update)

        def _get_lrs(self, _):
            # The index argument is ignored: every slot gets the same decayed rate.
            return [self._get_lr(_) for i in range(0, self.aggregate_num)]

        def get_config(self):
            config = {}
            if hasattr(self, 'clipnorm'):
                config['clipnorm'] = self.clipnorm
            if hasattr(self, 'clipvalue'):
                config['clipvalue'] = self.clipvalue
            return config
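
    # The wrapper keeps `lr` and `decay` as Keras backend variables so that
    # Keras-side code (e.g. a LearningRateScheduler callback) can read and update
    # them, while MXNet pulls the effective rate through `_get_lr()` on each update.
    # Illustrative decay schedule, taken directly from `_get_lr` above:
    #   effective_lr = lr / (1. + decay * num_update)
    #   e.g. lr=0.01, decay=1e-4, num_update=1000  ->  effective_lr ~= 0.00909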

    class SGD(MXOptimizer, mx.optimizer.SGD):
        """Stochastic gradient descent optimizer.

        Includes support for momentum,
        learning rate decay, and Nesterov momentum.

        # Arguments
            lr: float >= 0. Learning rate.
            momentum: float >= 0. Parameter that accelerates SGD
                in the relevant direction and dampens oscillations.
            decay: float >= 0. Learning rate decay over each update.
            nesterov: boolean. Whether to apply Nesterov momentum.
        """

        def __init__(self, lr=0.01, momentum=0., decay=0.,
                     nesterov=False, clipnorm=None, **kwargs):
            mx.optimizer.SGD.__init__(self, learning_rate=lr, momentum=momentum, clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)
            # Use only one aggregated update at a time, to fit the Keras optimizer requirements.
            self.aggregate_num = 1

        def get_config(self):
            config = {'lr': float(get_value(self.lr)),
                      'momentum': float(get_value(self.momentum)),
                      'decay': float(get_value(self.decay))}
            base_config = super(SGD, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
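
    # Illustrative usage (a sketch, not executed here; assumes the MXNet backend is
    # active and `model` is an existing Keras model). Note that `nesterov` is
    # accepted by the constructor above but not forwarded to mx.optimizer.SGD:
    #   model.compile(loss='categorical_crossentropy',
    #                 optimizer=SGD(lr=0.01, momentum=0.9, clipnorm=1.0))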

    class Adagrad(MXOptimizer, mx.optimizer.AdaGrad):
        """Adagrad optimizer.

        Adagrad is an optimizer with parameter-specific learning rates,
        which are adapted relative to how frequently a parameter gets
        updated during training. The more updates a parameter receives,
        the smaller the updates.

        It is recommended to leave the parameters of this optimizer
        at their default values.

        # Arguments
            lr: float >= 0. Initial learning rate.
            epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Learning rate decay over each update.

        # References
            - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)  # nopep8
        """

        def __init__(self, lr=0.01, epsilon=1e-8, decay=0., clipnorm=None, **kwargs):
            mx.optimizer.AdaGrad.__init__(self, learning_rate=lr, eps=epsilon, clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)

        def get_config(self):
            config = {'lr': float(get_value(self.lr)),
                      'decay': float(get_value(self.decay)),
                      'epsilon': float(get_value(self.float_stable_eps))}
            base_config = super(Adagrad, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
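
    # Illustrative usage (keyword names as defined above; `epsilon` maps to
    # mx.optimizer.AdaGrad's `eps`):
    #   model.compile(loss='mse', optimizer=Adagrad(lr=0.01, epsilon=1e-8))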

    class Adadelta(MXOptimizer, mx.optimizer.AdaDelta):
        """Adadelta optimizer.

        Adadelta is a more robust extension of Adagrad
        that adapts learning rates based on a moving window of gradient updates,
        instead of accumulating all past gradients. This way, Adadelta continues
        learning even when many updates have been done. Compared to Adagrad, in the
        original version of Adadelta you don't have to set an initial learning
        rate. In this version, initial learning rate and decay factor can
        be set, as in most other Keras optimizers.

        It is recommended to leave the parameters of this optimizer
        at their default values.

        # Arguments
            lr: float >= 0. Initial learning rate, defaults to 1.
                It is recommended to leave it at the default value.
            rho: float >= 0. Adadelta decay factor, corresponding to fraction of
                gradient to keep at each time step.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Initial learning rate decay.

        # References
            - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
        """

        def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0., clipnorm=None, **kwargs):
            mx.optimizer.AdaDelta.__init__(self, rho=rho, epsilon=epsilon, clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)

        def get_config(self):
            config = {'lr': float(get_value(self.lr)),
                      'rho': float(get_value(self.rho)),
                      'decay': float(get_value(self.decay)),
                      'epsilon': self.epsilon}
            base_config = super(Adadelta, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
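
    # Illustrative usage -- Adadelta is usually left at its defaults (lr=1.0, rho=0.95):
    #   model.compile(loss='mse', optimizer=Adadelta())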

    class Adam(MXOptimizer, mx.optimizer.Adam):
        """Adam optimizer.

        Default parameters follow those provided in the original paper.

        # Arguments
            lr: float >= 0. Learning rate.
            beta_1: float, 0 < beta < 1. Generally close to 1.
            beta_2: float, 0 < beta < 1. Generally close to 1.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Learning rate decay over each update.
            amsgrad: boolean. Whether to apply the AMSGrad variant of this
                algorithm from the paper "On the Convergence of Adam and
                Beyond". (Not supported here: this MXNet wrapper's constructor
                does not accept an `amsgrad` argument.)

        # References
            - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
            - [On the Convergence of Adam and Beyond](https://openreview.net/forum?id=ryQu7f-RZ)
        """

        def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
                     epsilon=1e-8, decay=0., clipnorm=None, **kwargs):
            mx.optimizer.Adam.__init__(self, learning_rate=lr, beta1=beta_1, beta2=beta_2,
                                       epsilon=epsilon, clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)

        def get_config(self):
            config = {'lr': float(get_value(self.lr)),
                      'beta_1': float(get_value(self.beta1)),
                      'beta_2': float(get_value(self.beta2)),
                      'decay': float(get_value(self.decay)),
                      'epsilon': self.epsilon}
            base_config = super(Adam, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
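
    # Illustrative usage (a sketch; `model` is assumed to be an existing Keras model):
    #   model.compile(loss='binary_crossentropy',
    #                 optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999))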

    class Adamax(MXOptimizer, mx.optimizer.Adamax):
        """Adamax optimizer from Adam paper's Section 7.

        It is a variant of Adam based on the infinity norm.
        Default parameters follow those provided in the paper.

        # Arguments
            lr: float >= 0. Learning rate.
            beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Learning rate decay over each update.

        # References
            - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
        """

        def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999, decay=0., clipnorm=None,
                     epsilon=1e-8, **kwargs):
            mx.optimizer.Adamax.__init__(self, learning_rate=lr, beta1=beta_1, beta2=beta_2,
                                         clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)
            self.epsilon = epsilon

        def get_config(self):
            config = {'lr': float(get_value(self.learning_rate)),
                      'beta_1': float(get_value(self.beta1)),
                      'beta_2': float(get_value(self.beta2)),
                      'decay': float(get_value(self.decay)),
                      'epsilon': self.epsilon}
            base_config = super(Adamax, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
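
    # Illustrative usage. As the constructor above shows, `epsilon` is stored only
    # for `get_config()`; it is not passed to the MXNet Adamax update:
    #   model.compile(loss='mse', optimizer=Adamax(lr=0.002))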

    class Nadam(MXOptimizer, mx.optimizer.Nadam):
        """Nesterov Adam optimizer.

        Much like Adam is essentially RMSprop with momentum,
        Nadam is Adam with Nesterov momentum.

        Default parameters follow those provided in the paper.
        It is recommended to leave the parameters of this optimizer
        at their default values.

        # Arguments
            lr: float >= 0. Learning rate.
            beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.

        # References
            - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
            - [On the importance of initialization and momentum in deep learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf)  # nopep8
        """

        def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0., clipnorm=None,
                     schedule_decay=0.004, **kwargs):
            mx.optimizer.Nadam.__init__(self, learning_rate=lr, beta1=beta_1, beta2=beta_2, epsilon=epsilon,
                                        schedule_decay=schedule_decay, clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)

        def get_config(self):
            config = {'lr': float(get_value(self.learning_rate)),
                      'beta_1': float(get_value(self.beta1)),
                      'beta_2': float(get_value(self.beta2)),
                      'epsilon': self.epsilon,
                      'schedule_decay': self.schedule_decay}
            base_config = super(Nadam, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
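
    # Illustrative usage (defaults mirror the constructor signature above):
    #   model.compile(loss='categorical_crossentropy',
    #                 optimizer=Nadam(lr=0.001, schedule_decay=0.004))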

    class RMSprop(MXOptimizer, mx.optimizer.RMSProp):
        """RMSProp optimizer.

        It is recommended to leave the parameters of this optimizer
        at their default values
        (except the learning rate, which can be freely tuned).

        This optimizer is usually a good choice for recurrent
        neural networks.

        # Arguments
            lr: float >= 0. Learning rate.
            rho: float >= 0.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Learning rate decay over each update.

        # References
            - [rmsprop: Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)  # nopep8
        """

        def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0., clipnorm=None, **kwargs):
            mx.optimizer.RMSProp.__init__(self, learning_rate=lr, gamma1=rho, epsilon=epsilon,
                                          clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)

        def get_config(self):
            config = {'lr': float(get_value(self.lr)),
                      'rho': float(get_value(self.gamma1)),
                      'decay': float(get_value(self.decay)),
                      'epsilon': self.epsilon}
            base_config = super(RMSprop, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
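
    # Illustrative usage; `rho` maps to mx.optimizer.RMSProp's `gamma1`:
    #   model.compile(loss='categorical_crossentropy',
    #                 optimizer=RMSprop(lr=0.001, rho=0.9))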

    return SGD, Adagrad, Adadelta, Adam, Adamax, RMSprop, Nadam
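
A minimal end-to-end sketch of how these wrapper classes are typically consumed once
the MXNet backend is active (the model, layer sizes, and data below are purely
illustrative assumptions; only the optimizer classes come from the code above):

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers  # with the MXNet backend, expected to expose the classes returned above

model = Sequential([Dense(10, activation='softmax', input_shape=(20,))])
model.compile(loss='categorical_crossentropy',
              optimizer=optimizers.Adam(lr=0.001),
              metrics=['accuracy'])

x = np.random.rand(32, 20)
y = np.eye(10)[np.random.randint(0, 10, size=32)]
model.fit(x, y, epochs=1, verbose=0)

# get_config() round-trips the wrapper's hyper-parameters
# ('lr', 'beta_1', 'beta_2', 'decay', 'epsilon', plus clipnorm/clipvalue if set).
print(model.optimizer.get_config())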