# keras/backend/mxnet_backend.py

def get_optimizers():
    # `mx` (the mxnet module), `variable` and `get_value` are provided by the
    # enclosing mxnet_backend module.
    import importlib
    optimizers = importlib.import_module('keras.optimizers')

    class MXOptimizer(optimizers.Optimizer, mx.optimizer.Optimizer):
        """Custom MXNet Optimizer wrapping a Keras Optimizer.

        This is required because Keras optimizers cannot be used directly:
        the MXNet backend does not support symbolic optimizers.
        """

        def __init__(self, lr, decay):
            super(MXOptimizer, self).__init__()
            self.lr = variable(lr)
            self.decay = variable(decay)

        def _get_lr(self, _):
            # Inverse-time decay of the learning rate per parameter update.
            return self.lr.tensor.asscalar() / (1. + self.decay.tensor.asscalar() * self.num_update)

        def _get_lrs(self, _):
            return [self._get_lr(_) for i in range(0, self.aggregate_num)]

        def get_config(self):
            config = {}
            if hasattr(self, 'clipnorm'):
                config['clipnorm'] = self.clipnorm
            if hasattr(self, 'clipvalue'):
                config['clipvalue'] = self.clipvalue
            return config
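
    # A worked sketch of the inverse-time decay computed by `_get_lr` above,
    # assuming hypothetical values lr=0.01 and decay=0.001:
    #
    #   num_update = 0:    0.01 / (1. + 0.001 * 0)   = 0.01
    #   num_update = 100:  0.01 / (1. + 0.001 * 100) = 0.01 / 1.1 ~= 0.00909
    #
    # so the effective learning rate shrinks as MXNet's update counter grows,
    # mirroring the per-update decay applied by the stock Keras optimizers.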

    class SGD(MXOptimizer, mx.optimizer.SGD):
        """Stochastic gradient descent optimizer.

        Includes support for momentum,
        learning rate decay, and Nesterov momentum.

        # Arguments
            lr: float >= 0. Learning rate.
            momentum: float >= 0. Parameter that accelerates SGD
                in the relevant direction and dampens oscillations.
            decay: float >= 0. Learning rate decay over each update.
            nesterov: boolean. Whether to apply Nesterov momentum.
        """

        def __init__(self, lr=0.01, momentum=0., decay=0.,
                     nesterov=False, clipnorm=None, **kwargs):
            # NOTE: `nesterov` is accepted for Keras API compatibility but is
            # not forwarded to mx.optimizer.SGD here.
            mx.optimizer.SGD.__init__(self, learning_rate=lr, momentum=momentum,
                                      clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)
            # Use only one aggregated optimizer to fit the Keras optimizer contract.
            self.aggregate_num = 1

        def get_config(self):
            config = {'lr': float(get_value(self.lr)),
                      'momentum': float(get_value(self.momentum)),
                      'decay': float(get_value(self.decay))}
            base_config = super(SGD, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
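
    # Usage sketch (hypothetical; assumes the MXNet backend is active and
    # `model` is an already-built Keras model). These classes are meant as
    # drop-in replacements for their `keras.optimizers` counterparts:
    #
    #   model.compile(loss='categorical_crossentropy',
    #                 optimizer=SGD(lr=0.01, momentum=0.9),
    #                 metrics=['accuracy'])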

    class Adagrad(MXOptimizer, mx.optimizer.AdaGrad):
        """Adagrad optimizer.

        Adagrad is an optimizer with parameter-specific learning rates,
        which are adapted relative to how frequently a parameter gets
        updated during training. The more updates a parameter receives,
        the smaller the updates.

        It is recommended to leave the parameters of this optimizer
        at their default values.

        # Arguments
            lr: float >= 0. Initial learning rate.
            epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Learning rate decay over each update.

        # References
            - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)  # nopep8
        """

        def __init__(self, lr=0.01, epsilon=1e-8, decay=0., clipnorm=None, **kwargs):
            mx.optimizer.AdaGrad.__init__(self, learning_rate=lr, eps=epsilon,
                                          clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)

        def get_config(self):
            # MXNet's AdaGrad stores the epsilon argument as `float_stable_eps`.
            config = {'lr': float(get_value(self.lr)),
                      'decay': float(get_value(self.decay)),
                      'epsilon': float(get_value(self.float_stable_eps))}
            base_config = super(Adagrad, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
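
    # Gradient-clipping sketch (hypothetical values): in the constructors above,
    # `clipnorm` is forwarded to MXNet as `clip_gradient`, e.g.
    #
    #   opt = Adagrad(lr=0.01, clipnorm=1.0)
    #
    # which MXNet applies as an element-wise clip of gradient values to
    # [-1.0, 1.0], rather than the norm-based clipping of stock Keras.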

    class Adadelta(MXOptimizer, mx.optimizer.AdaDelta):
        """Adadelta optimizer.

        Adadelta is a more robust extension of Adagrad
        that adapts learning rates based on a moving window of gradient updates,
        instead of accumulating all past gradients. This way, Adadelta continues
        learning even when many updates have been done. Compared to Adagrad, in the
        original version of Adadelta you don't have to set an initial learning
        rate. In this version, initial learning rate and decay factor can
        be set, as in most other Keras optimizers.

        It is recommended to leave the parameters of this optimizer
        at their default values.

        # Arguments
            lr: float >= 0. Initial learning rate, defaults to 1.
                It is recommended to leave it at the default value.
            rho: float >= 0. Adadelta decay factor, corresponding to fraction of
                gradient to keep at each time step.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Initial learning rate decay.

        # References
            - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
        """

        def __init__(self, lr=1.0, rho=0.95, epsilon=1e-8, decay=0., clipnorm=None, **kwargs):
            mx.optimizer.AdaDelta.__init__(self, rho=rho, epsilon=epsilon,
                                           clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)

        def get_config(self):
            config = {'lr': float(get_value(self.lr)),
                      'rho': float(get_value(self.rho)),
                      'decay': float(get_value(self.decay)),
                      'epsilon': self.epsilon}
            base_config = super(Adadelta, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    class Adam(MXOptimizer, mx.optimizer.Adam):
        """Adam optimizer.

        Default parameters follow those provided in the original paper.

        # Arguments
            lr: float >= 0. Learning rate.
            beta_1: float, 0 < beta < 1. Generally close to 1.
            beta_2: float, 0 < beta < 1. Generally close to 1.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Learning rate decay over each update.
            amsgrad: boolean. Whether to apply the AMSGrad variant of this
                algorithm from the paper "On the Convergence of Adam and
                Beyond".

        # References
            - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
            - [On the Convergence of Adam and Beyond](https://openreview.net/forum?id=ryQu7f-RZ)
        """

        def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
                     epsilon=1e-8, decay=0., clipnorm=None, **kwargs):
            mx.optimizer.Adam.__init__(self, learning_rate=lr, beta1=beta_1, beta2=beta_2,
                                       epsilon=epsilon, clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)

        def get_config(self):
            config = {'lr': float(get_value(self.lr)),
                      'beta_1': float(get_value(self.beta1)),
                      'beta_2': float(get_value(self.beta2)),
                      'decay': float(get_value(self.decay)),
                      'epsilon': self.epsilon}
            base_config = super(Adam, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))
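
    # Serialization sketch: `get_config` above returns plain Python floats, so a
    # hypothetical round trip could look like
    #
    #   opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999)
    #   cfg = opt.get_config()   # e.g. {'lr': 0.001, 'beta_1': 0.9, ...}
    #   opt2 = Adam(**{k: cfg[k] for k in ('lr', 'beta_1', 'beta_2',
    #                                      'epsilon', 'decay') if k in cfg})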

    class Adamax(MXOptimizer, mx.optimizer.Adamax):
        """Adamax optimizer from Section 7 of the Adam paper.

        It is a variant of Adam based on the infinity norm.
        Default parameters follow those provided in the paper.

        # Arguments
            lr: float >= 0. Learning rate.
            beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Learning rate decay over each update.

        # References
            - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
        """

        def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999, decay=0., clipnorm=None,
                     epsilon=1e-8, **kwargs):
            mx.optimizer.Adamax.__init__(self, learning_rate=lr, beta1=beta_1, beta2=beta_2,
                                         clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)
            # `epsilon` is kept on the Keras side only; it is not passed to
            # mx.optimizer.Adamax above.
            self.epsilon = epsilon

        def get_config(self):
            config = {'lr': float(get_value(self.learning_rate)),
                      'beta_1': float(get_value(self.beta1)),
                      'beta_2': float(get_value(self.beta2)),
                      'decay': float(get_value(self.decay)),
                      'epsilon': self.epsilon}
            base_config = super(Adamax, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    class Nadam(MXOptimizer, mx.optimizer.Nadam):
        """Nesterov Adam optimizer.

        Much like Adam is essentially RMSprop with momentum,
        Nadam is Adam with Nesterov momentum.
        Default parameters follow those provided in the paper.
        It is recommended to leave the parameters of this optimizer
        at their default values.

        # Arguments
            lr: float >= 0. Learning rate.
            beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.

        # References
            - [Nadam report](http://cs229.stanford.edu/proj2015/054_report.pdf)
            - [On the importance of initialization and momentum in deep learning](http://www.cs.toronto.edu/~fritz/absps/momentum.pdf)  # nopep8
        """

        def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0., clipnorm=None,
                     schedule_decay=0.004, **kwargs):
            # NOTE: `clipnorm` is accepted for API compatibility but is not
            # forwarded to mx.optimizer.Nadam here.
            mx.optimizer.Nadam.__init__(self, learning_rate=lr, beta1=beta_1, beta2=beta_2, epsilon=epsilon,
                                        schedule_decay=schedule_decay, **kwargs)
            MXOptimizer.__init__(self, lr, decay)

        def get_config(self):
            config = {'lr': float(get_value(self.learning_rate)),
                      'beta_1': float(get_value(self.beta1)),
                      'beta_2': float(get_value(self.beta2)),
                      'epsilon': self.epsilon,
                      'schedule_decay': self.schedule_decay}
            base_config = super(Nadam, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    class RMSprop(MXOptimizer, mx.optimizer.RMSProp):
        """RMSProp optimizer.

        It is recommended to leave the parameters of this optimizer
        at their default values
        (except the learning rate, which can be freely tuned).
        This optimizer is usually a good choice for recurrent
        neural networks.

        # Arguments
            lr: float >= 0. Learning rate.
            rho: float >= 0.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Learning rate decay over each update.

        # References
            - [rmsprop: Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)  # nopep8
        """

        def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0., clipnorm=None, **kwargs):
            # MXNet's RMSProp calls the decay factor `gamma1`; Keras calls it `rho`.
            mx.optimizer.RMSProp.__init__(self, learning_rate=lr, gamma1=rho, epsilon=epsilon,
                                          clip_gradient=clipnorm, **kwargs)
            MXOptimizer.__init__(self, lr, decay)

        def get_config(self):
            config = {'lr': float(get_value(self.lr)),
                      'rho': float(get_value(self.gamma1)),
                      'decay': float(get_value(self.decay)),
                      'epsilon': self.epsilon}
            base_config = super(RMSprop, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    return SGD, Adagrad, Adadelta, Adam, Adamax, RMSprop, Nadam
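

# Consumption sketch (hypothetical; not part of this module's public contract):
# the tuple returned above can be unpacked in the same order it is built, e.g.
#
#   SGD, Adagrad, Adadelta, Adam, Adamax, RMSprop, Nadam = get_optimizers()
#   opt = RMSprop(lr=0.001, rho=0.9)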