# tools/agile-machine-learning-api/codes/trainer/utils/optimizer_utils.py (164 lines of code) (raw):
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module for creating optimizers in tensorflow for model training."""
import tensorflow as tf
class Optimizer(object):
    """Factory for the `tf.train` optimizers used during model training.

    An instance stores the union of the hyper-parameters accepted by the
    supported optimizers. Each builder method forwards the subset its
    optimizer takes, so one configured instance can build any of them.
    """

    def __init__(
            self,
            rho=0.95,
            epsilon=1e-08,
            use_locking=False,
            global_step=None,
            initial_gradient_squared_accumulator_value=0.1,
            l1_regularization_strength=0.0,
            l2_regularization_strength=0.0,
            beta1=0.9,
            beta2=0.999,
            learning_rate_power=-0.5,
            initial_accumulator_value=0.1,
            accum_name=None,
            linear_name=None,
            l2_shrinkage_regularization_strength=0.0,
            momentum=0.0,
            use_nesterov=False,
            decay=0.9,
            centered=False,
            lr_decay=0.9,
            decay_steps=2500):
        """Stores every hyper-parameter any supported optimizer can take.

        Arguments:
            rho : float, passed as `rho` to the Adadelta optimizer.
            epsilon : Small value to avoid zero denominator
                (passed to Adadelta, Adam and RMSProp).
            use_locking : boolean, If True use locks for update operations.
            global_step : Global step tensor used by AdagradDA and by the
                learning rate decay. When None (the default) it is looked
                up with tf.train.get_global_step() at the moment an
                optimizer or decay op is built, not at import time.
            initial_gradient_squared_accumulator_value : float, passed to
                AdagradDA under the same name.
            l1_regularization_strength : A float value, must be greater than
                or equal to zero.
            l2_regularization_strength : A float value, must be greater than
                or equal to zero.
            beta1 : A float value or a constant float tensor. The exponential
                decay rate for the 1st moment estimates (Adam).
            beta2 : A float value or a constant float tensor. The exponential
                decay rate for the 2nd moment estimates (Adam).
            learning_rate_power : A float value, must be less or equal to
                zero (Ftrl).
            initial_accumulator_value : The starting value for accumulators.
                Only zero or positive values are allowed
                (Adagrad, Ftrl and ProximalAdagrad).
            accum_name : The suffix for the variable that keeps the gradient
                squared accumulator. If not present, defaults to name (Ftrl).
            linear_name : The suffix for the variable that keeps the linear
                gradient accumulator. If not present, defaults to
                name + "_1" (Ftrl).
            l2_shrinkage_regularization_strength : A float value, must be
                greater than or equal to zero. This differs from L2 above in
                that the L2 above is a stabilization penalty, whereas this
                L2 shrinkage is a magnitude penalty (Ftrl).
            momentum : A scalar tensor or float (Momentum and RMSProp).
            use_nesterov : If True use Nesterov Momentum. See Sutskever et
                al., 2013. This implementation always computes gradients at
                the value of the variable(s) passed to the optimizer. Using
                Nesterov Momentum makes the variable(s) track the values
                called theta_t + mu*v_t in the paper.
            decay : Discounting factor for the history/coming gradient
                (RMSProp).
            centered : boolean, If True, gradients are normalized by the
                estimated variance of the gradient; if False, by the
                uncentered second moment. Setting this to True may help with
                training, but is slightly more expensive in terms of
                computation and memory. Defaults to False (RMSProp).
            lr_decay : float, passed as `decay_rate` to
                tf.train.exponential_decay.
            decay_steps : A scalar int32 or int64 Tensor or a Python number.
                Must be positive. Passed as `decay_steps` to
                tf.train.exponential_decay.
        """
        self.rho = rho
        self.epsilon = epsilon
        self.use_locking = use_locking
        self.global_step = global_step
        self.initial_gradient_squared_accumulator_value = (
            initial_gradient_squared_accumulator_value)
        self.initial_accumulator_value = initial_accumulator_value
        self.l1_regularization_strength = l1_regularization_strength
        self.l2_regularization_strength = l2_regularization_strength
        self.beta1 = beta1
        self.beta2 = beta2
        self.learning_rate_power = learning_rate_power
        self.l2_shrinkage_regularization_strength = (
            l2_shrinkage_regularization_strength)
        self.momentum = momentum
        self.use_nesterov = use_nesterov
        self.accum_name = accum_name
        self.linear_name = linear_name
        self.decay = decay
        self.centered = centered
        self.lr_decay = lr_decay
        self.decay_steps = decay_steps

    def _resolve_global_step(self):
        """Returns the configured global step, else the current graph's one.

        The lookup is deferred to call time so that it sees the graph that
        is active when the optimizer is built, rather than the graph that
        happened to be the default when this module was imported.
        """
        # `is not None` rather than truthiness: the value may be a tensor.
        if self.global_step is not None:
            return self.global_step
        return tf.train.get_global_step()

    def decay_lr(self, learning_rate):
        """Creates an instance of exponential decay fn.

        Arguments:
            learning_rate : float, Initial learning rate to be decayed
        Returns:
            The result of tf.train.exponential_decay built from `lr_decay`,
            `decay_steps` and the resolved global step.
        """
        return tf.train.exponential_decay(
            learning_rate=learning_rate,
            decay_rate=self.lr_decay,
            decay_steps=self.decay_steps,
            global_step=self._resolve_global_step()
        )

    def adadelta(self, learning_rate):
        """Sets up an instance of ada delta optimizer.

        Infers most parameters from init.
        Arguments:
            learning_rate : float, Learning rate for the optimizer
        Returns:
            tf.train.AdadeltaOptimizer instance
        """
        return tf.train.AdadeltaOptimizer(
            learning_rate=learning_rate,
            rho=self.rho,
            epsilon=self.epsilon,
            use_locking=self.use_locking,
            name='Adadelta'
        )

    def adagrad(self, learning_rate):
        """Sets up an instance of adagrad optimizer.

        Arguments:
            learning_rate : float, Learning rate for the optimizer
        Returns:
            tf.train.AdagradOptimizer instance
        """
        return tf.train.AdagradOptimizer(
            learning_rate=learning_rate,
            initial_accumulator_value=self.initial_accumulator_value,
            use_locking=self.use_locking,
            name='Adagrad'
        )

    def adagradDA(self, learning_rate):
        """Sets up an instance of adagradDA optimizer.

        Arguments:
            learning_rate : float, Learning rate for the optimizer
        Returns:
            tf.train.AdagradDAOptimizer instance
        """
        return tf.train.AdagradDAOptimizer(
            learning_rate=learning_rate,
            global_step=self._resolve_global_step(),
            initial_gradient_squared_accumulator_value=(
                self.initial_gradient_squared_accumulator_value),
            l1_regularization_strength=self.l1_regularization_strength,
            l2_regularization_strength=self.l2_regularization_strength,
            use_locking=self.use_locking,
            name='AdagradDA'
        )

    def adam(self, learning_rate):
        """Sets up an instance of adam optimizer.

        Arguments:
            learning_rate : float, Learning rate for the optimizer
        Returns:
            tf.train.AdamOptimizer instance
        """
        return tf.train.AdamOptimizer(
            learning_rate=learning_rate,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon,
            use_locking=self.use_locking,
            name='Adam'
        )

    def ftrl(self, learning_rate):
        """Sets up an instance of ftrl optimizer.

        Arguments:
            learning_rate : float, Learning rate for the optimizer
        Returns:
            tf.train.FtrlOptimizer instance
        """
        return tf.train.FtrlOptimizer(
            learning_rate=learning_rate,
            learning_rate_power=self.learning_rate_power,
            initial_accumulator_value=self.initial_accumulator_value,
            l1_regularization_strength=self.l1_regularization_strength,
            l2_regularization_strength=self.l2_regularization_strength,
            use_locking=self.use_locking,
            name='Ftrl',
            accum_name=self.accum_name,
            linear_name=self.linear_name,
            l2_shrinkage_regularization_strength=(
                self.l2_shrinkage_regularization_strength)
        )

    @classmethod
    def gradient_descent(cls, learning_rate):
        """Sets up an instance of simple gradient descent optimizer.

        Arguments:
            learning_rate : float, Learning rate for the optimizer
        Returns:
            tf.train.GradientDescentOptimizer instance
        """
        # A classmethod has no instance state, so `use_locking` stays fixed
        # at False here instead of following the instance setting.
        return tf.train.GradientDescentOptimizer(
            learning_rate=learning_rate,
            use_locking=False,
            name='GradientDescent'
        )

    def momentum_optimizer(self, learning_rate):
        """Sets up an instance of momentum optimizer.

        Arguments:
            learning_rate : float, Learning rate for the optimizer
        Returns:
            tf.train.MomentumOptimizer instance
        """
        return tf.train.MomentumOptimizer(
            learning_rate=learning_rate,
            momentum=self.momentum,
            use_locking=self.use_locking,
            use_nesterov=self.use_nesterov,
            name='MomentumOptimizer'
        )

    def proximal_adagrad(self, learning_rate):
        """Sets up an instance of proximal adagrad optimizer.

        Arguments:
            learning_rate : float, Learning rate for the optimizer
        Returns:
            tf.train.ProximalAdagradOptimizer instance
        """
        return tf.train.ProximalAdagradOptimizer(
            learning_rate=learning_rate,
            initial_accumulator_value=self.initial_accumulator_value,
            l1_regularization_strength=self.l1_regularization_strength,
            l2_regularization_strength=self.l2_regularization_strength,
            use_locking=self.use_locking,
            name='ProximalAdagrad'
        )

    def proximal_gradient_desc(self, learning_rate):
        """Sets up an instance of proximal gradient descent optimizer.

        Arguments:
            learning_rate : float, Learning rate for the optimizer
        Returns:
            tf.train.ProximalGradientDescentOptimizer instance
        """
        return tf.train.ProximalGradientDescentOptimizer(
            learning_rate=learning_rate,
            l1_regularization_strength=self.l1_regularization_strength,
            l2_regularization_strength=self.l2_regularization_strength,
            # Follow the instance setting like the other instance-level
            # builders do; the default is still False.
            use_locking=self.use_locking,
            name='ProximalGradientDesc'
        )

    def rmsprop(self, learning_rate):
        """Sets up an instance of rms prop optimizer.

        Arguments:
            learning_rate : float, Learning rate for the optimizer
        Returns:
            tf.train.RMSPropOptimizer instance
        """
        return tf.train.RMSPropOptimizer(
            learning_rate=learning_rate,
            decay=self.decay,
            momentum=self.momentum,
            epsilon=self.epsilon,
            use_locking=self.use_locking,
            centered=self.centered,
            name='RmsProp'
        )

    def set_opt_wrap(self, name, learning_rate, decay):
        """Public wrapper around the _set_opt function.

        Arguments:
            name : str, Name of the optimizer to be used
            learning_rate : float, Learning rate for the optimizer
            decay : Boolean, whether or not to use learning rate decay
        Returns:
            The optimizer instance built by _set_opt
        Raises:
            KeyError : if `name` is not a supported optimizer name
        """
        return self._set_opt(name, learning_rate, decay)

    def _set_opt(self, name, learning_rate, decay):
        """Choose the optimizer and infer parameters from init.

        Arguments:
            name : str, Name of the optimizer to be used. One of
                'adadelta', 'adagradDA', 'adagrad', 'adam',
                'gradientdescent', 'momentumoptimizer', 'proximaladagrad',
                'proximalgradientdesc', 'rmsprop', 'ftrl' (case sensitive).
            learning_rate : float, Learning rate for the optimizer
            decay : Boolean, whether or not to use learning rate decay
        Returns:
            Instance of an optimizer with parameters parsed
        Raises:
            KeyError : if `name` is not a supported optimizer name
        """
        optimizer_mapping = {
            'adadelta': self.adadelta,
            'adagradDA': self.adagradDA,
            'adagrad': self.adagrad,
            'adam': self.adam,
            'gradientdescent': self.gradient_descent,
            'momentumoptimizer': self.momentum_optimizer,
            'proximaladagrad': self.proximal_adagrad,
            'proximalgradientdesc': self.proximal_gradient_desc,
            'rmsprop': self.rmsprop,
            'ftrl': self.ftrl
        }
        # Validate the name before touching the graph so that a typo does
        # not leave a stray decay op behind. The exception type stays
        # KeyError, which is what a plain dict lookup raised before.
        try:
            build_optimizer = optimizer_mapping[name]
        except KeyError:
            raise KeyError(
                'Unknown optimizer {!r}; expected one of: {}'.format(
                    name, ', '.join(sorted(optimizer_mapping))))
        if decay:
            learning_rate = self.decay_lr(learning_rate)
        return build_optimizer(learning_rate)