package AI::MXNet::Optimizer;
use strict;
use warnings;
use AI::MXNet::Base;
use AI::MXNet::NDArray;
use AI::MXNet::Random;
use List::Util qw(max);
=head1 NAME
AI::MXNet::Optimizer - Common optimization algorithms with regularization.
=head1 DESCRIPTION
Common optimization algorithms with regularization.
=cut
use Mouse;
use AI::MXNet::Function::Parameters;
my %opt_registry;
method get_opt_registry()
{
return \%opt_registry;
}
method register()
{
my $name = $self;
($name) = $name =~ /::(\w+)$/;
{ no strict 'refs'; *{__PACKAGE__."::$name"} = sub { $self }; }
$name = lc $name;
if(exists $opt_registry{ $name })
{
my $existing = $opt_registry{ $name };
warn(
"WARNING: New optimizer $self.$name"
."is overriding existing optimizer $existing.$name"
);
}
$opt_registry{ $name } = $self;
}
=head2 create_optimizer
Create an optimizer with specified name.
Parameters
----------
name: str
Name of required optimizer. Should be the name
of a subclass of Optimizer. Case insensitive.
rescale_grad : float
Rescaling factor on gradient. Normally should be 1/batch_size.
kwargs: hash
Additional parameters passed to the optimizer's constructor
Returns
-------
opt : Optimizer
The result optimizer.
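A minimal usage sketch; the optimizer name and parameter values below are illustrative only:

    my $opt = AI::MXNet::Optimizer->create_optimizer(
        'sgd',
        learning_rate => 0.01,
        momentum      => 0.9,
        rescale_grad  => 1/128,
    );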
=cut
method create_optimizer(Str $name, %kwargs)
{
if(exists $opt_registry{ lc $name })
{
my $rescale_grad = delete($kwargs{rescale_grad})//1;
return $opt_registry{ lc $name }->new(
rescale_grad => $rescale_grad,
%kwargs
);
}
confess("Cannot find optimizer $name");
}
*create = \&create_optimizer;
has 'rescale_grad' => (is => "rw", isa => "Num", default=>1);
has 'lr' => (is => "rw", isa => "Num");
has 'learning_rate' => (is => "rw", isa => "Num", default => 0.01);
has 'lr_scheduler' => (is => "rw", isa => "Maybe[AI::MXNet::LRScheduler]");
has 'wd' => (is => "rw", isa => "Num", default => 0);
has 'lr_mult' => (is => "rw", isa => "HashRef", default => sub { +{} });
has 'wd_mult' => (is => "rw", isa => "HashRef", default => sub { +{} });
has 'num_update' => (is => "rw", isa => "Int");
has 'begin_num_update' => (is => "rw", isa => "Int", default => 0);
has '_index_update_count' => (is => "rw", isa => "HashRef", default => sub { +{} });
has 'clip_gradient' => (is => "rw", isa => "Maybe[Num]");
has 'param_idx2name' => (is => "rw", isa => "HashRef[Str]", default => sub { +{} });
has 'idx2name' => (is => "rw", isa => "HashRef[Str]");
has 'sym' => (is => "rw", isa => "Maybe[AI::MXNet::Symbol]");
sub BUILD
{
my $self = shift;
if($self->lr_scheduler)
{
$self->lr_scheduler->base_lr($self->learning_rate);
}
$self->lr($self->learning_rate);
$self->num_update($self->begin_num_update);
$self->idx2name({ %{ $self->param_idx2name } });
$self->set_lr_mult({});
$self->set_wd_mult({});
}
# Create additional optimizer state such as momentum.
# override in implementations.
method create_state($index, $weight){}
# Update the parameters. override in implementations
method update($index, $weight, $grad, $state){}
# set lr scale is deprecated. Use set_lr_mult instead.
method set_lr_scale($args_lrscale)
{
Carp::cluck("set lr scale is deprecated. Use set_lr_mult instead.");
}
=head2 set_lr_mult
Set individual learning rate multiplier for parameters
Parameters
----------
args_lr_mult : hash ref of string/int to float
set the lr multiplier for a name/index to a float.
setting the multiplier by index is supported for backward compatibility,
but we recommend using name and symbol.
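For example (the parameter names below are illustrative):

    $optimizer->set_lr_mult({ fc1_weight => 0.1, fc1_bias => 2 });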
=cut
method set_lr_mult(HashRef[Num] $args_lr_mult)
{
$self->lr_mult({});
if($self->sym)
{
my $attr = $self->sym->attr_dict();
for my $name (@{ $self->sym->list_arguments() })
{
if(exists $attr->{ $name } and exists $attr->{ $name }{ __lr_mult__ })
{
$self->lr_mult->{ $name } = $attr->{ $name }{ __lr_mult__ };
}
}
}
$self->lr_mult({ %{ $self->lr_mult }, %{ $args_lr_mult } });
}
=head2 set_wd_mult
Set individual weight decay multiplier for parameters.
By default the wd multiplier is 0 for all parameters whose names don't
end with _weight or _gamma, if param_idx2name is provided.
Parameters
----------
args_wd_mult : hash ref of string/int to float
set the wd multiplier for a name/index to a float.
setting the multiplier by index is supported for backward compatibility,
but we recommend using name and symbol.
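For example (the parameter name below is illustrative):

    $optimizer->set_wd_mult({ embed_weight => 0 });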
=cut
method set_wd_mult(HashRef[Num] $args_wd_mult)
{
$self->wd_mult({});
for my $n (values %{ $self->idx2name })
{
if(not $n =~ /(?:_weight|_gamma)$/)
{
$self->wd_mult->{ $n } = 0;
}
}
if($self->sym)
{
my $attr = $self->sym->attr_dict();
for my $name (@{ $self->sym->list_arguments() })
{
if(exists $attr->{ $name } and exists $attr->{ $name }{ __wd_mult__ })
{
$self->wd_mult->{ $name } = $attr->{ $name }{ __wd_mult__ };
}
}
}
$self->wd_mult({ %{ $self->wd_mult }, %{ $args_wd_mult } });
}
method _update_count(Index $index)
{
if(not exists $self->_index_update_count->{ $index })
{
$self->_index_update_count->{ $index } = $self->begin_num_update;
}
$self->_index_update_count->{ $index } += 1;
$self->num_update(max($self->_index_update_count->{ $index }, $self->num_update));
}
method _get_lr(Index $index)
{
my $lr;
if($self->lr_scheduler)
{
$lr = &{$self->lr_scheduler}($self->num_update);
}
else
{
$lr = $self->lr;
}
if(exists $self->lr_mult->{ $index })
{
$lr *= $self->lr_mult->{ $index };
}
elsif(exists $self->idx2name->{ $index })
{
$lr *= $self->lr_mult->{ $self->idx2name->{ $index } }//1;
}
return $lr;
}
method _get_wd(Index $index)
{
my $wd = $self->wd;
if(exists $self->wd_mult->{ $index })
{
$wd *= $self->wd_mult->{ $index };
}
elsif(exists $self->idx2name->{ $index })
{
$wd *= $self->wd_mult->{ $self->idx2name->{ $index } }//1;
}
return $wd;
}
=head1 NAME
AI::MXNet::SGD - A very simple SGD optimizer with momentum and weight regularization.
=cut
=head1 DESCRIPTION
A very simple SGD optimizer with momentum and weight regularization.
Parameters
----------
learning_rate : float, optional
learning_rate of SGD
momentum : float, optional
momentum value
wd : float, optional
L2 regularization coefficient added to all the weights
rescale_grad : float, optional
rescaling factor of gradient. Normally should be 1/batch_size.
clip_gradient : float, optional
clip gradient in range [-clip_gradient, clip_gradient]
param_idx2name : hash ref of int to string, optional
enables special weight decay treatment for parameters whose names end with bias, gamma, or beta
multi_precision: bool, optional
Flag to control the internal precision of the optimizer.
False results in using the same precision as the weights (default),
True makes internal 32-bit copy of the weights and applies gradients
in 32-bit precision even if actual weights used in the model have lower precision.
Turning this on can improve convergence and accuracy when training with float16.
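A construction sketch; the values below are illustrative only:

    my $sgd = AI::MXNet::SGD->new(
        learning_rate   => 0.1,
        momentum        => 0.9,
        wd              => 1e-4,
        rescale_grad    => 1/128,
        multi_precision => 1,
    );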
=cut
package AI::MXNet::SGD;
use Mouse;
extends 'AI::MXNet::Optimizer';
has 'kwargs' => (is => "rw", isa => "HashRef[Num]");
has 'momentum' => (is => "rw", isa => "Num", default => 0);
has 'multi_precision' => (is => "ro", isa => "Bool", default => 0);
sub BUILD
{
my $self = shift;
$self->kwargs({ rescale_grad => $self->rescale_grad });
if($self->momentum)
{
$self->kwargs->{momentum} = $self->momentum;
}
if($self->clip_gradient)
{
$self->kwargs->{clip_gradient} = $self->clip_gradient;
}
}
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
my $momentum;
my $weight_master_copy;
if($self->multi_precision and $weight->dtype eq 'float16')
{
$weight_master_copy = AI::MXNet::NDArray->array($weight, ctx => $weight->context, dtype => 'float32');
if($self->momentum != 0)
{
$momentum = AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype => 'float32');
}
return [$momentum, $weight_master_copy];
}
if($weight->dtype eq 'float16' and not $self->multi_precision)
{
AI::MXNet::Logging->warning(
"Accumulating with float16 in optimizer can lead to ".
"poor accuracy or slow convergence. ".
"Consider using multi_precision=True option of the ".
"SGD optimizer"
);
}
if($self->momentum != 0)
{
$momentum = AI::MXNet::NDArray->zeros($weight->shape, ctx => $weight->context, dtype => $weight->dtype);
}
return $momentum;
}
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
Maybe[AI::MXNet::NDArray|ArrayRef[Maybe[AI::MXNet::NDArray]]] $state
)
{
my $lr = $self->_get_lr($index);
my $wd = $self->_get_wd($index);
$self->_update_count($index);
my $kwargs = {
out => $weight,
lr => $lr,
wd => $wd,
%{ $self->kwargs }
};
my $use_multi_precision = ref($state) eq 'ARRAY';
if(not $use_multi_precision)
{
if(defined $state)
{
AI::MXNet::NDArray->sgd_mom_update(
$weight, $grad, $state, $kwargs
);
}
else
{
AI::MXNet::NDArray->sgd_update(
$weight, $grad, $kwargs
);
}
}
else
{
if(defined $state->[0])
{
AI::MXNet::NDArray->mp_sgd_mom_update(
$weight, $grad, $state->[0], $state->[1], $kwargs
);
}
else
{
AI::MXNet::NDArray->mp_sgd_update(
$weight, $grad, $state->[1], $kwargs
);
}
}
}
__PACKAGE__->register;
package AI::MXNet::DCASGD;
use Mouse;
use AI::MXNet::Base;
extends 'AI::MXNet::Optimizer';
=head1 NAME
AI::MXNet::DCASGD - DCASGD optimizer with momentum and weight regularization.
=cut
=head1 DESCRIPTION
DCASGD optimizer with momentum and weight regularization.
Implements paper "Asynchronous Stochastic Gradient Descent with
Delay Compensation for Distributed Deep Learning"
Parameters
----------
learning_rate : float, optional
learning_rate of SGD
momentum : float, optional
momentum value
lamda : float, optional
scaling factor for the delay compensation term
wd : float, optional
L2 regularization coefficient added to all the weights
rescale_grad : float, optional
rescaling factor of gradient. Normally should be 1/batch_size.
clip_gradient : float, optional
clip gradient in range [-clip_gradient, clip_gradient]
param_idx2name : hash ref of int to string, optional
enables special weight decay treatment for parameters whose names end with bias, gamma, or beta
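A sketch of the update rule, as implemented in the update method of this class:

    mom    = momentum * mom - lr * (g + wd * w + lamda * g * g * (w - w_prev))
    w_prev = w
    w      = w + mom

where g is the rescaled (and optionally clipped) gradient and w_prev is the stored
copy of the previous weight.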
=cut
has 'momentum' => (is => 'ro', isa => 'Num', default => 0);
has 'lamda' => (is => 'ro', isa => 'Num', default => 0.04);
has 'weight_previous' => (is => 'rw', init_arg => undef);
sub BUILD
{
my $self = shift;
$self->weight_previous({});
}
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
return [
$self->momentum ? AI::MXNet::NDArray->zeros(
$weight->shape, ctx => $weight->context, dtype => $weight->dtype
) : undef,
$weight->copy
];
}
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
ArrayRef[Maybe[AI::MXNet::NDArray]] $state
)
{
my $lr = $self->_get_lr($index);
my $wd = $self->_get_wd($index);
$self->_update_count($index);
$grad *= $self->rescale_grad;
if($self->clip_gradient)
{
$grad = AI::MXNet::NDArray->clip(
$grad,
-$self->clip_gradient,
$self->clip_gradient
);
}
my ($mom, $weight_previous) = @{ $state };
if(defined $mom)
{
$mom *= $self->momentum;
$mom += -$lr * (
$grad + $wd * $weight
+
$self->lamda * $grad * $grad * ($weight - $weight_previous)
);
}
else
{
assert($self->momentum == 0);
$mom = -$lr * (
$grad + $wd * $weight
+
$self->lamda * $grad * $grad * ($weight - $weight_previous)
);
}
$weight_previous .= $weight;
$weight += $mom;
}
__PACKAGE__->register;
=head1 NAME
AI::MXNet::NAG - SGD with Nesterov weight handling.
=cut
=head1 DESCRIPTION
It is implemented according to
https://github.com/torch/optim/blob/master/sgd.lua
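A sketch of the update rule, as implemented in the update method below:

    g'  = rescale_grad * grad + wd * w
    mom = momentum * mom + g'
    w   = w - lr * (g' + momentum * mom)

where the rescaled gradient is optionally clipped to [-clip_gradient, clip_gradient]
before the weight decay term is added.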
=cut
package AI::MXNet::NAG;
use Mouse;
extends 'AI::MXNet::SGD';
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
AI::MXNet::NDArray|Undef $state
)
{
my $lr = $self->_get_lr($index);
my $wd = $self->_get_wd($index);
$self->_update_count($index);
$grad = $grad * $self->rescale_grad;
if($self->clip_gradient)
{
$grad = AI::MXNet::NDArray->clip(
$grad,
-$self->clip_gradient,
$self->clip_gradient
);
}
if($state)
{
my $mom = $state;
$mom *= $self->momentum;
$grad += $wd * $weight;
$mom += $grad;
$grad += $self->momentum * $mom;
$weight += -$lr * $grad;
}
else
{
confess("momentum != 0") unless $self->momentum == 0;
$weight += -$lr * ($grad + $wd * $weight);
}
}
__PACKAGE__->register;
=head1 NAME
AI::MXNet::SLGD - Stochastic Langevin Dynamics Updater to sample from a distribution.
=cut
=head1 DESCRIPTION
Stochastic Langevin Dynamics Updater to sample from a distribution.
Parameters
----------
learning_rate : float, optional
learning_rate of SGD
wd : float, optional
L2 regularization coefficient added to all the weights
rescale_grad : float, optional
rescaling factor of gradient. Normally should be 1/batch_size.
clip_gradient : float, optional
clip gradient in range [-clip_gradient, clip_gradient]
param_idx2name : hash ref of int to string, optional
enables special weight decay treatment for parameters whose names end with bias, gamma, or beta
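A sketch of the update rule, as implemented in the update method below:

    w = w - (lr / 2) * (g + wd * w) + noise,   noise ~ Normal(0, sqrt(lr))

where g is the rescaled (and optionally clipped) gradient and the noise is drawn
per element with standard deviation sqrt(lr).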
=cut
package AI::MXNet::SLGD;
use Mouse;
extends 'AI::MXNet::Optimizer';
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
return undef;
}
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
AI::MXNet::NDArray|Undef $state
)
{
my $lr = $self->_get_lr($index);
my $wd = $self->_get_wd($index);
$self->_update_count($index);
$grad *= $self->rescale_grad;
if($self->clip_gradient)
{
$grad = AI::MXNet::NDArray->clip(
$grad,
-$self->clip_gradient,
$self->clip_gradient
);
}
$weight += - $lr/2 * ($grad + $wd * $weight)
+
AI::MXNet::Random->normal(
0, sqrt($lr),
$weight->shape,
$weight->context
);
}
__PACKAGE__->register;
=head1 NAME
AI::MXNet::Adam - Adam optimizer as described in [King2014].
=cut
=head1 DESCRIPTION
Adam optimizer as described in [King2014].
[King2014] Diederik Kingma, Jimmy Ba,
*Adam: A Method for Stochastic Optimization*,
http://arxiv.org/abs/1412.6980
The code in this class was adapted from
https://github.com/mila-udem/blocks/blob/master/blocks/algorithms/__init__.py#L765
Parameters
----------
learning_rate : float, optional
Step size.
Default value is set to 0.001.
beta1 : float, optional
Exponential decay rate for the first moment estimates.
Default value is set to 0.9.
beta2 : float, optional
Exponential decay rate for the second moment estimates.
Default value is set to 0.999.
epsilon : float, optional
Default value is set to 1e-8.
decay_factor : float, optional
Default value is set to 1 - 1e-8.
wd : float, optional
L2 regularization coefficient added to all the weights
rescale_grad : float, optional
rescaling factor of gradient. Normally should be 1/batch_size.
clip_gradient : float, optional
clip gradient in range [-clip_gradient, clip_gradient]
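The update below delegates to NDArray's adam_update kernel, using a bias-corrected
step size:

    lr_t = learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)

where t is the per-parameter update count.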
=cut
package AI::MXNet::Adam;
use Mouse;
extends 'AI::MXNet::Optimizer';
has 'kwargs' => (is => "rw", isa => "HashRef[Num]");
has '+learning_rate' => (default => 0.001);
has 'beta1' => (is => "rw", isa => "Num", default => 0.9);
has 'beta2' => (is => "rw", isa => "Num", default => 0.999);
has 'epsilon' => (is => "rw", isa => "Num", default => 1e-8);
has 'decay_factor' => (is => "rw", isa => "Num", default => (1 - 1e-8));
sub BUILD
{
my $self = shift;
$self->kwargs({
rescale_grad => $self->rescale_grad,
beta1 => $self->beta1,
beta2 => $self->beta2,
epsilon => $self->epsilon
});
if($self->clip_gradient)
{
$self->kwargs->{clip_gradient} = $self->clip_gradient;
}
}
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
return [AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context,
dtype => $weight->dtype
), # mean
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context,
dtype => $weight->dtype
) # variance
];
}
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
ArrayRef[AI::MXNet::NDArray] $state
)
{
my $lr = $self->_get_lr($index);
my $wd = $self->_get_wd($index);
$self->_update_count($index);
my $t = $self->_index_update_count->{$index};
my $coef1 = 1 - $self->beta1**$t;
my $coef2 = 1 - $self->beta2**$t;
$lr *= sqrt($coef2)/$coef1;
my ($mean, $var) = @{ $state };
AI::MXNet::NDArray->adam_update(
$weight, $grad, $mean, $var,
{
out => $weight,
lr => $lr,
wd => $wd,
%{ $self->kwargs }
}
);
}
__PACKAGE__->register;
=head1 NAME
AI::MXNet::AdaGrad - AdaGrad optimizer of Duchi et al., 2011
=cut
=head1 DESCRIPTION
AdaGrad optimizer of Duchi et al., 2011.
This code follows the version in http://arxiv.org/pdf/1212.5701v1.pdf Eq(5)
by Matthew D. Zeiler, 2012. AdaGrad can help the network converge faster
in some cases.
Parameters
----------
learning_rate : float, optional
Step size.
Default value is set to 0.05.
wd : float, optional
L2 regularization coefficient added to all the weights
rescale_grad : float, optional
rescaling factor of gradient. Normally should be 1/batch_size.
eps: float, optional
A small float value to keep the update numerically stable.
Default value is set to 1e-7.
clip_gradient : float, optional
clip gradient in range [-clip_gradient, clip_gradient]
=cut
package AI::MXNet::AdaGrad;
use Mouse;
extends 'AI::MXNet::Optimizer';
has 'float_stable_eps' => (is => "rw", isa => "Num", default => 1e-7);
has '+learning_rate' => (default => 0.05);
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
return AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context
); # history
}
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
AI::MXNet::NDArray $state
)
{
my $lr = $self->_get_lr($index);
my $wd = $self->_get_wd($index);
$self->_update_count($index);
$grad *= $self->rescale_grad;
if($self->clip_gradient)
{
$grad = AI::MXNet::NDArray->clip(
$grad,
-$self->clip_gradient,
$self->clip_gradient
);
}
my $history = $state;
$history += ($grad * $grad);
$weight += -$lr
*
(
$grad
/
AI::MXNet::NDArray->sqrt(
$history
+
$self->float_stable_eps
)
+
$wd * $weight
);
}
__PACKAGE__->register;
=head1 NAME
AI::MXNet::RMSProp - RMSProp optimizer of Tieleman & Hinton, 2012.
=cut
=head1 DESCRIPTION
RMSProp optimizer of Tieleman & Hinton, 2012.
For centered=False, the code follows the version in
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by
Tieleman & Hinton, 2012
For centered=True, the code follows the version in
http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.
Parameters
----------
learning_rate : float, optional
Step size.
Default value is set to 0.001.
gamma1: float, optional
decay factor of moving average for gradient^2.
Default value is set to 0.9.
gamma2: float, optional
"momentum" factor.
Default value is set to 0.9.
Only used if centered=True.
epsilon : float, optional
Default value is set to 1e-8.
centered : bool, optional
Whether to use Graves' or Tieleman & Hinton's version of RMSProp
wd : float, optional
L2 regularization coefficient added to all the weights
rescale_grad : float, optional
rescaling factor of gradient.
clip_gradient : float, optional
clip gradient in range [-clip_gradient, clip_gradient]
clip_weights : float, optional
clip weights in range [-clip_weights, clip_weights]
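A construction sketch for the centered (Graves) variant; the values below are
illustrative only:

    my $rmsprop = AI::MXNet::RMSProp->new(
        learning_rate => 0.001,
        gamma1        => 0.9,
        gamma2        => 0.9,
        centered      => 1,
        clip_weights  => 10,
    );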
=cut
package AI::MXNet::RMSProp;
use Mouse;
extends 'AI::MXNet::Optimizer';
has '+learning_rate' => (default => 0.001);
has 'gamma1' => (is => "ro", isa => "Num", default => 0.9);
has 'gamma2' => (is => "ro", isa => "Num", default => 0.9);
has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8);
has 'centered' => (is => "ro", isa => "Bool", default => 0);
has 'clip_weights' => (is => "ro", isa => "Num");
has 'kwargs' => (is => "rw", init_arg => undef);
sub BUILD
{
my $self = shift;
$self->kwargs({
rescale_grad => $self->rescale_grad,
gamma1 => $self->gamma1,
epsilon => $self->epsilon
});
if($self->centered)
{
$self->kwargs->{gamma2} = $self->gamma2;
}
if($self->clip_gradient)
{
$self->kwargs->{clip_gradient} = $self->clip_gradient;
}
if($self->clip_weights)
{
$self->kwargs->{clip_weights} = $self->clip_weights;
}
}
# For centered=False: n
# For centered=True: n, g, delta
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
return [
$self->centered
? (
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context
), # n
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context
), # g
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context
)
) # delta
: (
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context
), # n
)
];
}
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
ArrayRef[AI::MXNet::NDArray] $state
)
{
my $lr = $self->_get_lr($index);
my $wd = $self->_get_wd($index);
$self->_update_count($index);
my ($n, $g, $delta) = @{ $state };
if($self->centered)
{
AI::MXNet::NDArray->rmspropalex_update(
$weight, $grad, $n, $g, $delta,
{
out => $weight,
lr => $lr,
wd => $wd,
%{ $self->kwargs }
}
);
}
else
{
AI::MXNet::NDArray->rmsprop_update(
$weight, $grad, $n,
{
out => $weight,
lr => $lr,
wd => $wd,
%{ $self->kwargs }
}
);
}
}
__PACKAGE__->register;
=head1 NAME
AI::MXNet::AdaDelta - AdaDelta optimizer.
=cut
=head1 DESCRIPTION
AdaDelta optimizer as described in
Zeiler, M. D. (2012).
*ADADELTA: An adaptive learning rate method.*
http://arxiv.org/abs/1212.5701
Parameters
----------
rho: float
Decay rate for both squared gradients and delta x
epsilon : float
The constant as described in the paper
wd : float
L2 regularization coefficient added to all the weights
rescale_grad : float, optional
rescaling factor of gradient. Normally should be 1/batch_size.
clip_gradient : float, optional
clip gradient in range [-clip_gradient, clip_gradient]
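A sketch of the update rule, as implemented in the update method below:

    acc_g     = rho * acc_g + (1 - rho) * g^2
    delta     = sqrt(acc_delta + epsilon) / sqrt(acc_g + epsilon) * g
    acc_delta = rho * acc_delta + (1 - rho) * delta^2
    w         = w - delta - wd * w

where g is the rescaled (and optionally clipped) gradient.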
=cut
package AI::MXNet::AdaDelta;
use Mouse;
extends 'AI::MXNet::Optimizer';
has 'rho' => (is => "rw", isa => "Num", default => 0.9);
has 'epsilon' => (is => "rw", isa => "Num", default => 1e-5);
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
return [
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context
), # accumulated g
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context
) # accumulated delta
];
}
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
ArrayRef[AI::MXNet::NDArray] $state
)
{
my $wd = $self->_get_wd($index);
$self->_update_count($index);
$grad *= $self->rescale_grad;
if($self->clip_gradient)
{
$grad = AI::MXNet::NDArray->clip(
$grad,
-$self->clip_gradient,
$self->clip_gradient
);
}
my ($acc_g, $acc_delta) = @{ $state };
$acc_g .= $self->rho * $acc_g + (1 - $self->rho) * $grad * $grad;
my $current_delta = ($acc_delta + $self->epsilon)->sqrt
/
($acc_g + $self->epsilon)->sqrt
*
$grad;
$acc_delta .= $self->rho * $acc_delta + (1 - $self->rho) * $current_delta * $current_delta;
$weight -= $current_delta + $wd * $weight;
}
__PACKAGE__->register;
# For test use
package AI::MXNet::Test;
use Mouse;
extends 'AI::MXNet::Optimizer';
# Create a state to duplicate weight
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
return AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context
);
}
# performs w += rescale_grad * grad
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
AI::MXNet::NDArray $state
)
{
$weight += $grad * $self->rescale_grad;
$state .= $weight;
}
__PACKAGE__->register;
package AI::MXNet::Ftrl;
=head1 NAME
AI::MXNet::Ftrl
=cut
=head1 DESCRIPTION
Reference: Ad Click Prediction: a View from the Trenches
Parameters
----------
lamda1 : float, optional
L1 regularization coefficient.
learning_rate : float, optional
The initial learning rate.
beta : float, optional
Per-coordinate learning rate correlation parameter.
eta_{t,i} = learning_rate / (beta + sqrt(sum_{s=1}^{t} g_{s,i}^2))
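A sketch of the update rule, as implemented in the update method below
(z and n are the two state arrays, z being the dn accumulator; the final
assignment is applied element-wise):

    z = z + g - (sqrt(n + g^2) - sqrt(n)) * w / lr
    n = n + g^2
    w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / lr + wd)   if |z| > lamda1
    w = 0                                                       otherwise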
=cut
use Mouse;
extends 'AI::MXNet::Optimizer';
has '+learning_rate' => (default => 0.1);
has 'beta' => (is => "ro", isa => "Num", default => 1);
has 'lamda1' => (is => "ro", isa => "Num", default => 0.9);
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
return [
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context
), # dn
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context
) # n
];
}
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
ArrayRef[AI::MXNet::NDArray] $state
)
{
$self->_update_count($index);
my $wd = $self->_get_wd($index);
my $lr = $self->_get_lr($index);
$grad *= $self->rescale_grad;
if($self->clip_gradient)
{
$grad = AI::MXNet::NDArray->clip(
$grad,
-$self->clip_gradient,
$self->clip_gradient
);
}
my ($dn, $n) = @{ $state };
$dn += $grad - (($n + $grad * $grad)->sqrt - $n->sqrt) * $weight / $lr;
$n += $grad * $grad;
$weight .= ($dn->sign * $self->lamda1 - $dn)
/
(($self->beta + $n->sqrt) / $lr + $wd) * ($dn->abs > $self->lamda1);
}
__PACKAGE__->register;
package AI::MXNet::Adamax;
=head1 NAME
AI::MXNet::Adamax
=cut
=head1 DESCRIPTION
It is a variant of Adam based on the infinity norm,
described at http://arxiv.org/abs/1412.6980 in Section 7.
This optimizer accepts the following parameters in addition to those accepted
by AI::MXNet::Optimizer.
Parameters
----------
beta1 : float, optional
Exponential decay rate for the first moment estimates.
beta2 : float, optional
Exponential decay rate for the second moment estimates.
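A sketch of the update rule, as implemented in the update method below:

    lr_t = learning_rate / (1 - beta1^t)
    m    = beta1 * m + (1 - beta1) * g
    u    = max(beta2 * u, |g|)
    w    = w - lr_t * m / u

where g is the rescaled gradient with the weight decay term added, optionally
clipped to [-clip_gradient, clip_gradient].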
=cut
use Mouse;
extends 'AI::MXNet::Optimizer';
has '+learning_rate' => (default => 0.002);
has 'beta1' => (is => "ro", isa => "Num", default => 0.9);
has 'beta2' => (is => "ro", isa => "Num", default => 0.999);
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
return [
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context,
dtype => $weight->dtype
), # mean
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context,
dtype => $weight->dtype
) # variance
];
}
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
ArrayRef[AI::MXNet::NDArray] $state
)
{
my $wd = $self->_get_wd($index);
my $lr = $self->_get_lr($index);
$self->_update_count($index);
my $t = $self->_index_update_count->{$index};
$lr /= (1 - $self->beta1**$t);
$grad = $grad * $self->rescale_grad + $wd * $weight;
if($self->clip_gradient)
{
$grad = AI::MXNet::NDArray->clip(
$grad,
-$self->clip_gradient,
$self->clip_gradient
);
}
# update m_t and u_t
my($m_t, $u_t) = @{ $state };
$m_t .= $self->beta1 * $m_t + (1 - $self->beta1) * $grad;
$u_t .= AI::MXNet::NDArray->maximum($self->beta2 * $u_t, $grad->abs);
# update weight
$weight -= $lr * $m_t / $u_t;
}
__PACKAGE__->register;
package AI::MXNet::Nadam;
=head1 NAME
AI::MXNet::Nadam
=cut
=head1 DESCRIPTION
The Nesterov Adam optimizer.
Much like Adam is essentially RMSprop with momentum,
Nadam is Adam with Nesterov momentum; see
http://cs229.stanford.edu/proj2015/054_report.pdf.
This optimizer accepts the following parameters in addition to those accepted
by AI::MXNet::Optimizer.
Parameters
----------
beta1 : float, optional
Exponential decay rate for the first moment estimates.
beta2 : float, optional
Exponential decay rate for the second moment estimates.
epsilon : float, optional
Small value to avoid division by 0.
schedule_decay : float, optional
Exponential decay rate for the momentum schedule
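The update below uses a warming momentum schedule (see the update method):

    momentum_t = beta1 * (1 - 0.5 * 0.96^(t * schedule_decay))

with the running product of the momentum_t values kept in m_schedule and used
for bias correction.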
=cut
use Mouse;
extends 'AI::MXNet::Optimizer';
has '+learning_rate' => (default => 0.001);
has 'beta1' => (is => "ro", isa => "Num", default => 0.9);
has 'beta2' => (is => "ro", isa => "Num", default => 0.999);
has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8);
has 'schedule_decay' => (is => "ro", isa => "Num", default => 0.004);
has 'm_schedule' => (is => "rw", default => 1, init_arg => undef);
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
return [
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context,
dtype => $weight->dtype
), # mean
AI::MXNet::NDArray->zeros(
$weight->shape,
ctx => $weight->context,
dtype => $weight->dtype
) # variance
];
}
method update(
Index $index,
AI::MXNet::NDArray $weight,
AI::MXNet::NDArray $grad,
ArrayRef[AI::MXNet::NDArray] $state
)
{
my $wd = $self->_get_wd($index);
my $lr = $self->_get_lr($index);
$self->_update_count($index);
my $t = $self->_index_update_count->{$index};
$grad = $grad * $self->rescale_grad + $wd * $weight;
if($self->clip_gradient)
{
$grad = AI::MXNet::NDArray->clip(
$grad,
-$self->clip_gradient,
$self->clip_gradient
);
}
# warming momentum schedule
my $momentum_t = $self->beta1 * (1 - 0.5 * (0.96**($t * $self->schedule_decay)));
my $momentum_t_1 = $self->beta1 * (1 - 0.5 * (0.96**(($t + 1) * $self->schedule_decay)));
$self->m_schedule($self->m_schedule * $momentum_t);
my $m_schedule_next = $self->m_schedule * $momentum_t_1;
# update m_t and v_t
my ($m_t, $v_t) = @{ $state };
$m_t .= $self->beta1 * $m_t + (1 - $self->beta1) * $grad;
$v_t .= $self->beta2 * $v_t + (1 - $self->beta2) * $grad * $grad;
my $grad_prime = $grad / (1 - $self->m_schedule);
my $m_t_prime = $m_t / (1 - $m_schedule_next);
my $v_t_prime = $v_t / (1 - $self->beta2**$t);
my $m_t_bar = (1 - $momentum_t) * $grad_prime + $momentum_t_1 * $m_t_prime;
# update weight
$weight -= $lr * $m_t_bar / (sqrt($v_t_prime) + $self->epsilon);
}
__PACKAGE__->register;
# updater for kvstore
package AI::MXNet::Updater;
use Mouse;
use Storable qw(thaw freeze);
use overload "&{}" => sub { my $self = shift; sub { $self->call(@_) } },
fallback => 1;
has "optimizer" => (is => "rw", isa => "AI::MXNet::Optimizer");
has "states" => (is => "rw", isa => "HashRef", default => sub { +{} });
has "states_synced" => (is => "rw", isa => "HashRef", default => sub { +{} });
method call(Index $index, AI::MXNet::NDArray $grad, AI::MXNet::NDArray $weight)
{
if(not exists $self->states->{ $index })
{
$self->states->{ $index } = $self->optimizer->create_state($index, $weight);
$self->states_synced->{ $index } = 1;
}
elsif(not $self->states_synced->{ $index })
{
$self->states->{ $index } = $self->sync_state_context($self->states->{ $index }, $weight->context);
$self->states_synced->{ $index } = 1;
}
$self->optimizer->update($index, $weight, $grad, $self->states->{ $index });
}
*slice = *call;
method sync_state_context(Maybe[AI::MXNet::NDArray|ArrayRef[AI::MXNet::NDArray]] $state, AI::MXNet::Context $context)
{
if(blessed $state)
{
return $state->as_in_context($context);
}
elsif(ref $state)
{
return [map { $self->sync_state_context($_, $context) } @{ $state }];
}
return $state;
}
method set_states($states)
{
my $thawed_states = thaw($states);
$self->states($thawed_states);
%{ $self->states_synced } = map { $_ => 0 } keys %{ $thawed_states };
}
method get_states()
{
return freeze($self->states);
}
package AI::MXNet::Optimizer;
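=head2 get_updater
Returns an AI::MXNet::Updater object that can be called as a code reference
with ($index, $grad, $weight); this is the callback used by KVStore to apply
gradients.
Parameters
----------
optimizer : AI::MXNet::Optimizer
The optimizer on whose behalf updates are performed.

A minimal usage sketch; $optimizer, $index, $grad and $weight below are illustrative:

    my $updater = AI::MXNet::Optimizer->get_updater($optimizer);
    $updater->($index, $grad, $weight);  # invokes $optimizer->update(...)
=cut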
method get_updater(AI::MXNet::Optimizer $optimizer)
{
return AI::MXNet::Updater->new(optimizer => $optimizer);
}
1;