in src/ml/optimization/line_search-inl.hpp [329:588]
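/**
 * More-Thuente line search: find a step size along a descent direction that
 * satisfies the strong Wolfe conditions [W1] (sufficient decrease) and [W2]
 * (curvature). (Summary doc-comment; all behavior described here is taken
 * from the implementation below.)
 *
 * \param[in] model       Model implementing first_order_opt_interface.
 * \param[in] init_step   Initial trial step size (must be positive).
 * \param[in] init_func_value Function value at the starting point.
 * \param[in] point       Starting point x0.
 * \param[in] gradient    Gradient of the objective at x0.
 * \param[in] direction   Search direction d; requires gradient.dot(direction) < 0.
 * \param[in] function_scaling Optional scaling applied to f and its gradient.
 * \param[in] reg         Optional smooth regularizer added to the objective.
 * \param[in] max_function_evaluations Budget on function evaluations.
 * \returns ls_return holding the chosen step size, success status, and
 *          evaluation counts.
 */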
inline ls_return more_thuente(
first_order_opt_interface& model,
double init_step,
double init_func_value,
DenseVector point,
Vector gradient,
DenseVector direction,
double function_scaling = 1.0,
const std::shared_ptr<smooth_regularizer_interface> reg=NULL,
size_t max_function_evaluations = LS_MAX_ITER){
// Initialize the return object
ls_return stats;
// Input checking: The initial step size must be positive (at least LS_ZERO).
if (init_step <= LS_ZERO){
logprogress_stream << " Error:"
<< " \nInitial step size is less than " << LS_ZERO
<< "." << std::endl;
return stats;
}
// Check that the initial direction is a descent direction.
// This can only occur if your gradients were computed incorrectly
// or the problem is non-convex.
DenseVector x0 = point;
double Dphi0 = gradient.dot(direction);
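// For phi(s) = f(x0 + s*d), the directional derivative at s = 0 is
// phi'(0) = gradient.dot(direction) = Dphi0; a descent direction requires
// phi'(0) < 0 so that small positive steps decrease the objective.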
if ( (init_step <= 0) || (Dphi0 >= OPTIMIZATION_ZERO) ){
logprogress_stream << " Error: Search direction is not a descent direction."
<< " \nDetected numerical difficulties." << std::endl;
return stats;
}
// Initializing local variables
// stx, fx, dgx: Values of the step, function, and derivative at the best
// step.
//
// sty, fy, dgy: Values of the step, function, and derivative at the other
// endpoint of the interval of uncertainty.
//
// st, f, dg : Values of the step, function, and derivative at current
// step
//
// g : Gradient w.r.t x and step (vector) at the current point
//
double stx = LS_ZERO;
double fx = init_func_value;
double dgx = Dphi0; // Derivative of f(x + s d) w.r.t s
double sty = LS_ZERO;
double fy = init_func_value;
double dgy = Dphi0;
double stp = init_step;
double f = init_func_value;
double dg = Dphi0;
Vector g = gradient;
DenseVector reg_gradient(gradient.size());
// Interval [stmin, stmax] of uncertainty
double stmin = LS_ZERO;
double stmax = LS_MAX_STEP_SIZE;
// Flags and local variables
bool brackt = false; // Bracketing or zoom phase?
bool stage1 = true;
double wolfe_func_dec = LS_C1*Dphi0; // Wolfe condition [W1]
double wolfe_curvature = LS_C2*Dphi0; // Wolfe condition [W2]
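// In terms of phi(s) = f(x0 + s*d), the strong Wolfe conditions are:
// [W1] sufficient decrease: phi(s) <= phi(0) + LS_C1 * s * Dphi0
// [W2] curvature:           |phi'(s)| <= LS_C2 * |Dphi0|
// Both cached slopes above are negative because Dphi0 < 0.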
double width = stmax - stmin;
double width2 = 2*width;
// Constants used in this code.
// (Based on http://www.ece.northwestern.edu/~nocedal/lbfgs.html)
const double p5 = 0.5; // Bisection factor
const double p66 = 0.66; // Threshold on the shrinkage of the bracket width
const double xtrapf = 4; // Extrapolation factor for the trial interval
bool infoc = true; // Status returned by cstep (false on failure)
// Start searching
while (true){
// Set the min and max steps based on the current interval of uncertainty.
// Before a bracket is found, the upper bound is extrapolated xtrapf times
// past the last step so the trial interval can grow geometrically.
if (brackt == true){
stmin = std::min(stx,sty);
stmax = std::max(stx,sty);
}else{
stmin = stx;
stmax = stp + xtrapf*(stp - stx);
}
// Force the step to be within the bounds
stp = std::max(stp,LS_ZERO);
stp = std::min(stp,LS_MAX_STEP_SIZE);
// If an unusual termination is to occur then let 'stp' be the lowest point
// obtained so far.
if ( (infoc == false)
|| (brackt && (stmax-stmin <= LS_ZERO))){
logprogress_stream << "Warning:"
<< " Unusual termination criterion reached."
<< "\nReturning the best step found so far."
<< " This typically happens when the number of features is much"
<< " larger than the number of training samples. Consider pruning"
<< " features manually or increasing the regularization value."
<< std::endl;
stp = stx;
}
// Reached func evaluation limit -- return the best one so far.
if (size_t(stats.func_evals) >= max_function_evaluations){
stats.step_size = stx;
stats.status = true;
return stats;
}
// Evaluate the function and gradient at stp and compute the directional
// derivative.
point = x0 + stp * direction;
model.compute_first_order_statistics(point, g, f);
stats.num_passes++;
stats.func_evals++;
stats.gradient_evals++;
if (reg != NULL){
reg->compute_gradient(point, reg_gradient);
f += reg->compute_function_value(point);
g += reg_gradient;
}
if(function_scaling != 1.0) {
f *= function_scaling;
g *= function_scaling;
}
dg = g.dot(direction);
double ftest = init_func_value + stp*wolfe_func_dec; // [W1] threshold: phi(0) + LS_C1*stp*Dphi0
// Termination checking
// Note: Many of the checks and balances here follow Nocedal's code.
// Some are overly defensive and guard against cases that should not occur.
// Rounding errors
if ( (brackt && ((stp <= stmin) || (stp >= stmax))) || (infoc == false)){
logprogress_stream << "Warning: Rounding errors"
<< " prevent further progress. \nThere may not be a step which"
<< " satisfies the sufficient decrease and curvature conditions."
<< " \nTolerances may be too small or dataset may be poorly scaled."
<< " This typically happens when the number of features is much"
<< " larger than the number of training samples. Consider pruning"
<< " features manually or increasing the regularization value."
<< std::endl;
stats.step_size = stp;
stats.status = false;
return stats;
}
// Step has reached LS_MAX_STEP_SIZE while the objective is still decreasing.
if ((stp >= LS_MAX_STEP_SIZE) && (f <= ftest) && (dg <= wolfe_func_dec)){
logprogress_stream << "Warning: Reached max step size."
<< std::endl;
stats.step_size = stp;
stats.status = true;
return stats;
}
// Step has shrunk to LS_ZERO without satisfying the Wolfe conditions.
if ((stp <= LS_ZERO) && ((f > ftest) || (dg >= wolfe_func_dec))){
logprogress_stream << "Error: Reached min step size."
<< " Cannot proceed anymore."
<< std::endl;
stats.step_size = stp;
stats.status = false;
return stats;
}
// The relative width of the interval of uncertainty has hit the tolerance.
if (brackt && (stmax-stmin <= LS_ZERO)){
logprogress_stream << "Error: \nInterval of uncertainty"
<< " smaller than step size limit." << std::endl;
stats.status = false;
return stats;
}
// Wolfe conditions W1 and W2 are satisfied! Woo!
if ((f <= ftest) && (std::abs(dg) <= -wolfe_curvature)){
stats.step_size = stp;
stats.status = true;
return stats;
}
// Stage 1 is a search for steps for which the modified function has
// a nonpositive value and nonnegative derivative.
if ( stage1 && (f <= ftest) && (dg >= wolfe_curvature)){
stage1 = false;
}
// A modified function is used to predict the step only if we have not
// obtained a step for which the modified function has a nonpositive
// function value and nonnegative derivative, and if a lower function
// value has been obtained but the decrease is not sufficient.
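// Modified function (More-Thuente): psi(s) = phi(s) - LS_C1*s*Dphi0, with
// derivative psi'(s) = phi'(s) - LS_C1*Dphi0. The sufficient-decrease test
// [W1] is exactly psi(s) <= phi(0). The fm/fxm/fym and dgm/dgxm/dgym values
// below are psi and psi' at the trial step and the two interval endpoints.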
if (stage1 && (f <= fx) && (f > ftest)){
// Define the modified function and derivative values.
double fm = f - stp*wolfe_func_dec;
double fxm = fx - stx*wolfe_func_dec;
double fym = fy - sty*wolfe_func_dec;
double dgm = dg - wolfe_func_dec;
double dgxm = dgx - wolfe_func_dec;
double dgym = dgy - wolfe_func_dec;
// Call cstep to update the interval of uncertainty and to compute the
// new step.
infoc = cstep(stx,fxm, dgxm,
sty, fym, dgym,
stp, fm, dgm,
brackt,
stmin,stmax);
// Reset the function and gradient values for f.
fx = fxm + stx*wolfe_func_dec;
fy = fym + sty*wolfe_func_dec;
dgx = dgxm + wolfe_func_dec;
dgy = dgym + wolfe_func_dec;
}else{
// Call cstep to update the interval of uncertainty and to compute the
// new step.
infoc = cstep(stx,fx, dgx,
sty, fy, dgy,
stp, f, dg,
brackt,
stmin,stmax);
}
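// cstep (after MINPACK's mcstep) picks the next trial step by safeguarded
// cubic/quadratic interpolation of the endpoint values, updates (stx, sty)
// so they continue to bracket a minimizer once one is bracketed, and
// returns false if its inputs are inconsistent.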
// Force a sufficient decrease in the size of the interval of uncertainty.
if (brackt){
if (std::abs(sty-stx) >= p66*width2){
stp = stx + p5*(sty - stx);
}
width2 = width;
width = std::abs(sty-stx);
}
} // end-of-while
return stats;
}
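// Usage sketch (hypothetical; `logreg` stands in for any concrete model that
// implements first_order_opt_interface -- only compute_first_order_statistics
// is required by this routine):
//
//   DenseVector point = initial_point;
//   Vector gradient(point.size());
//   double f;
//   logreg.compute_first_order_statistics(point, gradient, f);
//   DenseVector direction = -gradient;  // steepest descent
//   ls_return ls = more_thuente(logreg, 1.0, f, point, gradient, direction);
//   if (ls.status) point += ls.step_size * direction;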