inline ls_return more_thuente()

in src/ml/optimization/line_search-inl.hpp [329:588]


inline ls_return more_thuente(
    first_order_opt_interface& model,
    double init_step,
    double init_func_value,
    DenseVector point,
    Vector gradient,
    DenseVector direction,
    double function_scaling = 1.0,
    const std::shared_ptr<smooth_regularizer_interface> reg = nullptr,
    size_t max_function_evaluations = LS_MAX_ITER){


    // Initialize the return object
    ls_return stats;

    // Input checking: Initial step size can't be zero.
    if (init_step <= LS_ZERO){
      logprogress_stream << " Error:"
                         <<" \nInitial step step less than "<< LS_ZERO
                         << "." << std::endl;
      return stats;
    }


    // Check that the initial direction is a descent direction.
    // This can only occur if your gradients were computed incorrectly
    // or the problem is non-convex.
    DenseVector x0 = point;
    double Dphi0 = gradient.dot(direction);
    if ( (init_step <= 0) || (Dphi0 >= OPTIMIZATION_ZERO) ){
      logprogress_stream << " Error: Search direction is not a descent direction."
                         << " \nDetected numerical difficulties." << std::endl;
      return stats;
    }

    // Initializing local variables
    // stx, fx, dgx: Values of the step, function, and derivative at the best
    //               step.
    //
    // sty, fy, dgy: Values of the step, function, and derivative at the other
    //               endpoint of the interval of uncertainty.
    //
    // st, f, dg   : Values of the step, function, and derivative at current
    //               step
    //
    // g           : Gradient w.r.t x and step (vector) at the current point
    //
    double stx = LS_ZERO;
    double fx = init_func_value;
    double dgx = Dphi0;                   // Derivative of f(x + s d) w.r.t s

    double sty = LS_ZERO;
    double fy = init_func_value;
    double dgy = Dphi0;

    double stp = init_step;
    double f = init_func_value;
    double dg = Dphi0;
    Vector g = gradient;
    DenseVector reg_gradient(gradient.size());

    // Interval [stmin, stmax] of uncertainty
    double stmin = LS_ZERO;
    double stmax = LS_MAX_STEP_SIZE;

    // Flags and local variables
    bool brackt = false;                 // Bracketing or zoom phase?
    bool stage1 = true;

    double wolfe_func_dec  = LS_C1*Dphi0;         // Wolfe condition [W1]
    double wolfe_curvature = LS_C2*Dphi0;         // Wolfe condition [W2]
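
    // For reference, the (strong) Wolfe conditions tested inside the loop:
    //   [W1] f(x0 + s*d) <= f(x0) + LS_C1 * s * Dphi0    (sufficient decrease)
    //   [W2] |grad f(x0 + s*d).dot(d)| <= -LS_C2 * Dphi0 (curvature)
    // Since d is a descent direction, Dphi0 < 0, so -LS_C2 * Dphi0 > 0.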
    double width = stmax - stmin;
    double width2 = 2*width;


    // Constants used in this code.
    // (Based on http://www.ece.northwestern.edu/~nocedal/lbfgs.html)
    const double p5 = 0.5;
    const double p66 = 0.66;
    const double xtrapf = 4;
    bool infoc = true;                            // Status of the last cstep call

    // Start searching
    while (true){

      // Set the min and max steps based on the current interval of uncertainty.
      if (brackt == true){
        stmin = std::min(stx,sty);
        stmax = std::max(stx,sty);
      }else{
        stmin = stx;
        stmax = stp + xtrapf*(stp - stx);
      }
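
      // While no bracket has been found, the upper bound is extrapolated to
      // xtrapf times the current step gap beyond the trial step, so the step
      // can grow geometrically until the minimizer is bracketed.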

      // Force the step to be within the bounds
      stp = std::max(stp,LS_ZERO);
      stp = std::min(stp,LS_MAX_STEP_SIZE);

      // If an unusual termination is to occur then let 'stp' be the lowest point
      // obtained so far.
      if ( (infoc == false)
          || (brackt && (stmax-stmin <= LS_ZERO))){

         logprogress_stream << "Warning:"
            << " Unusual termination criterion reached."
            << "\nReturning the best step found so far."
            << " This typically happens when the number of features is much"
            << " larger than the number of training samples. Consider pruning"
            << " features manually or increasing the regularization value."
            << std::endl;
         stp = stx;
      }

      // Reached func evaluation limit -- return the best one so far.
      if (size_t(stats.func_evals) >= max_function_evaluations){
        stats.step_size = stx;
        stats.status = true;
        return stats;
      }

      // Evaluate the function and gradient at stp and compute the directional
      // derivative.
      point = x0 + stp * direction;
      model.compute_first_order_statistics(point, g, f);
      stats.num_passes++;
      stats.func_evals++;
      stats.gradient_evals++;
      if (reg != nullptr){
        reg->compute_gradient(point, reg_gradient);
        f += reg->compute_function_value(point);
        g += reg_gradient;
      }
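      // The regularizer's value and gradient are folded into f and g so the
      // Wolfe conditions below are tested on the full composite objective.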

      if(function_scaling != 1.0) {
        f *= function_scaling;
        g *= function_scaling;
      }
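      // Note: init_func_value and Dphi0 are presumably computed by the caller
      // under the same scaling; otherwise the tests below would compare
      // quantities on different scales.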

      dg = g.dot(direction);

      double ftest = init_func_value + stp*wolfe_func_dec;

      // Termination checking
      // Note: There are many checks and balances carried over from Nocedal's
      // code. Some of them are overly defensive and guard against cases that
      // should not occur.

      // Rounding errors
      if ( (brackt && ((stp <= stmin) || (stp >= stmax))) || (infoc == false)){
          logprogress_stream << "Warning: Rounding errors"
            << " prevent further progress. \nThere may not be a step which"
            << " satisfies the sufficient decrease and curvature conditions."
            << " \nTolerances may be too small or dataset may be poorly scaled."
            << " This typically happens when the number of features is much"
            << " larger than the number of training samples. Consider pruning"
            << " features manually or increasing the regularization value."
            << std::endl;
          stats.step_size = stp;
          stats.status = false;
          return stats;
      }

      // Step is more than LS_MAX_STEP_SIZE
      if ((stp >= LS_MAX_STEP_SIZE) && (f <= ftest) && (dg <= wolfe_func_dec)){
        logprogress_stream << "Warning: Reached max step size."
                           << std::endl;
        stats.step_size = stp;
        stats.status = true;
        return stats;
      }

      // Step is smaller than LS_ZERO
      if ((stp <= LS_ZERO) && ((f > ftest) || (dg >= wolfe_func_dec))){
        logprogress_stream << "Error: Reached min step size."
                           << " Cannot proceed anymore."
                           << std::endl;
        stats.step_size = stp;
        stats.status = false;
        return stats;
      }



      // The width of the interval of uncertainty has shrunk below the
      // tolerance.
      if (brackt && (stmax-stmin <= LS_ZERO)){
        logprogress_stream << "Error: \nInterval of uncertainty is"
                           << " smaller than the step size limit." << std::endl;
        stats.status = false;
        return stats;
      }

      // Wolfe conditions W1 and W2 are satisfied! Woo!
      if ((f <= ftest) && (std::abs(dg) <= -wolfe_curvature)){
        stats.step_size = stp;
        stats.status = true;
        return stats;
      }

      // Stage 1 is a search for steps for which the modified function has
      // a nonpositive value and nonnegative derivative.
      if ( stage1 && (f <= ftest) && (dg >= wolfe_curvature)){
             stage1 = false;
      }


      // A modified function is used to predict the step only if we have not
      // yet obtained a step for which the modified function has a nonpositive
      // value and a nonnegative derivative, and if a lower function value has
      // been obtained but the decrease is not sufficient.

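      // Here the modified function is psi(s) = phi(s) - LS_C1 * s * Dphi0, the
      // gap between the objective along the line and the sufficient-decrease
      // bound [W1]; applying cstep to psi steers the trial step toward the
      // point where [W1] first holds with equality (More & Thuente, 1994).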
      if (stage1 && (f <= fx) && (f > ftest)){

         // Define the modified function and derivative values.
         double fm = f - stp*wolfe_func_dec;
         double fxm = fx - stx*wolfe_func_dec;
         double fym = fy - sty*wolfe_func_dec;
         double dgm = dg - wolfe_func_dec;
         double dgxm = dgx - wolfe_func_dec;
         double dgym = dgy - wolfe_func_dec;

         // Call cstep to update the interval of uncertainty and to compute the
         // new step.
         infoc = cstep(stx,fxm, dgxm,
                       sty, fym, dgym,
                       stp, fm, dgm,
                       brackt,
                       stmin,stmax);

         // Reset the function and gradient values for f.
         fx = fxm + stx*wolfe_func_dec;
         fy = fym + sty*wolfe_func_dec;
         dgx = dgxm + wolfe_func_dec;
         dgy = dgym + wolfe_func_dec;

      }else{

         // Call cstep to update the interval of uncertainty and to compute the
         // new step.
         infoc = cstep(stx,fx, dgx,
                       sty, fy, dgy,
                       stp, f, dg,
                       brackt,
                       stmin,stmax);

      }


      // Force a sufficient decrease in the size of the interval of uncertainty.
      if (brackt){
         if (std::abs(sty-stx) >= p66*width2){
           stp = stx + p5*(sty - stx);
         }

         width2 = width;
         width = std::abs(sty-stx);
      }
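
      // Safeguard: if cstep has not reduced the bracket to at most p66 of its
      // width from two iterations ago, the bisection above halves the
      // interval, so the bracket is guaranteed to keep shrinking.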

    } // end-of-while

    return stats;

}
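
For context, here is a hedged sketch of a call site: one steepest-descent
iteration that accepts the step returned by more_thuente when both Wolfe
conditions hold. The names descent_iteration_sketch, solver_model, x, grad,
dir, and f0 are illustrative assumptions, not part of this file; only
more_thuente, ls_return, DenseVector, Vector, and first_order_opt_interface
come from the code above.

// Hypothetical caller (sketch only, not part of line_search-inl.hpp).
void descent_iteration_sketch(first_order_opt_interface& solver_model,
                              DenseVector& x) {
  double f0 = 0;
  Vector grad(x.size());

  // Objective value and gradient at the current point.
  solver_model.compute_first_order_statistics(x, grad, f0);

  // Steepest-descent direction, so Dphi0 = -grad.dot(grad) < 0.
  DenseVector dir = -grad;

  ls_return ls = more_thuente(solver_model,
                              /* init_step       = */ 1.0,
                              /* init_func_value = */ f0,
                              x, grad, dir);
  if (ls.status) {
    // Accept the step; it satisfies [W1] and [W2].
    x += ls.step_size * dir;
  } else {
    // Line search failed: stop, or fall back to a more conservative update.
  }
}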