lib/maths/common/CBayesianOptimisation.cc (764 lines of code) (raw):

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #include <maths/common/CBayesianOptimisation.h> #include <core/CContainerPrinter.h> #include <core/CIEEE754.h> #include <core/CJsonStateRestoreTraverser.h> #include <core/CMemoryDef.h> #include <core/CPersistUtils.h> #include <core/Constants.h> #include <core/RestoreMacros.h> #include <maths/common/CBasicStatistics.h> #include <maths/common/CChecksum.h> #include <maths/common/CLbfgs.h> #include <maths/common/CLinearAlgebraEigen.h> #include <maths/common/CLinearAlgebraShims.h> #include <maths/common/CMathsFuncs.h> #include <maths/common/COrderings.h> #include <maths/common/CSampling.h> #include <maths/common/CTools.h> #include <boost/math/constants/constants.hpp> #include <boost/math/distributions/normal.hpp> #include <cmath> #include <exception> #include <limits> namespace ml { namespace maths { namespace common { namespace { using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator; using TMeanVarAccumulator = CBasicStatistics::SSampleMeanVar<double>::TAccumulator; using TMinAccumulator = CBasicStatistics::COrderStatisticsHeap<std::pair<double, CBayesianOptimisation::TVector>>; const std::string VERSION_7_5_TAG{"7.5"}; const std::string MIN_BOUNDARY_TAG{"min_boundary"}; const std::string MAX_BOUNDARY_TAG{"max_boundary"}; const std::string ERROR_VARIANCES_TAG{"error_variances"}; const std::string KERNEL_PARAMETERS_TAG{"kernel_parameters"}; const std::string MIN_KERNEL_COORDINATE_DISTANCE_SCALES_TAG{"min_kernel_coordinate_distance_scales"}; const std::string FUNCTION_MEAN_VALUES_TAG{"function_mean_values"}; const std::string RANGE_SHIFT_TAG{"range_shift"}; const std::string RANGE_SCALE_TAG{"range_scale"}; const std::string RESTARTS_TAG{"restarts"}; const std::string RNG_TAG{"rng"}; // The kernel we use is v * I + a(0)^2 * O(I). We fall back to random search when // a(0)^2 < eps * v since for small eps and a reasonable number of dimensions the // expected improvement will be constant in the space we search. We don't terminate // altogether because it is possible that the function we're interpolating has a // narrow deep valley that the Gaussian Process hasn't sampled. const double MINIMUM_KERNEL_SCALE_FOR_EXPECTATION_MAXIMISATION{1e-8}; //! Affine transform \p scale * (\p fx - \p shift). double toScaled(double shift, double scale, double fx) { return scale * (fx - shift); } //! Affine transform \p shift + \p scale / \p fx. double fromScaled(double shift, double scale, double fx) { return shift + fx / scale; } //! A version of the normal c.d.f. which is stable across our target platforms. double stableNormCdf(double z) { return CTools::stable(CTools::safeCdf(boost::math::normal{0.0, 1.0}, z)); } //! A version of the normal p.d.f. which is stable across our target platforms. double stableNormPdf(double z) { return CTools::stable(CTools::safePdf(boost::math::normal{0.0, 1.0}, z)); } double integrate1dKernel(double theta1, double x) { double c{std::sqrt(CTools::pow2(theta1) + MINIMUM_KERNEL_SCALE_FOR_EXPECTATION_MAXIMISATION)}; return CTools::stable(boost::math::constants::root_pi<double>() * (std::erf(c * (1 - x)) + std::erf(c * x)) / (2 * c)); } double integrate1dKernelProduct(double theta1, double xit, double xjt) { double c{std::sqrt(CTools::pow2(theta1) + MINIMUM_KERNEL_SCALE_FOR_EXPECTATION_MAXIMISATION)}; return CTools::stable( boost::math::constants::root_half_pi<double>() / (2 * c) * CTools::stableExp(-0.5 * CTools::pow2(c) * CTools::pow2(xit - xjt)) * (CTools::stable(std::erf(c / boost::math::constants::root_two<double>() * (xit + xjt))) - CTools::stable(std::erf(c / boost::math::constants::root_two<double>() * (xit + xjt - 2))))); } } CBayesianOptimisation::CBayesianOptimisation(TDoubleDoublePrVec parameterBounds, std::size_t restarts) : m_Restarts{restarts}, m_MinBoundary(parameterBounds.size()), m_MaxBoundary(parameterBounds.size()), m_KernelParameters(parameterBounds.size() + 1), m_MinimumKernelCoordinateDistanceScale(parameterBounds.size()) { m_KernelParameters.setOnes(); m_MinimumKernelCoordinateDistanceScale.setConstant( parameterBounds.size(), MINIMUM_KERNEL_COORDINATE_DISTANCE_SCALE); for (std::size_t i = 0; i < parameterBounds.size(); ++i) { m_MinBoundary(i) = parameterBounds[i].first; m_MaxBoundary(i) = parameterBounds[i].second; } } CBayesianOptimisation::CBayesianOptimisation(TVectorVectorPr parameterBounds, std::size_t restarts) : m_Restarts{restarts}, m_MinBoundary{std::move(parameterBounds.first)}, m_MaxBoundary{std::move(parameterBounds.second)}, m_KernelParameters(m_MinBoundary.rows() + 1), m_MinimumKernelCoordinateDistanceScale(m_MinBoundary.rows()) { if (m_MinBoundary.rows() != m_MaxBoundary.rows()) { LOG_ERROR(<< "Input error: sizes of the minimum and maximum boundaries do not match. " << "Please report this error."); } m_KernelParameters.setOnes(); m_MinimumKernelCoordinateDistanceScale.setConstant( m_MaxBoundary.rows(), MINIMUM_KERNEL_COORDINATE_DISTANCE_SCALE); } CBayesianOptimisation::CBayesianOptimisation(core::CStateRestoreTraverser& traverser) { if (traverser.traverseSubLevel([this](auto& traverser_) { return this->acceptRestoreTraverser(traverser_); }) == false) { throw std::runtime_error{"failed to restore Bayesian optimisation"}; } } void CBayesianOptimisation::add(TVector x, double fx, double vx) { if (CMathsFuncs::isFinite(fx) == false || CMathsFuncs::isFinite(vx) == false) { LOG_ERROR(<< "Discarding point (" << x.transpose() << "," << fx << "," << vx << ")"); return; } x = this->to01(std::move(x)); fx = toScaled(m_RangeShift, m_RangeScale, fx); vx = CTools::pow2(m_RangeScale) * vx; std::size_t duplicate(std::find_if(m_FunctionMeanValues.begin(), m_FunctionMeanValues.end(), [&](const auto& value) { return (x - value.first).norm() == 0.0; }) - m_FunctionMeanValues.begin()); if (duplicate < m_FunctionMeanValues.size()) { auto& f = m_FunctionMeanValues[duplicate].second; auto& v = m_ErrorVariances[duplicate]; auto moments = CBasicStatistics::momentsAccumulator(1.0, f, v) + CBasicStatistics::momentsAccumulator(1.0, fx, vx); f = CBasicStatistics::mean(moments); v = CBasicStatistics::maximumLikelihoodVariance(moments); } else { m_FunctionMeanValues.emplace_back(std::move(x), fx); m_ErrorVariances.push_back(vx); } } void CBayesianOptimisation::explainedErrorVariance(double vx) { m_ExplainedErrorVariance = CTools::pow2(m_RangeScale) * vx; } std::size_t CBayesianOptimisation::restarts() const { return m_Restarts; } CBayesianOptimisation::TVectorVectorPr CBayesianOptimisation::boundingBox() const { return {m_MinBoundary, m_MaxBoundary}; } std::pair<CBayesianOptimisation::TVector, CBayesianOptimisation::TOptionalDouble> CBayesianOptimisation::maximumExpectedImprovement(std::size_t numberRounds, double negligibleExpectedImprovement) { // Reapply conditioning and recompute the maximum likelihood kernel parameters. this->maximumLikelihoodKernel(numberRounds); TVector xmax; double fmax{-1.0}; TEIFunc minusEI; TEIGradientFunc minusEIGradient; std::tie(minusEI, minusEIGradient) = this->minusExpectedImprovementAndGradient(); // Use random restarts inside the constraint bounding box. TVector interpolate(m_MinBoundary.size()); TDoubleVec interpolates; CSampling::uniformSample(m_Rng, 0.0, 1.0, 10 * m_Restarts * interpolate.size(), interpolates); TVector a{TVector::Zero(m_MinBoundary.size())}; TVector b{TVector::Ones(m_MaxBoundary.size())}; TVector x; TMeanAccumulator rho_; TMinAccumulator probes{m_Restarts}; for (int i = 0; i < interpolate.size(); ++i) { interpolate(i) = interpolates[i]; } xmax = a + interpolate.cwiseProduct(b - a); if (CTools::pow2(m_KernelParameters(0)) < MINIMUM_KERNEL_SCALE_FOR_EXPECTATION_MAXIMISATION * this->meanErrorVariance()) { for (std::size_t i = interpolate.size(); i < interpolates.size(); /**/) { for (int j = 0; j < interpolate.size(); ++i, ++j) { interpolate(j) = interpolates[i]; } x = a + interpolate.cwiseProduct(b - a); if (this->dissimilarity(x) > this->dissimilarity(xmax)) { xmax = x; } } } else { for (std::size_t i = 0; i < interpolates.size(); /**/) { for (int j = 0; j < interpolate.size(); ++i, ++j) { interpolate(j) = interpolates[i]; } x = a + interpolate.cwiseProduct(b - a); double fx{minusEI(x)}; LOG_TRACE(<< "x = " << x.transpose() << " EI(x) = " << fx); if (-fx > fmax + negligibleExpectedImprovement || this->dissimilarity(x) > this->dissimilarity(xmax)) { xmax = x; fmax = -fx; } rho_.add(std::fabs(fx)); if (-fx > negligibleExpectedImprovement) { probes.add({fx, std::move(x)}); } } // We set rho to give the constraint and objective approximately equal priority // in the following constrained optimisation problem. double rho{CBasicStatistics::mean(rho_)}; LOG_TRACE(<< "rho = " << rho); CLbfgs<TVector> lbfgs{10}; TVector xcand; double fcand; for (auto& x0 : probes) { LOG_TRACE(<< "x0 = " << x0.second.transpose()); std::tie(xcand, fcand) = lbfgs.constrainedMinimize( minusEI, minusEIGradient, a, b, std::move(x0.second), rho); LOG_TRACE(<< "xcand = " << xcand.transpose() << " EI(cand) = " << fcand); if (-fcand > fmax + negligibleExpectedImprovement || this->dissimilarity(xcand) > this->dissimilarity(xmax)) { std::tie(xmax, fmax) = std::make_pair(std::move(xcand), -fcand); } } } TOptionalDouble expectedImprovement; if (fmax >= 0.0 && CMathsFuncs::isFinite(fmax)) { expectedImprovement = fmax / m_RangeScale; } xmax = this->from01(std::move(xmax)); LOG_TRACE(<< "best = " << xmax.transpose() << " EI(best) = " << expectedImprovement); return {std::move(xmax), expectedImprovement}; } double CBayesianOptimisation::evaluate(const TVector& input) const { return this->evaluate(this->kinvf(), input); } double CBayesianOptimisation::evaluate(const TVector& Kinvf, const TVector& input) const { TVector Kxn; std::tie(Kxn, std::ignore) = this->kernelCovariates( m_KernelParameters, this->to01(input), this->meanErrorVariance()); return fromScaled(m_RangeShift, m_RangeScale, Kxn.transpose() * Kinvf); } double CBayesianOptimisation::evaluate1D(const TVector& Kinvf, double input, int dimension) const { auto prodXt = [this](const TVector& x, int t) -> double { double prod{1.0}; for (int d = 0; d < m_MinBoundary.size(); ++d) { if (d != t) { prod *= integrate1dKernel(m_KernelParameters(d + 1), x(d)); } } return prod; }; double sum{0.0}; input = (input - m_MinBoundary(dimension)) / (m_MaxBoundary(dimension) - m_MinBoundary(dimension)); for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { const TVector& x{m_FunctionMeanValues[i].first}; sum += Kinvf(static_cast<int>(i)) * CTools::stableExp(-(CTools::pow2(m_KernelParameters[dimension + 1]) + MINIMUM_KERNEL_COORDINATE_DISTANCE_SCALE) * CTools::pow2(input - x(dimension))) * prodXt(x, dimension); } double f2{this->anovaConstantFactor(Kinvf)}; // We only get cancellation if the signs are the same (and we need also // to take the square root of both sum and f2 for which they need to be // positive). if (std::signbit(sum) == std::signbit(f2)) { // We rewrite theta_0^2 sum - f_0 as (theta + f) * (theta - f) where // theta = theta_0 sum^(1/2) and f = f_0^(1/2) because it has better // numerics. double theta{m_KernelParameters(0) * std::sqrt(std::fabs(sum))}; double f{std::sqrt(std::fabs(f2))}; return fromScaled(m_RangeShift, m_RangeScale, std::copysign(1.0, sum) * (theta + f) * (theta - f)); } return fromScaled(m_RangeShift, m_RangeScale, CTools::pow2(m_KernelParameters(0)) * sum - f2); } double CBayesianOptimisation::evaluate1D(double input, int dimension) const { if (dimension < 0 || dimension > m_MinBoundary.size()) { LOG_ERROR(<< "Input error: dimension " << dimension << " is out of bounds. " << "It should be between 0 and " << m_MinBoundary.size() << "."); return 0.0; } return this->evaluate1D(this->kinvf(), input, dimension); } double CBayesianOptimisation::anovaConstantFactor(const TVector& Kinvf) const { double sum{0.0}; for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { double prod{1.0}; const TVector& x{m_FunctionMeanValues[i].first}; for (int d = 0; d < x.size(); ++d) { prod *= integrate1dKernel(m_KernelParameters(d + 1), x(d)); } sum += Kinvf(i) * prod; } return CTools::pow2(m_KernelParameters(0)) * sum; } double CBayesianOptimisation::anovaConstantFactor() const { return this->anovaConstantFactor(this->kinvf()); } double CBayesianOptimisation::anovaTotalVariance(const TVector& Kinvf) const { auto prodIj = [&Kinvf, this](std::size_t i, std::size_t j) -> double { const TVector& xi{m_FunctionMeanValues[i].first}; const TVector& xj{m_FunctionMeanValues[j].first}; double prod{1.0}; for (int d = 0; d < xi.size(); ++d) { prod *= integrate1dKernelProduct(m_KernelParameters[d + 1], xi(d), xj(d)); } return Kinvf(static_cast<int>(i)) * Kinvf(static_cast<int>(j)) * prod; }; double sum{0.0}; for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { sum += prodIj(i, i); for (std::size_t j = 0; j < i; ++j) { sum += 2.0 * prodIj(i, j); } } double theta2{CTools::pow2(m_KernelParameters(0))}; double f0{this->anovaConstantFactor(Kinvf)}; double scale2{CTools::pow2(m_RangeScale)}; if (sum > 0.0) { // We rewrite theta_0^4 sum - f_0^2 as (theta^2 + f_0) * (theta^2 - f_0) // where theta^2 = theta_0^2 sum^(1/2) because it has better numerics. theta2 *= std::sqrt(sum); double variance{(theta2 + f0) * (theta2 - f0)}; return std::max(0.0, variance / scale2); } return std::max(0.0, (theta2 * theta2 * sum - f0 * f0) / scale2); } double CBayesianOptimisation::anovaTotalVariance() const { return this->anovaTotalVariance(this->kinvf()); } double CBayesianOptimisation::excessCoefficientOfVariation() { double errorVariance{this->meanErrorVariance() / CTools::pow2(m_RangeScale)}; double excessVariance{std::max(this->anovaTotalVariance() - errorVariance, 0.0)}; return excessVariance == 0.0 ? 0.0 : std::sqrt(excessVariance) / m_RangeShift; } double CBayesianOptimisation::anovaMainEffect(const TVector& Kinvf, int dimension) const { auto prodXt = [this](const TVector& x, int t) -> double { double prod{1.0}; for (int d = 0; d < m_MinBoundary.size(); ++d) { if (d != t) { prod *= integrate1dKernel(m_KernelParameters(d + 1), x(d)); } } return prod; }; double sum1{0.0}; double sum2{0.0}; for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { const TVector& xi{m_FunctionMeanValues[i].first}; for (std::size_t j = 0; j < m_FunctionMeanValues.size(); ++j) { const TVector& xj{m_FunctionMeanValues[j].first}; sum1 += Kinvf(static_cast<int>(i)) * Kinvf(static_cast<int>(j)) * prodXt(xi, dimension) * prodXt(xj, dimension) * integrate1dKernelProduct(m_KernelParameters(dimension + 1), xi(dimension), xj(dimension)); } sum2 += Kinvf(static_cast<int>(i)) * integrate1dKernel(m_KernelParameters(dimension + 1), xi(dimension)) * prodXt(xi, dimension); } double scale2{CTools::pow2(m_RangeScale)}; double theta02{CTools::pow2(m_KernelParameters(0))}; double f0{this->anovaConstantFactor()}; double f02{CTools::pow2(f0)}; return (theta02 * (theta02 * sum1 - 2.0 * f0 * sum2) + f02) / scale2; } double CBayesianOptimisation::anovaMainEffect(int dimension) const { if (dimension < 0 || dimension > m_MinBoundary.size()) { LOG_ERROR(<< "Input error: dimension " << dimension << " is out of bounds. " << "It should be between 0 and " << m_MinBoundary.size() << "."); return 0.0; } return this->anovaMainEffect(this->kinvf(), dimension); } CBayesianOptimisation::TDoubleDoublePrVec CBayesianOptimisation::anovaMainEffects() const { TDoubleDoublePrVec mainEffects; mainEffects.reserve(static_cast<std::size_t>(m_MinBoundary.size())); TVector Kinvf{this->kinvf()}; double totalVariance{this->anovaTotalVariance(Kinvf)}; for (int i = 0; i < m_MinBoundary.size(); ++i) { double effect{this->anovaMainEffect(Kinvf, i)}; mainEffects.emplace_back(effect, effect / totalVariance); } LOG_TRACE(<< "variance " << totalVariance << ", main effects " << mainEffects << ", kernel parameters " << m_KernelParameters.transpose()); return mainEffects; } void CBayesianOptimisation::kernelParameters(const TVector& parameters) { if (m_KernelParameters.size() == parameters.size()) { m_KernelParameters = parameters; m_RangeShift = 0.0; m_RangeScale = 1.0; } } std::pair<CBayesianOptimisation::TLikelihoodFunc, CBayesianOptimisation::TLikelihoodGradientFunc> CBayesianOptimisation::minusLikelihoodAndGradient() const { TVector f{this->function()}; double v{this->meanErrorVariance()}; TVector ones; TVector gradient; Eigen::ColPivHouseholderQR<Eigen::MatrixXd> Kqr; TMatrix K; TVector Kinvf; TMatrix Kinv; TMatrix dKdai; double eps{1e-4}; // We need to be careful when we compute the kernel decomposition. Basically, // if the kernel matrix is singular to working precision then if the function // value vector projection onto the null-space has non-zero length the likelihood // function is effectively -infinity. This follow from the fact that although // log(1 / lambda_i) -> +infinity, -1/2 sum_i{ ||f_i||^2 / lambda_i } -> -infinity // faster for all ||f_i|| > 0 and lambda_i sufficiently small. Here {lambda_i} // denote the Eigenvalues of the nullspace. We use a rank revealing decomposition // and compute the likelihood on the row space. auto minusLogLikelihood = [=](const TVector& a) mutable -> double { K = this->kernel(a, v + eps); Kqr.compute(K); Kinvf.noalias() = Kqr.solve(f); // Note that Kqr.logAbsDeterminant() = -infinity if K is singular. double logAbsDet{0.0}; for (int i = 0; i < Kqr.rank(); ++i) { logAbsDet += std::log(std::fabs(Kqr.matrixR()(i, i))); } logAbsDet = CTools::stable(logAbsDet); return 0.5 * (f.transpose() * Kinvf + logAbsDet); }; auto minusLogLikelihoodGradient = [=](const TVector& a) mutable -> TVector { K = this->kernel(a, v + eps); Kqr.compute(K); Kinvf.noalias() = Kqr.solve(f); ones = TVector::Ones(f.size()); Kinv.noalias() = Kqr.solve(TMatrix::Identity(f.size(), f.size())); K.diagonal() -= (v + eps) * ones; gradient = TVector::Zero(a.size()); for (int i = 0; i < Kinvf.size(); ++i) { double di{(Kinvf(i) * Kinvf.transpose() - Kinv.row(i)) * K.col(i)}; gradient(0) -= di / a(0); } for (int i = 1; i < a.size(); ++i) { dKdai = this->dKerneld(a, i); for (int j = 0; j < Kinvf.size(); ++j) { double di{(Kinvf(j) * Kinvf.transpose() - Kinv.row(j)) * dKdai.col(j)}; gradient(i) += 0.5 * di; } } return gradient; }; return {std::move(minusLogLikelihood), std::move(minusLogLikelihoodGradient)}; } std::pair<CBayesianOptimisation::TEIFunc, CBayesianOptimisation::TEIGradientFunc> CBayesianOptimisation::minusExpectedImprovementAndGradient() const { TMatrix K{this->kernel(m_KernelParameters, this->meanErrorVariance())}; Eigen::LDLT<Eigen::MatrixXd> Kldl{K}; TVector Kinvf{Kldl.solve(this->function())}; double vx{this->meanErrorVariance()}; TVector Kxn; TVector KinvKxn; TVector muGradient; TVector sigmaGradient; TVector dKxndx; TVector zGradient; double fmin{ std::min_element(m_FunctionMeanValues.begin(), m_FunctionMeanValues.end(), [](const TVectorDoublePr& lhs, const TVectorDoublePr& rhs) { return lhs.second < rhs.second; }) ->second}; auto EI = [=](const TVector& x) mutable -> double { double Kxx; std::tie(Kxn, Kxx) = this->kernelCovariates(m_KernelParameters, x, vx); if (CMathsFuncs::isNan(Kxx)) { return 0.0; } KinvKxn = Kldl.solve(Kxn); double error{(K.lazyProduct(KinvKxn) - Kxn).norm()}; if (CMathsFuncs::isNan(error) || error > 0.01 * Kxn.norm()) { return 0.0; } double sigma{Kxx - Kxn.transpose() * KinvKxn}; if (sigma <= 0.0) { return 0.0; } double mu{Kxn.transpose() * Kinvf}; sigma = std::sqrt(sigma); double z{(fmin - mu) / sigma}; double cdfz{stableNormCdf(z)}; double pdfz{stableNormPdf(z)}; return -sigma * (z * cdfz + pdfz); }; auto EIGradient = [=](const TVector& x) mutable -> TVector { double Kxx; std::tie(Kxn, Kxx) = this->kernelCovariates(m_KernelParameters, x, vx); if (CMathsFuncs::isNan(Kxx)) { return las::zero(x); } KinvKxn = Kldl.solve(Kxn); double error{(K.lazyProduct(KinvKxn) - Kxn).norm()}; if (CMathsFuncs::isNan(error) || error > 0.01 * Kxn.norm()) { return las::zero(x); } double sigma{Kxx - Kxn.transpose() * KinvKxn}; if (sigma <= 0.0) { return las::zero(x); } double mu{Kxn.transpose() * Kinvf}; sigma = std::sqrt(sigma); double z{(fmin - mu) / sigma}; double cdfz{stableNormCdf(z)}; double pdfz{stableNormPdf(z)}; muGradient.resize(x.size()); sigmaGradient.resize(x.size()); for (int i = 0; i < x.size(); ++i) { dKxndx.resize(Kxn.size()); for (int j = 0; j < Kxn.size(); ++j) { const TVector& xj{m_FunctionMeanValues[j].first}; dKxndx(j) = 2.0 * (m_MinimumKernelCoordinateDistanceScale(0) + CTools::pow2(m_KernelParameters(i + 1))) * (xj(i) - x(i)) * Kxn(j); } muGradient(i) = Kinvf.transpose() * dKxndx; sigmaGradient(i) = -2.0 * KinvKxn.transpose() * dKxndx; } sigmaGradient /= 2.0 * sigma; zGradient = ((mu - fmin) / CTools::pow2(sigma)) * sigmaGradient - muGradient / sigma; return -(z * cdfz + pdfz) * sigmaGradient - sigma * cdfz * zGradient; }; return {std::move(EI), std::move(EIGradient)}; } const CBayesianOptimisation::TVector& CBayesianOptimisation::maximumLikelihoodKernel(std::size_t numberRounds) { if (m_FunctionMeanValues.size() < 2) { return m_KernelParameters; } using TDoubleVecVec = std::vector<TDoubleVec>; auto lastKernelParameters = m_KernelParameters; do { this->precondition(); TLikelihoodFunc l; TLikelihoodGradientFunc g; std::tie(l, g) = this->minusLikelihoodAndGradient(); CLbfgs<TVector> lbfgs{10}; double lmax{l(m_KernelParameters)}; TVector amax{m_KernelParameters}; // Try the current values first. double la; TVector a; std::tie(a, la) = lbfgs.minimize(l, g, m_KernelParameters, 1e-8, 75); if (COrderings::lexicographicalCompare(la, a.norm(), lmax, amax.norm())) { lmax = la; amax = a; } TMinAccumulator probes{m_Restarts - 1}; // We restart optimization with scales of the current values for global probing. TVector::TIndexType n{m_KernelParameters.size()}; TDoubleVecVec scales; scales.reserve(10 * (m_Restarts - 1)); CSampling::sobolSequenceSample(n, 10 * (m_Restarts - 1), scales); for (const auto& scale : scales) { a.noalias() = m_KernelParameters; for (TVector::TIndexType i = 0; i < n; ++i) { a(i) *= CTools::stableExp(CTools::linearlyInterpolate( 0.0, 1.0, std::log(0.2), std::log(2.0), scale[i])); } la = l(a); if (COrderings::lexicographicalCompare(la, a.norm(), lmax, amax.norm())) { lmax = la; amax = a; } probes.add({la, std::move(a)}); } for (auto& a0 : probes) { std::tie(a, la) = lbfgs.minimize(l, g, std::move(a0.second), 1e-8, 75); if (COrderings::lexicographicalCompare(la, a.norm(), lmax, amax.norm())) { lmax = la; amax = std::move(a); } } // Ensure that kernel lengths are always positive. It shouldn't change the results // but improves traceability. m_KernelParameters = amax.cwiseAbs(); LOG_TRACE(<< "kernel parameters = " << m_KernelParameters.transpose()); LOG_TRACE(<< "likelihood = " << -lmax); } while ([&] { bool result = (--numberRounds > 0 && (m_KernelParameters - lastKernelParameters).norm() > 5e-3 * m_KernelParameters.norm()); lastKernelParameters = m_KernelParameters; return result; }()); return m_KernelParameters; } void CBayesianOptimisation::precondition() { // The Gaussian process expects the data to be centred. We also scale the variance. // This is useful if one wants to threshold values such as EI but the scale of the // function values is very different for example if we're modelling the loss surface // for different loss functions. for (auto& value : m_FunctionMeanValues) { value.second = fromScaled(m_RangeShift, m_RangeScale, value.second); } for (auto& variance : m_ErrorVariances) { variance /= CTools::pow2(m_RangeScale); } m_ExplainedErrorVariance /= CTools::pow2(m_RangeScale); TMeanVarAccumulator valueMoments; for (const auto& value : m_FunctionMeanValues) { valueMoments.add(value.second); } m_RangeShift = CBasicStatistics::mean(valueMoments); m_RangeScale = CBasicStatistics::variance(valueMoments) == 0.0 ? 1.0 : 1.0 / std::sqrt(CBasicStatistics::variance(valueMoments)); for (auto& value : m_FunctionMeanValues) { value.second = toScaled(m_RangeShift, m_RangeScale, value.second); } for (auto& variance : m_ErrorVariances) { variance *= CTools::pow2(m_RangeScale); } m_ExplainedErrorVariance *= CTools::pow2(m_RangeScale); } CBayesianOptimisation::TVector CBayesianOptimisation::function() const { TVector result(m_FunctionMeanValues.size()); for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { result(i) = m_FunctionMeanValues[i].second; } return result; } double CBayesianOptimisation::meanErrorVariance() const { // So what are we doing here? When we supply function values we also supply their // error variance. Typically these might be the mean test loss function across // folds and their variance for a particular choice of hyperparameters. Sticking // with this example, the variance allows us to estimate the error w.r.t. the // true generalisation error due to finite sample size. We can think of the source // of this variance as being due to two effects: one which shifts the loss values // in each fold (this might be due to some folds simply having more hard examples) // and another which permutes the order of loss values. A shift in the loss function // is not something we wish to capture in the GP: it shouldn't materially affect // where to choose points to test since any sensible optimisation strategy should // only care about the difference in loss between points, which is unaffected by a // shift. More formally, if we assume the shift and permutation errors are independent // we have for losses l_i, mean loss per fold m_i and mean loss for a given set of // hyperparameters m that the variance is // // sum_i{ (l_i - m)^2 } = sum_i{ (l_i - m_i + m_i - m)^2 } // = sum_i{ (l_i - m_i)^2 } + sum_i{ (m_i - m)^2 } // = "permutation variance" + "shift variance" (1) // // with the cross-term expected to be small by independence. (Note, the independence // assumption is reasonable if one assumes that the shift is due to mismatch in hard // examples since the we choose folds independently at random.) We can estimate the // shift variance by looking at mean loss over all distinct hyperparameter settings // and we assume it is supplied as the parameter m_ExplainedErrorVariance. It should // also be smaller than the variance by construction although for numerical stability // we prevent the difference becoming too small. As discussed, here we wish return // the permutation variance which we get by rearranging (1). TMeanAccumulator variance; variance.add(m_ErrorVariances); return CBasicStatistics::mean(variance) - std::min(m_ExplainedErrorVariance, 0.99 * CBasicStatistics::mean(variance)); } CBayesianOptimisation::TMatrix CBayesianOptimisation::dKerneld(const TVector& a, int k) const { TMatrix result{m_FunctionMeanValues.size(), m_FunctionMeanValues.size()}; for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { result(i, i) = 0.0; const TVector& xi{m_FunctionMeanValues[i].first}; for (std::size_t j = 0; j < i; ++j) { const TVector& xj{m_FunctionMeanValues[j].first}; result(i, j) = result(j, i) = 2.0 * a(k) * CTools::pow2(xi(k - 1) - xj(k - 1)) * this->kernel(a, xi, xj); } } return result; } CBayesianOptimisation::TMatrix CBayesianOptimisation::kernel(const TVector& a, double v) const { TMatrix result{m_FunctionMeanValues.size(), m_FunctionMeanValues.size()}; for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { result(i, i) = CTools::pow2(a(0)) + v; const TVector& xi{m_FunctionMeanValues[i].first}; for (std::size_t j = 0; j < i; ++j) { const TVector& xj{m_FunctionMeanValues[j].first}; result(i, j) = result(j, i) = this->kernel(a, xi, xj); } } return result; } CBayesianOptimisation::TVectorDoublePr CBayesianOptimisation::kernelCovariates(const TVector& a, const TVector& x, double vx) const { double Kxx{CTools::pow2(a(0)) + vx}; TVector Kxn(m_FunctionMeanValues.size()); for (std::size_t i = 0; i < m_FunctionMeanValues.size(); ++i) { Kxn(i) = this->kernel(a, x, m_FunctionMeanValues[i].first); } return {std::move(Kxn), Kxx}; } double CBayesianOptimisation::kernel(const TVector& a, const TVector& x, const TVector& y) const { return CTools::pow2(a(0)) * CTools::stableExp(-(x - y).transpose() * (m_MinimumKernelCoordinateDistanceScale + a.tail(a.size() - 1).cwiseAbs2()) .cwiseProduct(x - y)); } CBayesianOptimisation::TVector CBayesianOptimisation::kinvf() const { TVector Kinvf; TMatrix K{this->kernel(m_KernelParameters, this->meanErrorVariance())}; Kinvf = K.ldlt().solve(this->function()); return Kinvf; } double CBayesianOptimisation::dissimilarity(const TVector& x) const { // This is used as a fallback when GP is very unsure we can actually make progress, // i.e. EI is miniscule. In this case we fallback to a different strategy to break // ties at the probes we used for the GP. We use two criteria: // 1. The average distance to points we already tried: we prefer evaluation points // where the density of points is low, // 2. The minimum distance to any point we've already tried: we assume the loss // is fairly smooth (to bother trying to do better than random search) so any // existing point tells us accurately what the loss will be in its immediate // neighbourhood and running there again is duplicate work. double sum{0.0}; double min{std::numeric_limits<double>::max()}; for (const auto& y : m_FunctionMeanValues) { double dxy{las::distance(x, y.first)}; sum += dxy; min += std::min(min, dxy); } return sum / static_cast<double>(m_FunctionMeanValues.size()) + min; } CBayesianOptimisation::TVector CBayesianOptimisation::to01(TVector x) const { // Self assign so operations are performed inplace. x = (x - m_MinBoundary).cwiseQuotient(m_MaxBoundary - m_MinBoundary); return x; } CBayesianOptimisation::TVector CBayesianOptimisation::from01(TVector x) const { x = m_MinBoundary + x.cwiseProduct(m_MaxBoundary - m_MinBoundary); return x; } void CBayesianOptimisation::acceptPersistInserter(core::CStatePersistInserter& inserter) const { try { core::CPersistUtils::persist(VERSION_7_5_TAG, "", inserter); inserter.insertValue(RNG_TAG, m_Rng.toString()); core::CPersistUtils::persist(MIN_BOUNDARY_TAG, m_MinBoundary, inserter); core::CPersistUtils::persist(MAX_BOUNDARY_TAG, m_MaxBoundary, inserter); core::CPersistUtils::persist(ERROR_VARIANCES_TAG, m_ErrorVariances, inserter); core::CPersistUtils::persist(KERNEL_PARAMETERS_TAG, m_KernelParameters, inserter); core::CPersistUtils::persist(MIN_KERNEL_COORDINATE_DISTANCE_SCALES_TAG, m_MinimumKernelCoordinateDistanceScale, inserter); core::CPersistUtils::persist(FUNCTION_MEAN_VALUES_TAG, m_FunctionMeanValues, inserter); core::CPersistUtils::persist(RANGE_SCALE_TAG, m_RangeScale, inserter); core::CPersistUtils::persist(RANGE_SHIFT_TAG, m_RangeShift, inserter); core::CPersistUtils::persist(RESTARTS_TAG, m_Restarts, inserter); } catch (std::exception& e) { LOG_ERROR(<< "Failed to persist state! " << e.what()); } } bool CBayesianOptimisation::acceptRestoreTraverser(core::CStateRestoreTraverser& traverser) { if (traverser.name() == VERSION_7_5_TAG) { try { do { const std::string& name = traverser.name(); RESTORE(RNG_TAG, m_Rng.fromString(traverser.value())) RESTORE(MIN_BOUNDARY_TAG, core::CPersistUtils::restore(MIN_BOUNDARY_TAG, m_MinBoundary, traverser)) RESTORE(MAX_BOUNDARY_TAG, core::CPersistUtils::restore(MAX_BOUNDARY_TAG, m_MaxBoundary, traverser)) RESTORE(ERROR_VARIANCES_TAG, core::CPersistUtils::restore(ERROR_VARIANCES_TAG, m_ErrorVariances, traverser)) RESTORE(RANGE_SHIFT_TAG, core::CPersistUtils::restore(RANGE_SHIFT_TAG, m_RangeShift, traverser)) RESTORE(RANGE_SCALE_TAG, core::CPersistUtils::restore(RANGE_SCALE_TAG, m_RangeScale, traverser)) RESTORE(RESTARTS_TAG, core::CPersistUtils::restore(RESTARTS_TAG, m_Restarts, traverser)) RESTORE(KERNEL_PARAMETERS_TAG, core::CPersistUtils::restore(KERNEL_PARAMETERS_TAG, m_KernelParameters, traverser)) RESTORE(MIN_KERNEL_COORDINATE_DISTANCE_SCALES_TAG, core::CPersistUtils::restore( MIN_KERNEL_COORDINATE_DISTANCE_SCALES_TAG, m_MinimumKernelCoordinateDistanceScale, traverser)) RESTORE(FUNCTION_MEAN_VALUES_TAG, core::CPersistUtils::restore(FUNCTION_MEAN_VALUES_TAG, m_FunctionMeanValues, traverser)) } while (traverser.next()); } catch (std::exception& e) { LOG_ERROR(<< "Failed to restore state! " << e.what()); return false; } this->checkRestoredInvariants(); return true; } LOG_ERROR(<< "Input error: unsupported state serialization version. Currently supported version: " << VERSION_7_5_TAG); return false; } void CBayesianOptimisation::checkRestoredInvariants() const { VIOLATES_INVARIANT(m_FunctionMeanValues.size(), !=, m_ErrorVariances.size()); VIOLATES_INVARIANT(m_MinBoundary.size(), !=, m_MaxBoundary.size()); VIOLATES_INVARIANT(m_KernelParameters.size(), !=, m_MinBoundary.size() + 1); VIOLATES_INVARIANT(m_MinimumKernelCoordinateDistanceScale.size(), !=, m_MinBoundary.size()); for (const auto& point : m_FunctionMeanValues) { VIOLATES_INVARIANT(point.first.size(), !=, m_MinBoundary.size()); } } std::size_t CBayesianOptimisation::memoryUsage() const { std::size_t mem{core::memory::dynamicSize(m_MinBoundary)}; mem += core::memory::dynamicSize(m_MaxBoundary); mem += core::memory::dynamicSize(m_FunctionMeanValues); mem += core::memory::dynamicSize(m_ErrorVariances); mem += core::memory::dynamicSize(m_KernelParameters); mem += core::memory::dynamicSize(m_MinimumKernelCoordinateDistanceScale); return mem; } std::size_t CBayesianOptimisation::estimateMemoryUsage(std::size_t numberParameters, std::size_t numberRounds) { std::size_t boundaryMemoryUsage{2 * numberParameters * sizeof(double)}; std::size_t functionMeanValuesMemoryUsage{numberRounds * sizeof(TVectorDoublePr)}; std::size_t errorVariancesMemoryUsage{numberRounds * sizeof(double)}; std::size_t kernelParametersMemoryUsage{(numberParameters + 1) * sizeof(double)}; std::size_t minimumKernelCoordinateDistanceScale{numberParameters * sizeof(double)}; return sizeof(CBayesianOptimisation) + boundaryMemoryUsage + functionMeanValuesMemoryUsage + errorVariancesMemoryUsage + kernelParametersMemoryUsage + minimumKernelCoordinateDistanceScale; } std::uint64_t CBayesianOptimisation::checksum(std::uint64_t seed) const { seed = CChecksum::calculate(seed, m_Rng); seed = CChecksum::calculate(seed, m_Restarts); seed = CChecksum::calculate(seed, m_RangeShift); seed = CChecksum::calculate(seed, m_RangeScale); seed = CChecksum::calculate(seed, m_ExplainedErrorVariance); seed = CChecksum::calculate(seed, m_MinBoundary); seed = CChecksum::calculate(seed, m_MaxBoundary); seed = CChecksum::calculate(seed, m_FunctionMeanValues); seed = CChecksum::calculate(seed, m_ErrorVariances); seed = CChecksum::calculate(seed, m_KernelParameters); return CChecksum::calculate(seed, m_MinimumKernelCoordinateDistanceScale); } const std::size_t CBayesianOptimisation::RESTARTS{10}; const double CBayesianOptimisation::NEGLIGIBLE_EXPECTED_IMPROVEMENT{1e-12}; const double CBayesianOptimisation::MINIMUM_KERNEL_COORDINATE_DISTANCE_SCALE{1e-3}; } } }