lib/maths/analytics/unittest/CBoostedTreeTest.cc (3,073 lines of code) (raw):

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the following additional limitation. Functionality enabled by the
 * files subject to the Elastic License 2.0 may only be used in production when
 * invoked by an Elasticsearch process with a license key installed that permits
 * use of machine learning features. You may not use this file except in
 * compliance with the Elastic License 2.0 and the foregoing additional
 * limitation.
 */

#include <core/CContainerPrinter.h>
#include <core/CDataFrame.h>
#include <core/CJsonStatePersistInserter.h>
#include <core/CLogger.h>
#include <core/CMemoryDef.h>
#include <core/CPackedBitVector.h>
#include <core/CRegex.h>
#include <core/CStopWatch.h>
#include <core/CVectorRange.h>

#include <maths/analytics/CBoostedTree.h>
#include <maths/analytics/CBoostedTreeFactory.h>
#include <maths/analytics/CBoostedTreeImpl.h>
#include <maths/analytics/CBoostedTreeLoss.h>

#include <maths/common/CBasicStatistics.h>
#include <maths/common/CBasicStatisticsPersist.h>
#include <maths/common/COrderings.h>
#include <maths/common/COrderingsSimultaneousSort.h>
#include <maths/common/CPRNG.h>
#include <maths/common/CSampling.h>
#include <maths/common/CSpline.h>
#include <maths/common/CTools.h>
#include <maths/common/CToolsDetail.h>

#include <test/BoostTestCloseAbsolute.h>
#include <test/CDataFrameAnalysisSpecificationFactory.h>
#include <test/CRandomNumbers.h>
#include <test/CTestTmpDir.h>

#include <boost/make_shared.hpp>
#include <boost/test/unit_test.hpp>

#include <algorithm>
#include <atomic>
#include <fstream>
#include <functional>
#include <iostream>
#include <limits>
#include <memory>
#include <stdexcept>
#include <streambuf>
#include <utility>
#include <vector>

using namespace ml;

//! \brief Test adapter which forwards to internal hooks of CBoostedTreeImpl
//! so tests can inspect feature sampling, feature bags, persisted state and
//! change-penalised loss. Holds a non-owning reference to the wrapped
//! implementation, so it must not outlive it; copying is disabled.
class CBoostedTreeImplForTest {
public:
    using TAnalysisInstrumentationPtr = maths::analytics::CBoostedTreeImpl::TAnalysisInstrumentationPtr;
    using TLossFunctionUPtr = maths::analytics::CBoostedTreeImpl::TLossFunctionUPtr;
    using TSizeVec = maths::analytics::CBoostedTreeImpl::TSizeVec;
    using TDoubleVec = maths::analytics::CBoostedTreeImpl::TDoubleVec;
    using TBoostedTreeUPtr = std::unique_ptr<maths::analytics::CBoostedTree>;

public:
    explicit CBoostedTreeImplForTest(maths::analytics::CBoostedTreeImpl& treeImpl)
        : m_TreeImpl{treeImpl} {}
    CBoostedTreeImplForTest(const CBoostedTreeImplForTest&) = delete;
    CBoostedTreeImplForTest& operator=(const CBoostedTreeImplForTest&) = delete;

    //! Forward to the implementation's training memory-usage correction.
    static double correctedMemoryUsage(double memoryUsageBytes) {
        return static_cast<double>(maths::analytics::CBoostedTreeImpl::correctedMemoryUsageForTraining(
            memoryUsageBytes));
    }

    //! The per-feature probabilities used when sampling feature bags.
    const TDoubleVec& featureSampleProbabilities() const {
        return m_TreeImpl.featureSampleProbabilities();
    }

    //! Sample the feature bag used for one tree.
    void treeFeatureBag(TDoubleVec& probabilities, TSizeVec& treeFeatureBag) const {
        m_TreeImpl.treeFeatureBag(probabilities, treeFeatureBag);
    }

    //! Sample the feature bag used for one node split, drawn from \p treeFeatureBag.
    void nodeFeatureBag(const TSizeVec& treeFeatureBag,
                        TDoubleVec& probabilities,
                        TSizeVec& nodeFeatureBag) const {
        m_TreeImpl.nodeFeatureBag(treeFeatureBag, probabilities, nodeFeatureBag);
    }

    //! Round-trip the wrapped implementation through JSON persist/restore and
    //! return the restored tree bound to \p frame.
    TBoostedTreeUPtr cloneFor(core::CDataFrame& frame, std::size_t dependentVariable) const {
        std::stringstream persistState;
        {
            // Scoped so the inserter flushes its state before we read it back.
            core::CJsonStatePersistInserter inserter(persistState);
            m_TreeImpl.acceptPersistInserter(inserter);
            persistState.flush();
        }
        std::stringstream stateStr0(persistState.str());
        return maths::analytics::CBoostedTreeFactory::constructFromString(stateStr0)
            .restoreFor(frame, dependentVariable);
    }

    //! The mean of the change-penalised loss over the rows in \p rowMask.
    double meanChangePenalisedLoss(core::CDataFrame& frame,
                                   const core::CPackedBitVector& rowMask) const {
        return maths::common::CBasicStatistics::mean(
            m_TreeImpl.meanChangePenalisedLoss(frame, rowMask));
    }

private:
    //! Non-owning reference to the implementation under test.
    maths::analytics::CBoostedTreeImpl& m_TreeImpl;
};

BOOST_TEST_DONT_PRINT_LOG_VALUE(CBoostedTreeImplForTest::TSizeVec::iterator)

BOOST_AUTO_TEST_SUITE(CBoostedTreeTest)

namespace {
using TBoolVec = std::vector<bool>;
using
TDoubleVec = std::vector<double>;
using TDoubleVecVec = std::vector<TDoubleVec>;
using TSizeVec = std::vector<std::size_t>;
using TStrVec = std::vector<std::string>;
using TFactoryFunc = std::function<std::unique_ptr<core::CDataFrame>()>;
using TFactoryFuncVec = std::vector<TFactoryFunc>;
using TFactoryFuncVecVec = std::vector<TFactoryFuncVec>;
using TRowRef = core::CDataFrame::TRowRef;
using TRowItr = core::CDataFrame::TRowItr;
using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanVarAccumulator = maths::common::CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TLossFunctionType = maths::analytics::boosted_tree::ELossType;
using TLossFunctionUPtr = maths::analytics::CBoostedTreeFactory::TLossFunctionUPtr;
using TTargetFunc = std::function<double(const TRowRef&)>;
using TVector = maths::common::CDenseVector<double>;
using TMemoryMappedMatrix = maths::common::CMemoryMappedDenseMatrix<double>;

const double BYTES_IN_MB{static_cast<double>(core::constants::BYTES_IN_MEGABYTES)};

//! \brief Instrumentation stub which records, per monitored task, the progress
//! reported at each 10% milestone, whether progress was monotonic, and the
//! high-water mark of reported memory usage. Progress is accumulated in fixed
//! point (65536 == 100% of a task).
class CTestInstrumentation
    : public maths::analytics::CDataFrameTrainBoostedTreeInstrumentationStub {
public:
    using TIntVec = std::vector<int>;
    // NOTE(review): "STaskProgess" is a typo for "STaskProgress"; left as-is
    // since code outside this view may reference the name.
    struct STaskProgess {
        STaskProgess(std::string name, bool monotonic, TIntVec tenPercentProgressPoints)
            : s_Name{std::move(name)}, s_Monotonic{monotonic},
              s_TenPercentProgressPoints{std::move(tenPercentProgressPoints)} {}
        std::string s_Name;
        bool s_Monotonic;
        TIntVec s_TenPercentProgressPoints;
    };
    using TTaskProgressVec = std::vector<STaskProgess>;

public:
    explicit CTestInstrumentation(bool debug = false)
        : m_Debug{debug}, m_TotalFractionalProgress{0}, m_Monotonic{true},
          m_MemoryUsage{0}, m_MaxMemoryUsage{0} {}

    //! Current progress of the active task as an integer percentage.
    int progress() const {
        return (100 * m_TotalFractionalProgress.load()) / 65536;
    }
    //! Largest memory usage ever reported via updateMemoryUsage.
    std::int64_t maxMemoryUsage() const { return m_MaxMemoryUsage.load(); }
    //! The recorded history of completed tasks.
    TTaskProgressVec taskProgress() const { return m_TaskProgress; }

    //! Archive the previous task's record (if any) and reset counters for \p task.
    void startNewProgressMonitoredTask(const std::string& task) override {
        if (m_Task.empty() == false) {
            m_TaskProgress.emplace_back(m_Task, m_Monotonic.load(), m_TenPercentProgressPoints);
        }
        m_Task = task;
        m_TotalFractionalProgress.store(0);
        m_TenPercentProgressPoints.clear();
        m_Monotonic.store(true);
    }

    //! Accumulate \p fractionalProgress (fixed point, rounded) and record each
    //! 10% milestone crossed. A negative increment flags non-monotonic progress.
    void updateProgress(double fractionalProgress) override {
        int progress{m_TotalFractionalProgress.fetch_add(
            static_cast<int>(65536.0 * fractionalProgress + 0.5))};
        m_Monotonic.store(m_Monotonic.load() && fractionalProgress >= 0.0);
        // This needn't be protected because progress is only written from one thread and
        // the tests arrange that it is never read at the same time it is being written.
        if (m_TenPercentProgressPoints.empty() ||
            100 * progress > 65536 * (m_TenPercentProgressPoints.back() + 10)) {
            m_TenPercentProgressPoints.push_back(10 * ((10 * progress) / 65536));
        }
    }

    //! Apply \p delta to the running memory count and update the high-water
    //! mark with a standard CAS loop (safe against concurrent updates).
    void updateMemoryUsage(std::int64_t delta) override {
        // fetch_add returns the value immediately _before_ adding delta.
        std::int64_t memory{m_MemoryUsage.fetch_add(delta) + delta};
        std::int64_t previousMaxMemoryUsage{m_MaxMemoryUsage.load()};
        while (previousMaxMemoryUsage < memory &&
               m_MaxMemoryUsage.compare_exchange_weak(previousMaxMemoryUsage, memory) == false) {
        }
        if (m_Debug) {
            LOG_DEBUG(<< "current memory = " << m_MemoryUsage.load()
                      << ", high water mark = " << m_MaxMemoryUsage.load());
        }
    }

private:
    bool m_Debug{false};
    std::string m_Task;
    std::atomic_int m_TotalFractionalProgress;
    TIntVec m_TenPercentProgressPoints;
    std::atomic_bool m_Monotonic;
    TTaskProgressVec m_TaskProgress;
    std::atomic<std::int64_t> m_MemoryUsage;
    std::atomic<std::int64_t> m_MaxMemoryUsage;
};

//! Compute (prediction bias, noise-adjusted R^2) of \p actual vs \p target over
//! the test rows [\p beginTestRows, \p endTestRows), skipping rows listed in
//! \p outliers (which must be sorted for the binary search below).
template<typename F, typename G>
auto computeEvaluationMetrics(const core::CDataFrame& frame,
                              std::size_t beginTestRows,
                              std::size_t endTestRows,
                              const F& actual,
                              const G& target,
                              double noiseVariance,
                              const TSizeVec& outliers = {}) {
    TMeanVarAccumulator functionMoments;
    TMeanVarAccumulator modelPredictionErrorMoments;
    frame.readRows(1, beginTestRows, endTestRows, [&](const TRowItr& beginRows, const
TRowItr& endRows) {
        for (auto row = beginRows; row != endRows; ++row) {
            if (std::binary_search(outliers.begin(), outliers.end(), row->index()) == false) {
                functionMoments.add(target(*row));
                modelPredictionErrorMoments.add(target(*row) - actual(*row));
            }
        }
    });
    LOG_TRACE(<< "function moments = " << functionMoments);
    LOG_TRACE(<< "model prediction error moments = " << modelPredictionErrorMoments);
    double functionVariance{maths::common::CBasicStatistics::variance(functionMoments)};
    double predictionErrorMean{maths::common::CBasicStatistics::mean(modelPredictionErrorMoments)};
    double predictionErrorVariance{
        maths::common::CBasicStatistics::variance(modelPredictionErrorMoments)};
    // R^2 with the irreducible noise variance subtracted from both the
    // prediction error variance and the total function variance.
    double rSquared{1.0 - (predictionErrorVariance - noiseVariance) /
                              (functionVariance - noiseVariance)};
    return std::make_pair(predictionErrorMean, rSquared);
}

//! Append \p trainRows + \p testRows rows to \p frame with regressor values
//! from \p regressors. The dependent variable (last column) is set to
//! target + noise for training rows and to "missing" for test rows.
template<typename F>
void fillDataFrame(std::size_t trainRows,
                   std::size_t testRows,
                   std::size_t cols,
                   const TBoolVec& categoricalColumns,
                   const TDoubleVecVec& regressors,
                   const TDoubleVec& noise,
                   const F& target,
                   core::CDataFrame& frame) {
    // Rows may already exist: new rows are appended starting at this offset.
    std::size_t offset{frame.numberRows()};
    std::size_t rows{trainRows + testRows};
    frame.categoricalColumns(categoricalColumns);
    for (std::size_t i = 0; i < rows; ++i) {
        frame.writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
            for (std::size_t j = 0; j < cols - 1; ++j, ++column) {
                *column = regressors[j][i];
            }
        });
    }
    frame.finishWritingRows();
    frame.writeColumns(1, [&](const TRowItr& beginRows, const TRowItr& endRows) {
        for (auto row = beginRows; row != endRows; ++row) {
            if (row->index() >= offset) {
                std::size_t index{row->index() - offset};
                double targetValue{index < trainRows
                                       ? target(*row) + noise[index]
                                       : core::CDataFrame::valueOfMissing()};
                row->writeColumn(cols - 1, targetValue);
            }
        }
    });
}

//! Overload with no categorical columns.
template<typename F>
void fillDataFrame(std::size_t trainRows,
                   std::size_t testRows,
                   std::size_t cols,
                   const TDoubleVecVec& regressors,
                   const TDoubleVec& noise,
                   const F& target,
                   core::CDataFrame& frame) {
    fillDataFrame(trainRows, testRows, cols, TBoolVec(cols, false), regressors,
                  noise, target, frame);
}

//! Write per-row weights computed by \p weight into column \p col of \p frame.
template<typename W>
void fillDataFrameWeights(std::size_t col, const W& weight, core::CDataFrame& frame) {
    frame.writeColumns(1, [&](const TRowItr& beginRows, const TRowItr& endRows) {
        for (auto row = beginRows; row != endRows; ++row) {
            row->writeColumn(col, weight(*row));
        }
    });
}

//! Train MSE regressions on data generated by \p generateFunction and return
//! (bias, R^2) per test. The first test trains on both on-disk and in-memory
//! frames so callers can assert the storage backend doesn't change results.
template<typename F>
auto predictAndComputeEvaluationMetrics(const F& generateFunction,
                                        test::CRandomNumbers& rng,
                                        std::size_t trainRows,
                                        std::size_t testRows,
                                        std::size_t cols,
                                        std::size_t capacity,
                                        double noiseVariance) {
    std::size_t rows{trainRows + testRows};

    TFactoryFunc makeOnDisk{[=] {
        return core::makeDiskStorageDataFrame(test::CTestTmpDir::tmpDir(), cols, rows, capacity)
            .first;
    }};
    TFactoryFunc makeMainMemory{
        [=] { return core::makeMainStorageDataFrame(cols, capacity).first; }};
    TFactoryFuncVecVec factories{{makeOnDisk, makeMainMemory}, {makeMainMemory}, {makeMainMemory}};

    TDoubleVecVec modelBias(factories.size());
    TDoubleVecVec modelRSquared(factories.size());

    for (std::size_t test = 0; test < factories.size(); ++test) {
        auto target = generateFunction(rng, cols);

        TDoubleVecVec x(cols - 1);
        for (std::size_t i = 0; i < cols - 1; ++i) {
            rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
        }
        TDoubleVec noise;
        rng.generateNormalSamples(0.0, noiseVariance, rows, noise);

        // The same data are used for each factory so results are comparable.
        for (const auto& factory : factories[test]) {
            auto frame = factory();
            fillDataFrame(trainRows, testRows, cols, x, noise, target, *frame);

            auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                                  1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                                  .buildForTrain(*frame, cols - 1);
            regression->train();
            regression->predict();

            double bias;
            double rSquared;
            std::tie(bias, rSquared) = computeEvaluationMetrics(
                *frame, trainRows, rows,
                [&](const TRowRef& row) { return regression->prediction(row)[0]; },
                target, noiseVariance / static_cast<double>(rows));
            modelBias[test].push_back(bias);
            modelRSquared[test].push_back(rSquared);
        }
    }

    LOG_DEBUG(<< "bias = " << modelBias);
    LOG_DEBUG(<< " R^2 = " << modelRSquared);

    return std::make_pair(std::move(modelBias), std::move(modelRSquared));
}

//! Overload which trains with the pseudo-Huber loss and excludes \p outliers
//! (sorted row indices) from the evaluation metrics; runs two repeats.
template<typename F>
auto predictAndComputeEvaluationMetrics(const F& generateFunction,
                                        test::CRandomNumbers& rng,
                                        std::size_t trainRows,
                                        std::size_t testRows,
                                        std::size_t cols,
                                        double noiseVariance,
                                        const TSizeVec& outliers) {
    std::size_t rows{trainRows + testRows};

    TDoubleVec modelBias;
    TDoubleVec modelRSquared;

    for (std::size_t i = 0; i < 2; ++i) {
        auto target = generateFunction(rng, cols);

        TDoubleVecVec x(cols - 1);
        for (std::size_t j = 0; j < cols - 1; ++j) {
            rng.generateUniformSamples(0.0, 10.0, rows, x[j]);
        }
        TDoubleVec noise;
        rng.generateNormalSamples(0.0, noiseVariance, rows, noise);

        auto frame = core::makeMainStorageDataFrame(cols, rows).first;
        fillDataFrame(trainRows, testRows, cols, x, noise, target, *frame);

        auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                              1, std::make_unique<maths::analytics::boosted_tree::CPseudoHuber>(1.0))
                              .buildForTrain(*frame, cols - 1);
        regression->train();
        regression->predict();

        double bias;
        double rSquared;
        std::tie(bias, rSquared) = computeEvaluationMetrics(
            *frame, trainRows, rows,
            [&](const TRowRef& row) { return regression->prediction(row)[0]; },
            target, noiseVariance / static_cast<double>(rows), outliers);
        modelBias.push_back(bias);
        modelRSquared.push_back(rSquared);
    }

    LOG_DEBUG(<< "bias = " << modelBias);
    LOG_DEBUG(<< " R^2 = " << modelRSquared);

    return std::make_pair(std::move(modelBias), std::move(modelRSquared));
}

std::size_t maxDepth(const
std::vector<maths::analytics::CBoostedTreeNode>& tree,
                     const maths::analytics::CBoostedTreeNode& node,
                     std::size_t depth) {
    // Recursively compute the maximum depth of the subtree rooted at node.
    std::size_t result{depth};
    if (node.isLeaf() == false) {
        result = std::max(result, maxDepth(tree, tree[node.leftChildIndex()], depth + 1));
        result = std::max(result, maxDepth(tree, tree[node.rightChildIndex()], depth + 1));
    }
    return result;
}

//! Read the whole of \p filename into \p stream, failing the test if the file
//! can't be opened.
void readFileToStream(const std::string& filename, std::stringstream& stream) {
    std::ifstream file(filename);
    BOOST_TEST_REQUIRE(file.is_open());
    std::string str((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
    stream << str;
    stream.flush();
}

const std::size_t OLD_TRAINING_DATA{0};
const std::size_t NEW_TRAINING_DATA{1};
}

BOOST_AUTO_TEST_CASE(testEdgeCases) {

    // Test some edge case inputs fail gracefully.

    auto errorHandler = [](const std::string& error) {
        throw std::runtime_error{error};
    };
    // Turn fatal errors into exceptions so each case can be caught and logged.
    core::CLogger::CScopeSetFatalErrorHandler scope{errorHandler};

    std::size_t cols{2};

    // No data.
    try {
        auto frame = core::makeMainStorageDataFrame(cols).first;
        fillDataFrame(0, 0, 2, {}, {}, [](const TRowRef&) { return 1.0; }, *frame);
        auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                              1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                              .buildForTrain(*frame, cols - 1);
    } catch (const std::exception& e) {
        LOG_DEBUG(<< e.what());
    }

    // No training data.
    try {
        auto frame = core::makeMainStorageDataFrame(cols).first;
        fillDataFrame(0, 1, 2, {{1.0}}, {0.0}, [](const TRowRef&) { return 1.0; }, *frame);
        auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                              1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                              .buildForTrain(*frame, cols - 1);
    } catch (const std::exception& e) {
        LOG_DEBUG(<< e.what());
    }

    // Insufficient training data.
    try {
        auto frame = core::makeMainStorageDataFrame(cols).first;
        fillDataFrame(1, 0, 2, {{1.0}}, {0.0}, [](const TRowRef&) { return 1.0; }, *frame);
        auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                              1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                              .buildForTrain(*frame, cols - 1);
    } catch (const std::exception& e) {
        LOG_DEBUG(<< e.what());
    }

    // Five constant training rows should be enough to train without throwing.
    auto frame = core::makeMainStorageDataFrame(cols).first;
    fillDataFrame(5, 0, 2, {{1.0, 1.0, 1.0, 1.0, 1.0}}, {0.0, 0.0, 0.0, 0.0, 0.0},
                  [](const TRowRef&) { return 1.0; }, *frame);
    BOOST_REQUIRE_NO_THROW(maths::analytics::CBoostedTreeFactory::constructFromParameters(
                               1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                               .buildForTrain(*frame, cols - 1)
                               ->train());
}

BOOST_AUTO_TEST_CASE(testMsePiecewiseConstant) {

    // Test regression quality on piecewise constant function.

    auto generatePiecewiseConstant = [](test::CRandomNumbers& rng, std::size_t cols) {
        // Per feature: an interval [p[2i], p[2i+1]) and a value v[i] which is
        // added to the target when the feature falls in the interval.
        TDoubleVec p;
        TDoubleVec v;
        rng.generateUniformSamples(0.0, 10.0, 2 * cols - 2, p);
        rng.generateUniformSamples(-10.0, 10.0, cols - 1, v);
        for (std::size_t i = 0; i < p.size(); i += 2) {
            std::sort(p.begin() + i, p.begin() + i + 2);
        }
        return [=](const TRowRef& row) {
            double result{0.0};
            for (std::size_t i = 0; i < cols - 1; ++i) {
                if (row[i] >= p[2 * i] && row[i] < p[2 * i + 1]) {
                    result += v[i];
                }
            }
            return result;
        };
    };

    test::CRandomNumbers rng;
    double noiseVariance{0.2};
    std::size_t trainRows{500};
    std::size_t testRows{200};
    std::size_t cols{3};
    std::size_t capacity{250};

    TDoubleVecVec modelBias;
    TDoubleVecVec modelRSquared;
    std::tie(modelBias, modelRSquared) = predictAndComputeEvaluationMetrics(
        generatePiecewiseConstant, rng, trainRows, testRows, cols, capacity, noiseVariance);

    TMeanAccumulator meanModelRSquared;
    for (std::size_t i = 0; i < modelBias.size(); ++i) {
        // In and out-of-core agree.
        for (std::size_t j = 1; j < modelBias[i].size(); ++j) {
            BOOST_REQUIRE_EQUAL(modelBias[i][0], modelBias[i][j]);
            BOOST_REQUIRE_EQUAL(modelRSquared[i][0], modelRSquared[i][j]);
        }

        // Roughly unbiased...
        BOOST_REQUIRE_CLOSE_ABSOLUTE(
            0.0, modelBias[i][0],
            10.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
        // Good R^2...
        BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.91);

        meanModelRSquared.add(modelRSquared[i][0]);
    }
    LOG_DEBUG(<< "mean R^2 = " << maths::common::CBasicStatistics::mean(meanModelRSquared));
    BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanModelRSquared) > 0.95);
}

BOOST_AUTO_TEST_CASE(testMseLinear) {

    // Test regression quality on linear function.

    auto generateLinear = [](test::CRandomNumbers& rng, std::size_t cols) {
        // Random intercepts m and slopes s for each feature.
        TDoubleVec m;
        TDoubleVec s;
        rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
        rng.generateUniformSamples(-10.0, 10.0, cols - 1, s);
        return [=](const TRowRef& row) {
            double result{0.0};
            for (std::size_t i = 0; i < cols - 1; ++i) {
                result += m[i] + s[i] * row[i];
            }
            return result;
        };
    };

    test::CRandomNumbers rng;
    double noiseVariance{100.0};
    std::size_t trainRows{500};
    std::size_t testRows{200};
    std::size_t cols{6};
    std::size_t capacity{250};

    TDoubleVecVec modelBias;
    TDoubleVecVec modelRSquared;
    std::tie(modelBias, modelRSquared) = predictAndComputeEvaluationMetrics(
        generateLinear, rng, trainRows, testRows, cols, capacity, noiseVariance);

    TMeanAccumulator meanModelRSquared;
    for (std::size_t i = 0; i < modelBias.size(); ++i) {
        // In and out-of-core agree.
        for (std::size_t j = 1; j < modelBias[i].size(); ++j) {
            BOOST_REQUIRE_EQUAL(modelBias[i][0], modelBias[i][j]);
            BOOST_REQUIRE_EQUAL(modelRSquared[i][0], modelRSquared[i][j]);
        }

        // Unbiased...
        BOOST_REQUIRE_CLOSE_ABSOLUTE(
            0.0, modelBias[i][0],
            6.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
        // Good R^2...
        BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.94);

        meanModelRSquared.add(modelRSquared[i][0]);
    }
    LOG_DEBUG(<< "mean R^2 = " << maths::common::CBasicStatistics::mean(meanModelRSquared));
    BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanModelRSquared) > 0.97);
}

BOOST_AUTO_TEST_CASE(testMseNonLinear) {

    // Test regression quality on non-linear function.

    auto generateNonLinear = [](test::CRandomNumbers& rng, std::size_t cols) {
        // Quadratic per-feature terms plus pairwise cross terms.
        cols = cols - 1;

        TDoubleVec mean;
        TDoubleVec slope;
        TDoubleVec curve;
        TDoubleVec cross;
        rng.generateUniformSamples(0.0, 10.0, cols, mean);
        rng.generateUniformSamples(-10.0, 10.0, cols, slope);
        rng.generateUniformSamples(-1.0, 1.0, cols, curve);
        rng.generateUniformSamples(-0.5, 0.5, cols * cols, cross);

        return [=](const TRowRef& row) {
            double result{0.0};
            for (std::size_t i = 0; i < cols; ++i) {
                result += mean[i] + (slope[i] + curve[i] * row[i]) * row[i];
            }
            for (std::size_t i = 0; i < cols; ++i) {
                for (std::size_t j = 0; j < i; ++j) {
                    result += cross[i * cols + j] * row[i] * row[j];
                }
            }
            return result;
        };
    };

    test::CRandomNumbers rng;
    double noiseVariance{100.0};
    std::size_t trainRows{500};
    std::size_t testRows{200};
    std::size_t cols{6};
    std::size_t capacity{500};

    TDoubleVecVec modelBias;
    TDoubleVecVec modelRSquared;
    std::tie(modelBias, modelRSquared) = predictAndComputeEvaluationMetrics(
        generateNonLinear, rng, trainRows, testRows, cols, capacity, noiseVariance);

    TMeanAccumulator meanModelRSquared;
    for (std::size_t i = 0; i < modelBias.size(); ++i) {
        // In and out-of-core agree.
        for (std::size_t j = 1; j < modelBias[i].size(); ++j) {
            BOOST_REQUIRE_EQUAL(modelBias[i][0], modelBias[i][j]);
            BOOST_REQUIRE_EQUAL(modelRSquared[i][0], modelRSquared[i][j]);
        }

        // Unbiased...
        BOOST_REQUIRE_CLOSE_ABSOLUTE(
            0.0, modelBias[i][0],
            4.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
        // Good R^2...
        BOOST_TEST_REQUIRE(modelRSquared[i][0] > 0.97);

        meanModelRSquared.add(modelRSquared[i][0]);
    }
    LOG_DEBUG(<< "mean R^2 = " << maths::common::CBasicStatistics::mean(meanModelRSquared));
    BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanModelRSquared) > 0.98);
}

BOOST_AUTO_TEST_CASE(testHuber) {

    // Test the quality of Huber on linear plus outliers.

    std::size_t trainRows{500};
    std::size_t testRows{200};
    double noiseVariance{100.0};

    // Mark ~2% of rows as outliers (sorted by construction, as required by
    // the binary searches below).
    TSizeVec outliers;
    {
        test::CRandomNumbers rng;
        for (std::size_t i = 0; i < trainRows + testRows; ++i) {
            TDoubleVec u01;
            rng.generateUniformSamples(0.0, 1.0, 1, u01);
            if (u01[0] < 0.02) {
                outliers.push_back(i);
            }
        }
    }
    LOG_DEBUG(<< "outliers = " << outliers);

    auto generateLinearPlusOutliers = [&](test::CRandomNumbers& rng, std::size_t cols) {
        TDoubleVec m;
        TDoubleVec s;
        rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
        rng.generateUniformSamples(-10.0, 10.0, cols - 1, s);
        return [=](const TRowRef& row) {
            // Outlier rows get a large fixed offset of 5 standard deviations.
            double result{std::binary_search(outliers.begin(), outliers.end(), row.index())
                              ? 5.0 * std::sqrt(noiseVariance)
                              : 0.0};
            for (std::size_t i = 0; i < cols - 1; ++i) {
                result += m[i] + s[i] * row[i];
            }
            return result;
        };
    };

    test::CRandomNumbers rng;
    std::size_t cols{6};

    TDoubleVec modelBias;
    TDoubleVec modelRSquared;
    std::tie(modelBias, modelRSquared) = predictAndComputeEvaluationMetrics(
        generateLinearPlusOutliers, rng, trainRows, testRows, cols, noiseVariance, outliers);

    TMeanAccumulator meanModelRSquared;
    for (std::size_t i = 0; i < modelBias.size(); ++i) {
        // Unbiased...
        BOOST_REQUIRE_CLOSE_ABSOLUTE(
            0.0, modelBias[i],
            6.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
        // Good R^2...
        BOOST_TEST_REQUIRE(modelRSquared[i] > 0.94);

        meanModelRSquared.add(modelRSquared[i]);
    }
    LOG_DEBUG(<< "mean R^2 = " << maths::common::CBasicStatistics::mean(meanModelRSquared));
    BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanModelRSquared) > 0.95);
}

// TODO #1744 test quality of MSLE on data with log-normal errors.
//BOOST_AUTO_TEST_CASE(testMsle) {
//}

BOOST_AUTO_TEST_CASE(testNonUnitWeights) {

    // Check that downweighting outlier values improves prediction errors and bias
    // on inlier values.

    std::size_t trainRows{500};
    std::size_t testRows{200};
    std::size_t rows{trainRows + testRows};
    double noiseVariance{25.0};
    std::size_t cols{6};
    std::size_t weightCol{cols};

    // Mark ~5% of rows as outliers (sorted, as required by binary_search below).
    TSizeVec outliers;
    test::CRandomNumbers rng;
    for (std::size_t i = 0; i < trainRows + testRows; ++i) {
        TDoubleVec u01;
        rng.generateUniformSamples(0.0, 1.0, 1, u01);
        if (u01[0] < 0.05) {
            outliers.push_back(i);
        }
    }
    LOG_DEBUG(<< "outliers = " << outliers);

    auto target = [&] {
        TDoubleVec m;
        TDoubleVec s;
        rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
        rng.generateUniformSamples(-10.0, 10.0, cols - 1, s);
        return [=](const TRowRef& row) {
            double result{0.0};
            for (std::size_t i = 0; i < cols - 1; ++i) {
                result += m[i] + s[i] * row[i];
            }
            return result;
        };
    }();
    // The data are generated with an 8 sigma offset on outlier rows; metrics
    // are evaluated against the clean target.
    auto targetWithOutliers = [&](const TRowRef& row) {
        return target(row) + (std::binary_search(outliers.begin(), outliers.end(), row.index())
                                  ? 8.0 * std::sqrt(noiseVariance)
                                  : 0.0);
    };
    // Outlier rows get one tenth the weight of inlier rows.
    auto weight = [&](const TRowRef& row) {
        return std::binary_search(outliers.begin(), outliers.end(), row.index()) ? 0.1 : 1.0;
    };

    TDoubleVecVec x(cols - 1);
    for (std::size_t i = 0; i < cols - 1; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }
    TDoubleVec noise;
    rng.generateNormalSamples(0.0, noiseVariance, rows, noise);

    // Baseline: train without row weights.
    auto frameWithoutWeights = core::makeMainStorageDataFrame(cols, rows).first;
    frameWithoutWeights->columnNames({"x1", "x2", "x3", "x4", "x5", "x6"});
    fillDataFrame(trainRows, testRows, cols, x, noise, targetWithOutliers, *frameWithoutWeights);
    auto regressionWithoutWeights =
        maths::analytics::CBoostedTreeFactory::constructFromParameters(
            1, std::make_unique<maths::analytics::boosted_tree::CMse>())
            .buildForTrain(*frameWithoutWeights, cols - 1);
    regressionWithoutWeights->train();
    regressionWithoutWeights->predict();

    // Train on the same data with the outliers downweighted via column "w".
    auto frameWithWeights = core::makeMainStorageDataFrame(cols + 1, rows).first;
    frameWithWeights->columnNames({"x1", "x2", "x3", "x4", "x5", "x6", "w"});
    fillDataFrame(trainRows, testRows, cols, TBoolVec(cols + 1, false), x, noise,
                  targetWithOutliers, *frameWithWeights);
    fillDataFrameWeights(weightCol, weight, *frameWithWeights);
    auto regressionWithWeights =
        maths::analytics::CBoostedTreeFactory::constructFromParameters(
            1, std::make_unique<maths::analytics::boosted_tree::CMse>())
            .rowWeightColumnName("w")
            .buildForTrain(*frameWithWeights, cols - 1);
    regressionWithWeights->train();
    regressionWithWeights->predict();

    double biasWithoutWeights;
    double rSquaredWithoutWeights;
    std::tie(biasWithoutWeights, rSquaredWithoutWeights) = computeEvaluationMetrics(
        *frameWithoutWeights, trainRows, rows,
        [&](const TRowRef& row) { return regressionWithoutWeights->prediction(row)[0]; },
        target, noiseVariance / static_cast<double>(rows));
    LOG_DEBUG(<< "biasWithoutWeights = " << biasWithoutWeights
              << ", rSquaredWithoutWeights = " << rSquaredWithoutWeights);

    double biasWithWeights;
    double rSquaredWithWeights;
    std::tie(biasWithWeights, rSquaredWithWeights) = computeEvaluationMetrics(
        *frameWithWeights, trainRows, rows,
        [&](const TRowRef& row) { return regressionWithWeights->prediction(row)[0]; },
        target, noiseVariance / static_cast<double>(rows));
    LOG_DEBUG(<< "biasWithWeights = " << biasWithWeights
              << ", rSquaredWithWeights = " << rSquaredWithWeights);

    // Downweighting should substantially reduce bias and prediction error.
    BOOST_TEST_REQUIRE(std::fabs(biasWithWeights) < 0.25 * std::fabs(biasWithoutWeights));
    BOOST_TEST_REQUIRE(1.0 - rSquaredWithWeights < 0.8 * (1.0 - rSquaredWithoutWeights));
}

BOOST_AUTO_TEST_CASE(testLowCardinalityFeatures) {

    // Test a linear model with low cardinality features.

    std::size_t trainRows{500};
    std::size_t testRows{200};
    std::size_t rows{trainRows + testRows};
    double noiseVariance{4.0};
    std::size_t cols{6};

    test::CRandomNumbers rng;

    auto target = [&] {
        TDoubleVec m;
        TDoubleVec s;
        rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
        rng.generateUniformSamples(-2.0, 2.0, cols - 1, s);
        return [=](const TRowRef& row) {
            double result{0.0};
            for (std::size_t i = 0; i < cols - 1; ++i) {
                result += m[i] + s[i] * row[i];
            }
            return result;
        };
    }();

    TDoubleVec noise;
    rng.generateNormalSamples(0.0, noiseVariance, rows, noise);
    for (auto& ni : noise) {
        ni = std::round(ni);
    }
    TDoubleVecVec x(cols - 1);
    for (std::size_t i = 0; i < cols - 1; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }
    // Round roughly half the features to integers to make them low cardinality.
    for (std::size_t i = 0; i < (cols - 1) / 2 + 1; ++i) {
        for (auto& xj : x[i]) {
            xj = std::round(xj);
        }
    }

    auto frame = core::makeMainStorageDataFrame(cols, rows).first;
    fillDataFrame(trainRows, testRows, cols, x, noise, target, *frame);

    auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                          1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                          .buildForTrain(*frame, cols - 1);
    regression->train();
    regression->predict();

    double bias;
    double rSquared;
    std::tie(bias, rSquared) = computeEvaluationMetrics(
        *frame, trainRows, rows,
        [&](const TRowRef& row) { return regression->prediction(row)[0]; },
        target, noiseVariance / static_cast<double>(rows));
    LOG_DEBUG(<< "bias = " << bias << ", rSquared = " << rSquared);
    BOOST_TEST_REQUIRE(rSquared > 0.94);
}
BOOST_AUTO_TEST_CASE(testLowTrainFractionPerFold) {

    // Test regression using a very low train fraction per fold. This should
    // run in seconds, but we don't assert on the runtime because we don't
    // run CI on bare metal, and produce a good quality solution because the
    // final train is still on the full training set.

    test::CRandomNumbers rng;
    double noiseVariance{100.0};
    std::size_t trainRows{10000};
    std::size_t testRows{200};
    std::size_t rows{trainRows + testRows};
    std::size_t cols{8};

    // Random linear target.
    auto target = [&] {
        TDoubleVec m;
        TDoubleVec s;
        rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
        rng.generateUniformSamples(-10.0, 10.0, cols - 1, s);
        return [=](const TRowRef& row) {
            double result{0.0};
            for (std::size_t i = 0; i < cols - 1; ++i) {
                result += m[i] + s[i] * row[i];
            }
            return result;
        };
    }();

    TDoubleVecVec x(cols - 1);
    for (std::size_t i = 0; i < cols - 1; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }
    TDoubleVec noise;
    rng.generateNormalSamples(0.0, noiseVariance, rows, noise);

    auto frame = core::makeMainStorageDataFrame(cols, rows).first;
    fillDataFrame(trainRows, testRows, cols, x, noise, target, *frame);

    // Only 5% of the training data is used per cross-validation fold.
    auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                          1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                          .trainFractionPerFold(0.05)
                          .buildForTrain(*frame, cols - 1);

    core::CStopWatch timer{true};
    regression->train();
    regression->predict();
    LOG_DEBUG(<< "train duration " << timer.stop() << "ms");

    double bias;
    double rSquared;
    std::tie(bias, rSquared) = computeEvaluationMetrics(
        *frame, trainRows, rows,
        [&](const TRowRef& row_) { return regression->prediction(row_)[0]; },
        target, noiseVariance / static_cast<double>(rows));

    // Unbiased...
    BOOST_REQUIRE_CLOSE_ABSOLUTE(
        0.0, bias, 7.0 * std::sqrt(noiseVariance / static_cast<double>(trainRows)));
    // Good R^2...
    BOOST_TEST_REQUIRE(rSquared > 0.98);
}

BOOST_AUTO_TEST_CASE(testHoldoutRowMask) {

    // Check that we can effectively train using loss on a specified holdout set.
    test::CRandomNumbers rng;
    double noiseVariance{10.0};
    std::size_t rows{1000};
    std::size_t cols{6};
    std::size_t numberHoldoutRows{200};

    // Random linear target.
    auto target = [&] {
        TDoubleVec m;
        TDoubleVec s;
        rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
        rng.generateUniformSamples(-10.0, 10.0, cols - 1, s);
        return [=](const TRowRef& row) {
            double result{0.0};
            for (std::size_t i = 0; i < cols - 1; ++i) {
                result += m[i] + s[i] * row[i];
            }
            return result;
        };
    }();

    TDoubleVecVec x(cols - 1);
    for (std::size_t i = 0; i < cols - 1; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }
    TDoubleVec noise;
    rng.generateNormalSamples(0.0, noiseVariance, rows, noise);

    auto frame = core::makeMainStorageDataFrame(cols, rows).first;
    fillDataFrame(rows, 0, cols, x, noise, target, *frame);

    // We disable early stopping because we test hold losses from fine tuning.
    auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                          1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                          .numberHoldoutRows(numberHoldoutRows)
                          .stopHyperparameterOptimizationEarly(false)
                          .buildForTrain(*frame, cols - 1);
    regression->train();
    regression->predict();

    // Test the minimum cross-validation error matches the error we compute for
    // holdout set.

    // The holdout rows are the first numberHoldoutRows rows of the frame.
    core::CPackedBitVector holdoutRowMask(numberHoldoutRows, true);
    holdoutRowMask.extend(false, rows - numberHoldoutRows);

    TMeanVarAccumulator expectedMse;
    frame->readRows(
        1, 0, frame->numberRows(),
        [&](const TRowItr& beginRows, const TRowItr& endRows) {
            for (auto row = beginRows; row != endRows; ++row) {
                double actual{(*row)[cols - 1]};
                double prediction{regression->prediction(*row)[0]};
                expectedMse.add(maths::common::CTools::pow2(actual - prediction));
            }
        },
        &holdoutRowMask);

    auto roundLosses = regression->impl().foldRoundTestLosses()[0];
    // Order losses so present values sort before nullopt; note *lhs/*rhs are
    // evaluated unconditionally here, so this presumably relies on losses
    // being recorded for every round — NOTE(review): confirm.
    auto actualMse = *std::min_element(
        roundLosses.begin(), roundLosses.end(), [](const auto& lhs, const auto& rhs) {
            return maths::common::COrderings::lexicographicalCompare(
                lhs != std::nullopt ? 0 : 1, *lhs, rhs != std::nullopt ? 0 : 1, *rhs);
        });

    BOOST_REQUIRE_CLOSE(maths::common::CBasicStatistics::mean(expectedMse), *actualMse, 1e-3);
}

BOOST_AUTO_TEST_CASE(testIncrementalHoldoutRowMask) {

    // Check that we can effectively incrementally train using loss on a specified
    // holdout set.

    test::CRandomNumbers rng;
    double noiseVariance{10.0};
    std::size_t rows{1000};
    std::size_t cols{6};
    std::size_t extraTrainingRows{200};
    std::size_t numberHoldoutRows{200};

    // Random linear target.
    auto target = [&] {
        TDoubleVec m;
        TDoubleVec s;
        rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
        rng.generateUniformSamples(-10.0, 10.0, cols - 1, s);
        return [=](const TRowRef& row) {
            double result{0.0};
            for (std::size_t i = 0; i < cols - 1; ++i) {
                result += m[i] + s[i] * row[i];
            }
            return result;
        };
    }();

    TDoubleVecVec x(cols - 1);
    for (std::size_t i = 0; i < cols - 1; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }
    TDoubleVec noise;
    rng.generateNormalSamples(0.0, noiseVariance, rows, noise);

    auto frame = core::makeMainStorageDataFrame(cols).first;
    fillDataFrame(rows, 0, cols, x, noise, target, *frame);

    auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                          1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                          .numberHoldoutRows(numberHoldoutRows)
                          .eta({0.02})
                          .dataSummarizationFraction(1.0)
                          .buildForTrain(*frame, cols - 1);
    regression->train();
    regression->predict();

    // Build a new frame containing the original rows plus extra training rows.
    auto newFrame = core::makeMainStorageDataFrame(cols).first;
    fillDataFrame(rows, 0, cols, x, noise, target, *newFrame);
    for (std::size_t i = 0; i < cols - 1; ++i) {
        rng.generateUniformSamples(0.0, 10.0, extraTrainingRows, x[i]);
    }
    rng.generateNormalSamples(0.0, noiseVariance, extraTrainingRows, noise);
    fillDataFrame(extraTrainingRows, 0, cols, x, noise, target, *newFrame);

    core::CPackedBitVector oldTrainingRowMask{rows, true};
    oldTrainingRowMask.extend(false, extraTrainingRows);
    core::CPackedBitVector newTrainingRowMask{~oldTrainingRowMask};

    double
alpha{regression->hyperparameters().depthPenaltyMultiplier().value()}; double gamma{regression->hyperparameters().treeSizePenaltyMultiplier().value()}; double lambda{regression->hyperparameters().leafWeightPenaltyMultiplier().value()}; regression = maths::analytics::CBoostedTreeFactory::constructFromModel(std::move(regression)) .numberHoldoutRows(numberHoldoutRows) .newTrainingRowMask(newTrainingRowMask) .depthPenaltyMultiplier({0.5 * alpha, 2.0 * alpha}) .treeSizePenaltyMultiplier({0.5 * gamma, 2.0 * gamma}) .leafWeightPenaltyMultiplier({0.5 * lambda, 2.0 * lambda}) .forceAcceptIncrementalTraining(true) .buildForTrainIncremental(*newFrame, cols - 1); regression->trainIncremental(); regression->predict(); // Test the minimum cross-validation error matches the error we compute for // holdout set. core::CPackedBitVector holdoutRowMask(numberHoldoutRows, true); holdoutRowMask.extend(false, rows - numberHoldoutRows); TMeanAccumulator expectedMse; TMeanAccumulator expectedMsd; newFrame->readRows( 1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { double actual{(*row)[cols - 1]}; double prediction{regression->prediction(*row)[0]}; double previousPrediction{regression->previousPrediction(*row)[0]}; expectedMse.add(maths::common::CTools::pow2(actual - prediction)); expectedMsd.add(maths::common::CTools::pow2(prediction - previousPrediction)); } }, &holdoutRowMask); auto roundLosses = regression->impl().foldRoundTestLosses()[0]; auto actualMse = *std::min_element( roundLosses.begin(), roundLosses.end(), [](const auto& lhs, const auto& rhs) { return maths::common::COrderings::lexicographicalCompare( lhs == std::nullopt ? 1 : 0, *lhs, rhs == std::nullopt ? 
1 : 0, *rhs); }); BOOST_REQUIRE_CLOSE(maths::common::CBasicStatistics::mean(expectedMse) + 0.01 * maths::common::CBasicStatistics::mean(expectedMsd), *actualMse, 1e-3); } BOOST_AUTO_TEST_CASE(testMseIncrementalForTargetDrift) { // Test incremental training for the MSE objective for drift in the target value. test::CRandomNumbers rng; double noiseVariance{9.0}; std::size_t rows{750}; std::size_t cols{6}; std::size_t extraTrainingRows{250}; TTargetFunc target; TTargetFunc perturbedTarget; std::tie(target, perturbedTarget) = [&] { TDoubleVec p; TDoubleVec v; TDoubleVec dv; rng.generateUniformSamples(0.0, 10.0, 2 * cols - 2, p); rng.generateUniformSamples(-10.0, 10.0, cols - 1, v); rng.generateUniformSamples(-5.0, 5.0, cols - 1, dv); for (std::size_t i = 0; i < p.size(); i += 2) { std::sort(p.begin() + i, p.begin() + i + 2); } auto target_ = [=](const TRowRef& row) { double result{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { if (row[i] >= p[2 * i] && row[i] < p[2 * i + 1]) { result += v[i]; } } return result; }; auto perturbedTarget_ = [=](const TRowRef& row) { double result{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { if (row[i] >= p[2 * i] && row[i] < p[2 * i + 1]) { result += v[i] + dv[i]; } } return result; }; return std::make_pair(target_, perturbedTarget_); }(); TDoubleVecVec x(cols - 1); TDoubleVec noise; for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 4.0, rows, x[i]); } rng.generateNormalSamples(0.0, noiseVariance, rows, noise); auto frame = core::makeMainStorageDataFrame(cols).first; fillDataFrame(rows, 0, cols, x, noise, target, *frame); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .eta({0.02}) // Ensure there are enough trees. 
.buildForTrain(*frame, cols - 1); regression->train(); frame->resizeColumns(1, cols); auto newFrame = core::makeMainStorageDataFrame(cols).first; fillDataFrame(rows, 0, cols, x, noise, target, *newFrame); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 10.0, extraTrainingRows, x[i]); } rng.generateNormalSamples(0.0, noiseVariance, extraTrainingRows, noise); fillDataFrame(extraTrainingRows, 0, cols, x, noise, perturbedTarget, *frame); fillDataFrame(extraTrainingRows, 0, cols, x, noise, perturbedTarget, *newFrame); core::CPackedBitVector oldTrainingRowMask{rows, true}; oldTrainingRowMask.extend(false, extraTrainingRows); core::CPackedBitVector newTrainingRowMask{~oldTrainingRowMask}; TMeanAccumulator mseBeforeIncrementalTraining[2]; TMeanAccumulator mseAfterIncrementalTraining[2]; frame->resizeColumns(1, cols + 1); regression->predict(); // Get the average error on the old and new training data from the old model. frame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { mseBeforeIncrementalTraining[OLD_TRAINING_DATA].add( maths::common::CTools::pow2( (*row)[cols - 1] - regression->prediction(*row)[0])); } }, &oldTrainingRowMask); frame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { mseBeforeIncrementalTraining[NEW_TRAINING_DATA].add( maths::common::CTools::pow2( (*row)[cols - 1] - regression->prediction(*row)[0])); } }, &newTrainingRowMask); double alpha{regression->hyperparameters().depthPenaltyMultiplier().value()}; double gamma{regression->hyperparameters().treeSizePenaltyMultiplier().value()}; double lambda{regression->hyperparameters().leafWeightPenaltyMultiplier().value()}; regression = maths::analytics::CBoostedTreeFactory::constructFromModel(std::move(regression)) .newTrainingRowMask(newTrainingRowMask) .depthPenaltyMultiplier({0.5 * alpha, 
2.0 * alpha}) .treeSizePenaltyMultiplier({0.5 * gamma, 2.0 * gamma}) .leafWeightPenaltyMultiplier({0.5 * lambda, 2.0 * lambda}) .buildForTrainIncremental(*newFrame, cols - 1); regression->trainIncremental(); regression->predict(); // Get the average error on the old and new training data from the new model. newFrame->readRows( 1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { mseAfterIncrementalTraining[OLD_TRAINING_DATA].add(maths::common::CTools::pow2( (*row)[cols - 1] - regression->prediction(*row)[0])); } }, &oldTrainingRowMask); newFrame->readRows( 1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { mseAfterIncrementalTraining[NEW_TRAINING_DATA].add(maths::common::CTools::pow2( (*row)[cols - 1] - regression->prediction(*row)[0])); } }, &newTrainingRowMask); // By construction, the prediction error for the old training data must // increase because the old and new data distributions overlap and their // target values disagree. However, we should see proportionally a much // larger reduction in the new training data prediction error. double errorIncreaseOnOld{ maths::common::CBasicStatistics::mean(mseAfterIncrementalTraining[OLD_TRAINING_DATA]) / maths::common::CBasicStatistics::mean(mseBeforeIncrementalTraining[OLD_TRAINING_DATA]) - 1.0}; double errorDecreaseOnNew{ maths::common::CBasicStatistics::mean(mseBeforeIncrementalTraining[NEW_TRAINING_DATA]) / maths::common::CBasicStatistics::mean(mseAfterIncrementalTraining[NEW_TRAINING_DATA]) - 1.0}; LOG_DEBUG(<< "increase on old = " << errorIncreaseOnOld); LOG_DEBUG(<< "decrease on new = " << errorDecreaseOnNew); BOOST_TEST_REQUIRE(errorDecreaseOnNew > 25.0 * errorIncreaseOnOld); } BOOST_AUTO_TEST_CASE(testMseIncrementalForOutOfDomain) { // Test incremental training for the MSE objective for values out of the training // data domain domain. 
test::CRandomNumbers rng; double noiseVariance{100.0}; std::size_t rows{750}; std::size_t cols{6}; std::size_t extraTrainingRows{250}; auto target = [&] { TDoubleVec m; TDoubleVec s; rng.generateUniformSamples(0.0, 10.0, cols - 1, m); rng.generateUniformSamples(-10.0, 10.0, cols - 1, s); return [=](const TRowRef& row) { double result{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { result += m[i] + s[i] * row[i]; } return result; }; }(); auto frame = core::makeMainStorageDataFrame(cols).first; TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 4.0, rows, x[i]); } TDoubleVec noise; rng.generateNormalSamples(0.0, noiseVariance, rows, noise); fillDataFrame(rows, 0, cols, x, noise, target, *frame); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .eta({0.02}) // Ensure there are enough trees. .buildForTrain(*frame, cols - 1); regression->train(); frame->resizeColumns(1, cols); auto newFrame = core::makeMainStorageDataFrame(cols).first; fillDataFrame(rows, 0, cols, x, noise, target, *newFrame); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(10.0, 15.0, extraTrainingRows, x[i]); } rng.generateNormalSamples(0.0, noiseVariance, extraTrainingRows, noise); fillDataFrame(extraTrainingRows, 0, cols, x, noise, target, *frame); fillDataFrame(extraTrainingRows, 0, cols, x, noise, target, *newFrame); regression->predict(); core::CPackedBitVector oldTrainingRowMask{rows, true}; oldTrainingRowMask.extend(false, extraTrainingRows); core::CPackedBitVector newTrainingRowMask{~oldTrainingRowMask}; TMeanAccumulator mseBeforeIncrementalTraining[2]; TMeanAccumulator mseAfterIncrementalTraining[2]; frame->resizeColumns(1, cols + 1); regression->predict(); // Get the average error on the old and new training data from the old model. 
frame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { mseBeforeIncrementalTraining[OLD_TRAINING_DATA].add( maths::common::CTools::pow2( (*row)[cols - 1] - regression->prediction(*row)[0])); } }, &oldTrainingRowMask); frame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { mseBeforeIncrementalTraining[NEW_TRAINING_DATA].add( maths::common::CTools::pow2( (*row)[cols - 1] - regression->prediction(*row)[0])); } }, &newTrainingRowMask); double alpha{regression->hyperparameters().depthPenaltyMultiplier().value()}; double gamma{regression->hyperparameters().treeSizePenaltyMultiplier().value()}; double lambda{regression->hyperparameters().leafWeightPenaltyMultiplier().value()}; regression = maths::analytics::CBoostedTreeFactory::constructFromModel(std::move(regression)) .newTrainingRowMask(newTrainingRowMask) .depthPenaltyMultiplier({0.5 * alpha, 2.0 * alpha}) .treeSizePenaltyMultiplier({0.5 * gamma, 2.0 * gamma}) .leafWeightPenaltyMultiplier({0.5 * lambda, 2.0 * lambda}) .buildForTrainIncremental(*newFrame, cols - 1); regression->trainIncremental(); regression->predict(); // Get the average error on the old and new training data from the new model. 
newFrame->readRows( 1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { mseAfterIncrementalTraining[OLD_TRAINING_DATA].add(maths::common::CTools::pow2( (*row)[cols - 1] - regression->prediction(*row)[0])); } }, &oldTrainingRowMask); newFrame->readRows( 1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { mseAfterIncrementalTraining[NEW_TRAINING_DATA].add(maths::common::CTools::pow2( (*row)[cols - 1] - regression->prediction(*row)[0])); } }, &newTrainingRowMask); // We should see little change in the prediction errors on the old data // set which doesn't overlap the new data, but a large improvement on // the new data for constant extrapolation performs poorly. double errorIncreaseOnOld{ maths::common::CBasicStatistics::mean(mseAfterIncrementalTraining[OLD_TRAINING_DATA]) / maths::common::CBasicStatistics::mean(mseBeforeIncrementalTraining[OLD_TRAINING_DATA]) - 1.0}; double errorDecreaseOnNew{ maths::common::CBasicStatistics::mean(mseBeforeIncrementalTraining[NEW_TRAINING_DATA]) / maths::common::CBasicStatistics::mean(mseAfterIncrementalTraining[NEW_TRAINING_DATA]) - 1.0}; LOG_DEBUG(<< "increase on old = " << errorIncreaseOnOld); LOG_DEBUG(<< "decrease on new = " << errorDecreaseOnNew); BOOST_TEST_REQUIRE(errorDecreaseOnNew > 100.0 * errorIncreaseOnOld); } BOOST_AUTO_TEST_CASE(testMseIncrementalAddNewTrees) { // Update the base model by allowing 0, 5, and 10 new trees. Verify that // the holdout error is not getting worse when allowing for more model // capacity. 
test::CRandomNumbers rng; double noiseVariance{16.0}; std::size_t batch1Size{500}; std::size_t batch2Size{300}; std::size_t batch1InHoldoutSet{150}; std::size_t batch2InHoldoutSet{50}; std::size_t numberHoldoutRows{batch1InHoldoutSet + batch2InHoldoutSet}; std::size_t cols{6}; auto target = [&] { TDoubleVec m; TDoubleVec s; rng.generateUniformSamples(0.0, 10.0, cols - 1, m); rng.generateUniformSamples(-10.0, 10.0, cols - 1, s); return [=](const TRowRef& row) { double result{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { result += m[i] + s[i] * row[i]; } return result; }; }(); TDoubleVecVec x1Test(cols - 1); TDoubleVecVec x1Train(cols - 1); TDoubleVec noise1Test; TDoubleVec noise1Train; for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 4.0, batch1InHoldoutSet, x1Test[i]); rng.generateUniformSamples(0.0, 4.0, batch1Size - batch1InHoldoutSet, x1Train[i]); } rng.generateNormalSamples(0.0, noiseVariance, batch1InHoldoutSet, noise1Test); rng.generateNormalSamples(0.0, noiseVariance, batch1Size - batch1InHoldoutSet, noise1Train); TDoubleVecVec x2Test(cols - 1); TDoubleVecVec x2Train(cols - 1); TDoubleVec noise2Test; TDoubleVec noise2Train; // The second batch of data extends the domain. for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(2.0, 6.0, batch2InHoldoutSet, x2Test[i]); rng.generateUniformSamples(2.0, 6.0, batch2Size - batch2InHoldoutSet, x2Train[i]); } rng.generateNormalSamples(0.0, noiseVariance, batch2InHoldoutSet, noise2Test); rng.generateNormalSamples(0.0, noiseVariance, batch2Size, noise2Train); // Fill in the first batch. 
auto makeBatch1 = [&] { auto result = core::makeMainStorageDataFrame(cols).first; fillDataFrame(batch1InHoldoutSet, 0, cols, x1Test, noise1Test, target, *result); fillDataFrame(batch1Size - batch1InHoldoutSet, 0, cols, x1Train, noise1Train, target, *result); return result; }; auto batch1 = makeBatch1(); // We arrange to use 150 of the original batch and 50 of the new batch // as a holdout set. auto makeBatch2 = [&] { auto result = core::makeMainStorageDataFrame(cols).first; fillDataFrame(batch1InHoldoutSet, 0, cols, x1Test, noise1Test, target, *result); fillDataFrame(batch2InHoldoutSet, 0, cols, x2Test, noise2Test, target, *result); fillDataFrame(batch1Size - batch1InHoldoutSet, 0, cols, x1Train, noise1Train, target, *result); fillDataFrame(batch2Size - batch2InHoldoutSet, 0, cols, x2Train, noise2Train, target, *result); return result; }; auto baseModel = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .dataSummarizationFraction(1.0) .maximumNumberTrees(4) .numberHoldoutRows(numberHoldoutRows) .buildForTrain(*batch1, cols - 1); baseModel->train(); core::CPackedBitVector batch2RowMask{batch1InHoldoutSet, false}; batch2RowMask.extend(true, batch2InHoldoutSet); batch2RowMask.extend(false, batch1Size - batch1InHoldoutSet); batch2RowMask.extend(true, batch2Size - batch2InHoldoutSet); core::CPackedBitVector holdoutRowMask{numberHoldoutRows, true}; holdoutRowMask.extend(false, batch1Size + batch2Size - numberHoldoutRows); double eta{baseModel->hyperparameters().eta().value()}; double alpha{baseModel->hyperparameters().depthPenaltyMultiplier().value()}; double gamma{baseModel->hyperparameters().treeSizePenaltyMultiplier().value()}; double lambda{baseModel->hyperparameters().leafWeightPenaltyMultiplier().value()}; CBoostedTreeImplForTest baseImpl{baseModel->impl()}; auto cloneBaseModel = [&](core::CDataFrame& frame) { auto tmp_ = makeBatch2(); auto clonedModel = 
maths::analytics::CBoostedTreeFactory::constructFromModel( baseImpl.cloneFor(*tmp_, cols - 1)) .numberHoldoutRows(numberHoldoutRows) .buildForPredict(frame, cols - 1); clonedModel->predict(); return clonedModel; }; auto updateBaseModel = [&](core::CDataFrame& frame, std::size_t maxNumNewTrees) { auto tmp_ = makeBatch2(); // We fix the tree topology penalty because its initialization is // affected by the maxNumNewTrees. Changing the parameter ranges for // trainIncremental means we can no longer be sure that the hold out // loss is no larger when we _optionally_ allow adding extra trees. auto updatedModel = maths::analytics::CBoostedTreeFactory::constructFromModel( baseImpl.cloneFor(*tmp_, cols - 1)) .newTrainingRowMask(batch2RowMask) .eta({0.5 * eta, eta}) .depthPenaltyMultiplier({0.5 * alpha, 2.0 * alpha}) .treeSizePenaltyMultiplier({0.5 * gamma, 2.0 * gamma}) .leafWeightPenaltyMultiplier({0.5 * lambda, 2.0 * lambda}) .treeTopologyChangePenalty({18.0, 22.0}) .maximumNumberNewTrees(maxNumNewTrees) .numberHoldoutRows(numberHoldoutRows) .forceAcceptIncrementalTraining(true) .buildForTrainIncremental(frame, cols - 1); updatedModel->trainIncremental(); updatedModel->predict(); return updatedModel; }; auto computeTestError = [&](core::CDataFrame& frame, const maths::analytics::CBoostedTreeFactory::TBoostedTreeUPtr& model) { TMeanAccumulator squaredError; frame.readRows(1, 0, frame.numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { squaredError.add(maths::common::CTools::pow2( (*row)[cols - 1] - model->prediction(*row)[0])); } }, &holdoutRowMask); return maths::common::CBasicStatistics::mean(squaredError); }; auto computePenalisedTestError = [&](core::CDataFrame& frame, const maths::analytics::CBoostedTreeFactory::TBoostedTreeUPtr& model) { auto modelForTest = CBoostedTreeImplForTest{model->impl()}; return modelForTest.meanChangePenalisedLoss(frame, holdoutRowMask); }; auto frame = makeBatch2(); 
double testErrorBase{computeTestError(*frame, cloneBaseModel(*frame))}; auto frame0 = makeBatch2(); double testError0{computePenalisedTestError(*frame0, updateBaseModel(*frame0, 0))}; auto frame1 = makeBatch2(); double testError5{computePenalisedTestError(*frame1, updateBaseModel(*frame1, 5))}; LOG_DEBUG(<< "Holdout errors: base = " << testErrorBase << ", 0 new trees = " << testError0 << ", 5 new trees = " << testError5); // The initial model has too little capacity so we should get substantial // improvements for adding trees vs just retraining. BOOST_TEST_REQUIRE(0.9 * testError0 >= testError5); } BOOST_AUTO_TEST_CASE(testThreading) { // Test we get the same results whether we run with multiple threads or not. // Note because we compute approximate quantiles for a partition of the data // (one subset for each thread) and merge we get slightly different results // if running multiple vs single threaded. However, we should get the same // results whether we actually execute in parallel or not provided we perform // the same partitioning. Therefore, we test with two logical threads but // with and without starting the thread pool. 
test::CRandomNumbers rng; std::size_t rows{500}; std::size_t cols{6}; std::size_t capacity{100}; auto target = [&] { TDoubleVec m; TDoubleVec s; rng.generateUniformSamples(0.0, 10.0, cols - 1, m); rng.generateUniformSamples(-10.0, 10.0, cols - 1, s); return [m, s, cols](const TRowRef& row) { double result{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { result += m[i] + s[i] * row[i]; } return result; }; }(); TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 10.0, rows, x[i]); } TDoubleVec noise; rng.generateNormalSamples(0.0, 0.1, rows, noise); core::stopDefaultAsyncExecutor(); TDoubleVec modelBias; TDoubleVec modelMse; for (const auto& test : {"serial", "parallel"}) { LOG_DEBUG(<< test); auto frame = core::makeMainStorageDataFrame(cols, capacity).first; fillDataFrame(rows, 0, cols, x, noise, target, *frame); core::CStopWatch watch{true}; auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 2, std::make_unique<maths::analytics::boosted_tree::CMse>()) .buildForTrain(*frame, cols - 1); regression->train(); regression->predict(); LOG_DEBUG(<< "took " << watch.lap() << "ms"); TMeanVarAccumulator modelPredictionErrorMoments; frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { modelPredictionErrorMoments.add(target(*row) - regression->prediction(*row)[0]); } }); LOG_DEBUG(<< "model prediction error moments = " << modelPredictionErrorMoments); modelBias.push_back(maths::common::CBasicStatistics::mean(modelPredictionErrorMoments)); modelMse.push_back(maths::common::CBasicStatistics::variance(modelPredictionErrorMoments)); core::startDefaultAsyncExecutor(2); } BOOST_REQUIRE_EQUAL(modelBias[0], modelBias[1]); BOOST_REQUIRE_EQUAL(modelMse[0], modelMse[1]); core::stopDefaultAsyncExecutor(); } BOOST_AUTO_TEST_CASE(testConstantFeatures) { // Test constant features are excluded from the model. 
test::CRandomNumbers rng; std::size_t rows{500}; std::size_t cols{6}; std::size_t capacity{500}; auto target = [&] { TDoubleVec m; TDoubleVec s; rng.generateUniformSamples(0.0, 10.0, cols - 1, m); rng.generateUniformSamples(-10.0, 10.0, cols - 1, s); return [m, s, cols](const TRowRef& row) { double result{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { result += m[i] + s[i] * row[i]; } return result; }; }(); TDoubleVecVec x(cols - 1, TDoubleVec(rows, 1.0)); for (std::size_t i = 0; i < cols - 2; ++i) { rng.generateUniformSamples(0.0, 10.0, rows, x[i]); } TDoubleVec noise; rng.generateNormalSamples(0.0, 0.1, rows, noise); auto frame = core::makeMainStorageDataFrame(cols, capacity).first; fillDataFrame(rows, 0, cols, x, noise, target, *frame); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .buildForTrain(*frame, cols - 1); regression->train(); TDoubleVec featureWeightsForTraining(regression->featureWeightsForTraining()); LOG_DEBUG(<< "feature weights = " << featureWeightsForTraining); BOOST_TEST_REQUIRE(featureWeightsForTraining[cols - 2] < 1e-4); } BOOST_AUTO_TEST_CASE(testConstantTarget) { // Test we correctly deal with a constant dependent variable. 
test::CRandomNumbers rng; std::size_t rows{500}; std::size_t cols{6}; std::size_t capacity{500}; TDoubleVecVec x(cols - 1); for (auto& xi : x) { rng.generateUniformSamples(0.0, 10.0, rows, xi); } auto frame = core::makeMainStorageDataFrame(cols, capacity).first; fillDataFrame(rows, 0, cols, x, TDoubleVec(rows, 0.0), [](const TRowRef&) { return 1.0; }, *frame); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .buildForTrain(*frame, cols - 1); regression->train(); TMeanAccumulator modelPredictionError; frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { modelPredictionError.add(1.0 - regression->prediction(*row)[0]); } }); LOG_DEBUG(<< "mean prediction error = " << maths::common::CBasicStatistics::mean(modelPredictionError)); BOOST_REQUIRE_EQUAL(0.0, maths::common::CBasicStatistics::mean(modelPredictionError)); } BOOST_AUTO_TEST_CASE(testCategoricalRegressors) { // Test automatic handling of categorical regressors. 
test::CRandomNumbers rng; std::size_t trainRows{1000}; std::size_t testRows{200}; std::size_t rows{trainRows + testRows}; std::size_t cols{6}; std::size_t capacity{500}; TDoubleVecVec offsets{{0.0, 0.0, 12.0, -3.0, 0.0}, {12.0, 1.0, -3.0, 0.0, 0.0, 2.0, 16.0, 0.0, 0.0, -6.0}}; TDoubleVec weights{0.7, 0.2, -0.4}; auto target = [&](const TRowRef& x) { double result{offsets[0][static_cast<std::size_t>(x[0])] + offsets[1][static_cast<std::size_t>(x[1])]}; for (std::size_t i = 2; i < cols - 1; ++i) { result += weights[i - 2] * x[i]; } return result; }; TDoubleVecVec regressors(cols - 1); rng.generateMultinomialSamples({0.0, 1.0, 2.0, 3.0, 4.0}, {0.03, 0.17, 0.3, 0.1, 0.4}, rows, regressors[0]); rng.generateMultinomialSamples( {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}, {0.03, 0.07, 0.08, 0.02, 0.2, 0.15, 0.1, 0.05, 0.26, 0.04}, rows, regressors[1]); for (std::size_t i = 2; i < regressors.size(); ++i) { rng.generateUniformSamples(-10.0, 10.0, rows, regressors[i]); } auto frame = core::makeMainStorageDataFrame(cols, capacity).first; frame->categoricalColumns(TBoolVec{true, true, false, false, false, false}); for (std::size_t i = 0; i < rows; ++i) { frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) { for (std::size_t j = 0; j < cols - 1; ++j, ++column) { *column = regressors[j][i]; } }); } frame->finishWritingRows(); frame->writeColumns(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { double targetValue{row->index() < trainRows ? 
target(*row) : core::CDataFrame::valueOfMissing()}; row->writeColumn(cols - 1, targetValue); } }); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .buildForTrain(*frame, cols - 1); regression->train(); regression->predict(); double modelBias; double modelRSquared; std::tie(modelBias, modelRSquared) = computeEvaluationMetrics( *frame, trainRows, rows, [&](const TRowRef& row) { return regression->prediction(row)[0]; }, target, 0.0); LOG_DEBUG(<< "bias = " << modelBias); LOG_DEBUG(<< " R^2 = " << modelRSquared); BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, modelBias, 0.1); BOOST_TEST_REQUIRE(modelRSquared > 0.97); } BOOST_AUTO_TEST_CASE(testFeatureBags) { // Test sampling of feature bags: // 1. Simple invariants such as tree bag containts node bag. // 2. The selection frequency is almost monotonic increasing with selection // probability. test::CRandomNumbers rng; std::size_t trainRows{1000}; std::size_t testRows{200}; std::size_t rows{trainRows + testRows}; std::size_t cols{6}; std::size_t capacity{500}; TDoubleVecVec offsets{{0.0, 0.0, 12.0, -3.0, 0.0}, {12.0, 1.0, -3.0, 0.0, 0.0, 2.0, 16.0, 0.0, 0.0, -6.0}}; TDoubleVec weights{0.7, 0.2, -0.4}; auto target = [&](const TRowRef& x) { double result{offsets[0][static_cast<std::size_t>(x[0])] + offsets[1][static_cast<std::size_t>(x[1])]}; for (std::size_t i = 2; i < cols - 1; ++i) { result += weights[i - 2] * x[i]; } return result; }; TDoubleVecVec regressors(cols - 1); rng.generateMultinomialSamples({0.0, 1.0, 2.0, 3.0, 4.0}, {0.03, 0.17, 0.3, 0.1, 0.4}, rows, regressors[0]); rng.generateMultinomialSamples( {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0}, {0.03, 0.07, 0.08, 0.02, 0.2, 0.15, 0.1, 0.05, 0.26, 0.04}, rows, regressors[1]); for (std::size_t i = 2; i < regressors.size(); ++i) { rng.generateUniformSamples(-10.0, 10.0, rows, regressors[i]); } auto frame = core::makeMainStorageDataFrame(cols, capacity).first; 
frame->categoricalColumns(TBoolVec{true, true, false, false, false, false}); for (std::size_t i = 0; i < rows; ++i) { frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) { for (std::size_t j = 0; j < cols - 1; ++j, ++column) { *column = regressors[j][i]; } }); } frame->finishWritingRows(); frame->writeColumns(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { double targetValue{row->index() < trainRows ? target(*row) : core::CDataFrame::valueOfMissing()}; row->writeColumn(cols - 1, targetValue); } }); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .buildForTrain(*frame, cols - 1); CBoostedTreeImplForTest impl{regression->impl()}; TSizeVec selectedForTree(impl.featureSampleProbabilities().size(), 0); TSizeVec selectedForNode(impl.featureSampleProbabilities().size(), 0); TDoubleVec probabilities; TSizeVec treeFeatureBag; TSizeVec nodeFeatureBag; for (std::size_t i = 0; i < 5000; ++i) { probabilities = impl.featureSampleProbabilities(); impl.treeFeatureBag(probabilities, treeFeatureBag); for (auto j : treeFeatureBag) { ++selectedForTree[j]; } BOOST_TEST_REQUIRE(treeFeatureBag.empty() == false); BOOST_TEST_REQUIRE( *std::max_element(treeFeatureBag.begin(), treeFeatureBag.end()) < probabilities.size()); BOOST_TEST_REQUIRE(std::is_sorted(treeFeatureBag.begin(), treeFeatureBag.end())); probabilities = impl.featureSampleProbabilities(); impl.nodeFeatureBag(treeFeatureBag, probabilities, nodeFeatureBag); for (auto j : nodeFeatureBag) { ++selectedForNode[j]; } BOOST_TEST_REQUIRE(nodeFeatureBag.empty() == false); BOOST_TEST_REQUIRE(std::is_sorted(nodeFeatureBag.begin(), nodeFeatureBag.end())); TSizeVec difference; std::set_difference(nodeFeatureBag.begin(), nodeFeatureBag.end(), treeFeatureBag.begin(), treeFeatureBag.end(), std::back_inserter(difference)); BOOST_TEST_REQUIRE(difference.empty()); 
BOOST_REQUIRE_EQUAL(nodeFeatureBag.end(), std::unique(nodeFeatureBag.begin(), nodeFeatureBag.end())); } probabilities = impl.featureSampleProbabilities(); maths::common::COrderings::simultaneousSort(probabilities, selectedForTree, selectedForNode); auto distanceToSorted = [](const TSizeVec& selected) { TSizeVec selectedSorted{selected}; std::sort(selectedSorted.begin(), selectedSorted.end()); std::size_t distance{0}; for (std::size_t i = 0; i < selected.size(); ++i) { distance += std::max(selected[i], selectedSorted[i]) - std::min(selected[i], selectedSorted[i]); } return static_cast<double>(distance) / static_cast<double>(std::accumulate(selected.begin(), selected.end(), std::size_t{0})); }; LOG_DEBUG(<< "distanceToSorted(selectedForTree) = " << distanceToSorted(selectedForTree) << ", distanceToSorted(selectedForNode) = " << distanceToSorted(selectedForNode)); BOOST_TEST_REQUIRE(distanceToSorted(selectedForTree) < 0.017); BOOST_TEST_REQUIRE(distanceToSorted(selectedForNode) < 0.017); } BOOST_AUTO_TEST_CASE(testIntegerRegressor) { // Test a simple integer regressor. test::CRandomNumbers rng; std::size_t trainRows{1000}; std::size_t testRows{200}; std::size_t rows{trainRows + testRows}; auto frame = core::makeMainStorageDataFrame(2).first; frame->categoricalColumns(TBoolVec{false, false}); for (std::size_t i = 0; i < rows; ++i) { frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) { TDoubleVec regressor; rng.generateUniformSamples(1.0, 4.0, 1, regressor); *(column++) = std::floor(regressor[0]); *column = i < trainRows ? 
10.0 * std::floor(regressor[0]) : core::CDataFrame::valueOfMissing(); }); } frame->finishWritingRows(); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .buildForTrain(*frame, 1); regression->train(); regression->predict(); double modelBias; double modelRSquared; std::tie(modelBias, modelRSquared) = computeEvaluationMetrics( *frame, trainRows, rows, [&](const TRowRef& row) { return regression->prediction(row)[0]; }, [&](const TRowRef& x) { return 10.0 * x[0]; }, 0.0); LOG_DEBUG(<< "bias = " << modelBias); LOG_DEBUG(<< " R^2 = " << modelRSquared); BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, modelBias, 0.082); BOOST_TEST_REQUIRE(modelRSquared > 0.96); } BOOST_AUTO_TEST_CASE(testSingleSplit) { // We were getting an out-of-bound read in initialization when running on // data with only two distinct values for the target variable. This test // fails intermittently without the fix. test::CRandomNumbers rng; std::size_t rows{100}; std::size_t cols{2}; TDoubleVec x; rng.generateUniformSamples(0.0, 1.0, rows, x); for (auto& xi : x) { xi = std::floor(xi + 0.5); } auto frame = core::makeMainStorageDataFrame(cols).first; frame->categoricalColumns(TBoolVec{false, false}); for (std::size_t i = 0; i < rows; ++i) { frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) { *(column++) = x[i]; *column = 10.0 * x[i]; }); } frame->finishWritingRows(); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .buildForTrain(*frame, cols - 1); regression->train(); double modelBias; double modelRSquared; std::tie(modelBias, modelRSquared) = computeEvaluationMetrics( *frame, 0, rows, [&](const TRowRef& row) { return regression->prediction(row)[0]; }, [](const TRowRef& row) { return 10.0 * row[0]; }, 0.0); LOG_DEBUG(<< "bias = " << modelBias); LOG_DEBUG(<< " R^2 = " << modelRSquared); 
BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, modelBias, 0.21); BOOST_TEST_REQUIRE(modelRSquared > 0.95); } BOOST_AUTO_TEST_CASE(testTranslationInvariance) { // We should get similar performance independent of fixed shifts for the target. test::CRandomNumbers rng; std::size_t trainRows{1000}; std::size_t rows{1200}; std::size_t cols{4}; std::size_t capacity{1200}; TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 10.0, rows, x[i]); } TTargetFunc target{[&](const TRowRef& row) { double result{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { result += row[i]; } return result; }}; TTargetFunc shiftedTarget{[&](const TRowRef& row) { double result{10000.0}; for (std::size_t i = 0; i < cols - 1; ++i) { result += row[i]; } return result; }}; TDoubleVec rsquared; for (const auto& target_ : {target, shiftedTarget}) { auto frame = core::makeMainStorageDataFrame(cols, capacity).first; fillDataFrame(trainRows, rows - trainRows, cols, x, TDoubleVec(rows, 0.0), target_, *frame); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .buildForTrain(*frame, cols - 1); regression->train(); regression->predict(); double modelBias; double modelRSquared; std::tie(modelBias, modelRSquared) = computeEvaluationMetrics( *frame, trainRows, rows, [&](const TRowRef& row) { return regression->prediction(row)[0]; }, target_, 0.0); LOG_DEBUG(<< "bias = " << modelBias); LOG_DEBUG(<< " R^2 = " << modelRSquared); rsquared.push_back(modelRSquared); } BOOST_REQUIRE_CLOSE_ABSOLUTE(rsquared[0], rsquared[1], 0.01); } BOOST_AUTO_TEST_CASE(testDepthBasedRegularization) { // Test that the trained tree depth is correctly limited based on a target. 
test::CRandomNumbers rng; double noiseVariance{100.0}; std::size_t rows{1000}; std::size_t cols{4}; std::size_t capacity{1200}; auto target = [&] { TDoubleVec m; TDoubleVec s; rng.generateUniformSamples(0.0, 10.0, cols - 1, m); rng.generateUniformSamples(-10.0, 10.0, cols - 1, s); return [m, s, cols](const TRowRef& row) { double result{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { result += m[i] + s[i] * row[i]; } return result; }; }(); TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 10.0, rows, x[i]); } TDoubleVec noise; rng.generateNormalSamples(0.0, noiseVariance, rows, noise); for (auto targetDepth : {3.0, 5.0}) { LOG_DEBUG(<< "target depth = " << targetDepth); auto frame = core::makeMainStorageDataFrame(cols, capacity).first; fillDataFrame(rows, 0, cols, x, noise, target, *frame); auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMse>()) .treeSizePenaltyMultiplier({0.0}) .leafWeightPenaltyMultiplier({0.0}) .softTreeDepthLimit({targetDepth}) .softTreeDepthTolerance({0.05}) .buildForTrain(*frame, cols - 1); regression->train(); TMeanAccumulator meanDepth; for (const auto& tree : regression->trainedModel()) { BOOST_TEST_REQUIRE(maxDepth(tree, tree[0], 0) <= static_cast<std::size_t>(targetDepth + 1)); meanDepth.add(static_cast<double>(maxDepth(tree, tree[0], 0))); } LOG_DEBUG(<< "mean depth = " << maths::common::CBasicStatistics::mean(meanDepth)); BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanDepth) > targetDepth - 1.2); } } BOOST_AUTO_TEST_CASE(testBinomialLogisticRegression) { // The idea of this test is to create a random linear relationship between // the feature values and the log-odds of class 1, i.e. // // log-odds(class_1) = sum_i{ w * x_i + noise } // // where, w is some fixed weight vector and x_i denoted the i'th feature vector. 
// // We try to recover this relationship in logistic regression by observing // the actual labels and want to test that we've roughly correctly estimated // the linear function. Because we target the cross-entropy we're effectively // targeting relative error in the estimated probabilities. However, we aren't // interested in accurate characterisation of very small probabilities and so // threshold the KL divergence which is also minimized targeting cross-entropy. test::CRandomNumbers rng; std::size_t trainRows{1000}; std::size_t rows{1200}; std::size_t cols{4}; std::size_t capacity{600}; TMeanAccumulator meanKlDivergence; for (std::size_t test = 0; test < 3; ++test) { TDoubleVec weights; rng.generateUniformSamples(-2.0, 2.0, cols - 1, weights); TDoubleVec noise; rng.generateNormalSamples(0.0, 1.0, rows, noise); TDoubleVec uniform01; rng.generateUniformSamples(0.0, 1.0, rows, uniform01); auto probability = [&](const TRowRef& row) { double x{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { x += weights[i] * row[i]; } return maths::common::CTools::logisticFunction(x + noise[row.index()]); }; auto target = [&](const TRowRef& row) { return uniform01[row.index()] < probability(row) ? 
1.0 : 0.0; }; TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 4.0, rows, x[i]); } auto frame = core::makeMainStorageDataFrame(cols, capacity).first; fillDataFrame(trainRows, rows - trainRows, cols, {false, false, false, true}, x, TDoubleVec(rows, 0.0), target, *frame); auto classifier = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CBinomialLogisticLoss>()) .buildForTrain(*frame, cols - 1); classifier->train(); classifier->predict(); TMeanAccumulator klDivergence; frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { if (row->index() >= trainRows) { double expectedProbability{probability(*row)}; double actualProbability{classifier->prediction(*row)[1]}; klDivergence.add(expectedProbability * std::log(expectedProbability / actualProbability)); } } }); LOG_DEBUG(<< "KL divergence = " << maths::common::CBasicStatistics::mean(klDivergence)); BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(klDivergence) < 0.06); meanKlDivergence.add(maths::common::CBasicStatistics::mean(klDivergence)); } LOG_DEBUG(<< "mean KL divergence = " << maths::common::CBasicStatistics::mean(meanKlDivergence)); BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanKlDivergence) < 0.04); } BOOST_AUTO_TEST_CASE(testBinomialLogisticIncrementalForTargetDrift) { // Test incremental training for the binomial logistic objective for drift in the // target value. 
test::CRandomNumbers rng; std::size_t rows{750}; std::size_t cols{5}; std::size_t extraTrainingRows{500}; auto probability = [&] { TDoubleVec weights{-0.6, 1.0, 0.8, -1.2}; TDoubleVec noise; rng.generateNormalSamples(0.0, 0.5, rows + extraTrainingRows, noise); return [=](const TRowRef& row) { double x{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { x += weights[i] * row[i]; } return maths::common::CTools::logisticFunction(x + noise[row.index()]); }; }(); auto perturbedProbability = [&] { TDoubleVec weights{-0.4, 1.3, 0.9, -1.8}; TDoubleVec noise; rng.generateNormalSamples(0.0, 0.5, rows + extraTrainingRows, noise); return [=](const TRowRef& row) { double x{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { x += weights[i] * row[i]; } return maths::common::CTools::logisticFunction(x + noise[row.index()]); }; }(); auto target = [&] { TDoubleVec uniform01; rng.generateUniformSamples(0.0, 1.0, rows + extraTrainingRows, uniform01); return [=](const TRowRef& row) { return uniform01[row.index()] < probability(row) ? 1.0 : 0.0; }; }(); auto perturbedTarget = [&] { TDoubleVec uniform01; rng.generateUniformSamples(0.0, 1.0, rows + extraTrainingRows, uniform01); return [=](const TRowRef& row) { return uniform01[row.index()] < perturbedProbability(row) ? 
1.0 : 0.0; }; }(); auto frame = core::makeMainStorageDataFrame(cols).first; TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(-3.0, 3.0, rows, x[i]); } fillDataFrame(rows, 0, cols, {false, false, false, false, true}, x, TDoubleVec(rows, 0.0), target, *frame); auto classifier = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CBinomialLogisticLoss>()) .buildForTrain(*frame, cols - 1); classifier->train(); frame->resizeColumns(1, cols); auto newFrame = core::makeMainStorageDataFrame(cols).first; fillDataFrame(rows, 0, cols, {false, false, false, false, true}, x, TDoubleVec(rows, 0.0), target, *newFrame); // If the target drift isn't on a subset of feature space it is an even trade increasing // errors for old and decreasing for new. for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(0.0, 3.0, extraTrainingRows, x[i]); } fillDataFrame(extraTrainingRows, 0, cols, {false, false, false, false, true}, x, TDoubleVec(extraTrainingRows, 0.0), perturbedTarget, *frame); fillDataFrame(extraTrainingRows, 0, cols, {false, false, false, false, true}, x, TDoubleVec(extraTrainingRows, 0.0), perturbedTarget, *newFrame); classifier->predict(); core::CPackedBitVector oldTrainingRowMask{rows, true}; oldTrainingRowMask.extend(false, extraTrainingRows); core::CPackedBitVector newTrainingRowMask{~oldTrainingRowMask}; TMeanAccumulator logRelativeErrorBeforeIncrementalTraining[2]; TMeanAccumulator logRelativeErrorAfterIncrementalTraining[2]; auto addToRelativeError = [&](const TRowRef& row, TMeanAccumulator& logRelativeError) { double expectedProbability{probability(row)}; double actualProbability{classifier->prediction(row)[1]}; logRelativeError.add(std::log(std::max(actualProbability, expectedProbability) / std::min(actualProbability, expectedProbability))); }; auto addToPerturbedRelativeError = [&](const TRowRef& row, TMeanAccumulator& logRelativeError) 
{ double expectedProbability{perturbedProbability(row)}; double actualProbability{classifier->prediction(row)[1]}; logRelativeError.add(std::log(std::max(actualProbability, expectedProbability) / std::min(actualProbability, expectedProbability))); }; frame->resizeColumns(1, cols + 1); classifier->predict(); // Get the average error on the old and new training data from the old model. frame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { addToRelativeError( *row, logRelativeErrorBeforeIncrementalTraining[OLD_TRAINING_DATA]); } }, &oldTrainingRowMask); frame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { addToPerturbedRelativeError( *row, logRelativeErrorBeforeIncrementalTraining[NEW_TRAINING_DATA]); } }, &newTrainingRowMask); double alpha{classifier->hyperparameters().depthPenaltyMultiplier().value()}; double gamma{classifier->hyperparameters().treeSizePenaltyMultiplier().value()}; double lambda{classifier->hyperparameters().leafWeightPenaltyMultiplier().value()}; classifier = maths::analytics::CBoostedTreeFactory::constructFromModel(std::move(classifier)) .newTrainingRowMask(newTrainingRowMask) .depthPenaltyMultiplier({0.5 * alpha, 2.0 * alpha}) .treeSizePenaltyMultiplier({0.5 * gamma, 2.0 * gamma}) .leafWeightPenaltyMultiplier({0.5 * lambda, 2.0 * lambda}) .buildForTrainIncremental(*newFrame, cols - 1); classifier->trainIncremental(); classifier->predict(); // Get the average error on the old and new training data from the new model. 
newFrame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { addToRelativeError( *row, logRelativeErrorAfterIncrementalTraining[OLD_TRAINING_DATA]); } }, &oldTrainingRowMask); newFrame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { addToPerturbedRelativeError( *row, logRelativeErrorAfterIncrementalTraining[NEW_TRAINING_DATA]); } }, &newTrainingRowMask); // We should see little change in the prediction errors on the old data // set which doesn't overlap the new data, but a large improvement on // the new data for constant extrapolation performs poorly. double errorIncreaseOnOld{ maths::common::CBasicStatistics::mean( logRelativeErrorAfterIncrementalTraining[OLD_TRAINING_DATA]) / maths::common::CBasicStatistics::mean( logRelativeErrorBeforeIncrementalTraining[OLD_TRAINING_DATA]) - 1.0}; double errorDecreaseOnNew{ maths::common::CBasicStatistics::mean( logRelativeErrorBeforeIncrementalTraining[NEW_TRAINING_DATA]) / maths::common::CBasicStatistics::mean( logRelativeErrorAfterIncrementalTraining[NEW_TRAINING_DATA]) - 1.0}; LOG_DEBUG(<< "increase on old = " << errorIncreaseOnOld); LOG_DEBUG(<< "decrease on new = " << errorDecreaseOnNew); BOOST_TEST_REQUIRE(errorDecreaseOnNew > 30.0 * errorIncreaseOnOld); } BOOST_AUTO_TEST_CASE(testBinomialLogisticIncrementalForOutOfDomain) { // Test incremental training for binomial logistic objective for values out of the // training data domain. 
test::CRandomNumbers rng; std::size_t rows{750}; std::size_t cols{5}; std::size_t extraTrainingRows{250}; auto probability = [&] { TDoubleVec weights{-0.6, 1.0, -0.8, 1.2}; TDoubleVec noise; rng.generateNormalSamples(0.0, 0.2, rows + extraTrainingRows, noise); return [=](const TRowRef& row) { double x{0.0}; for (std::size_t i = 0; i < cols - 1; ++i) { x += weights[i] * row[i]; } return maths::common::CTools::logisticFunction(x + noise[row.index()]); }; }(); auto target = [&] { TDoubleVec uniform01; rng.generateUniformSamples(0.0, 1.0, rows + extraTrainingRows, uniform01); return [=](const TRowRef& row) { return uniform01[row.index()] < probability(row) ? 1.0 : 0.0; }; }(); auto frame = core::makeMainStorageDataFrame(cols).first; TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(-3.0, -1.0, rows, x[i]); } fillDataFrame(rows, 0, cols, {false, false, false, false, true}, x, TDoubleVec(rows, 0.0), target, *frame); auto classifier = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CBinomialLogisticLoss>()) .buildForTrain(*frame, cols - 1); classifier->train(); frame->resizeColumns(1, cols); auto newFrame = core::makeMainStorageDataFrame(cols).first; fillDataFrame(rows, 0, cols, {false, false, false, false, true}, x, TDoubleVec(rows, 0.0), target, *newFrame); for (std::size_t i = 0; i < cols - 1; ++i) { rng.generateUniformSamples(1.0, 3.0, extraTrainingRows, x[i]); } fillDataFrame(extraTrainingRows, 0, cols, {false, false, false, false, true}, x, TDoubleVec(extraTrainingRows, 0.0), target, *frame); fillDataFrame(extraTrainingRows, 0, cols, {false, false, false, false, true}, x, TDoubleVec(extraTrainingRows, 0.0), target, *newFrame); classifier->predict(); core::CPackedBitVector oldTrainingRowMask{rows, true}; oldTrainingRowMask.extend(false, extraTrainingRows); core::CPackedBitVector newTrainingRowMask{~oldTrainingRowMask}; TMeanAccumulator 
logRelativeErrorBeforeIncrementalTraining[2]; TMeanAccumulator logRelativeErrorAfterIncrementalTraining[2]; auto addToRelativeError = [&](const TRowRef& row, TMeanAccumulator& logRelativeError) { double expectedProbability{probability(row)}; double actualProbability{classifier->prediction(row)[1]}; logRelativeError.add(std::log(std::max(actualProbability, expectedProbability) / std::min(actualProbability, expectedProbability))); }; frame->resizeColumns(1, cols + 1); classifier->predict(); // Get the average error on the old and new training data from the old model. frame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { addToRelativeError( *row, logRelativeErrorBeforeIncrementalTraining[OLD_TRAINING_DATA]); } }, &oldTrainingRowMask); frame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { addToRelativeError( *row, logRelativeErrorBeforeIncrementalTraining[NEW_TRAINING_DATA]); } }, &newTrainingRowMask); double alpha{classifier->hyperparameters().depthPenaltyMultiplier().value()}; double gamma{classifier->hyperparameters().treeSizePenaltyMultiplier().value()}; double lambda{classifier->hyperparameters().leafWeightPenaltyMultiplier().value()}; classifier = maths::analytics::CBoostedTreeFactory::constructFromModel(std::move(classifier)) .newTrainingRowMask(newTrainingRowMask) .depthPenaltyMultiplier({0.5 * alpha, 2.0 * alpha}) .treeSizePenaltyMultiplier({0.5 * gamma, 2.0 * gamma}) .leafWeightPenaltyMultiplier({0.5 * lambda, 2.0 * lambda}) .buildForTrainIncremental(*newFrame, cols - 1); classifier->trainIncremental(); classifier->predict(); // Get the average error on the old and new training data from the new model. 
newFrame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { addToRelativeError( *row, logRelativeErrorAfterIncrementalTraining[OLD_TRAINING_DATA]); } }, &oldTrainingRowMask); newFrame->readRows(1, 0, frame->numberRows(), [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { addToRelativeError( *row, logRelativeErrorAfterIncrementalTraining[NEW_TRAINING_DATA]); } }, &newTrainingRowMask); // We should see little change in the prediction errors on the old data // set which doesn't overlap the new data, but a large improvement on // the new data for constant extrapolation performs poorly. double errorIncreaseOnOld{ maths::common::CBasicStatistics::mean( logRelativeErrorAfterIncrementalTraining[OLD_TRAINING_DATA]) / maths::common::CBasicStatistics::mean( logRelativeErrorBeforeIncrementalTraining[OLD_TRAINING_DATA]) - 1.0}; double errorDecreaseOnNew{ maths::common::CBasicStatistics::mean( logRelativeErrorBeforeIncrementalTraining[NEW_TRAINING_DATA]) / maths::common::CBasicStatistics::mean( logRelativeErrorAfterIncrementalTraining[NEW_TRAINING_DATA]) - 1.0}; LOG_DEBUG(<< "increase on old = " << errorIncreaseOnOld); LOG_DEBUG(<< "decrease on new = " << errorDecreaseOnNew); BOOST_TEST_REQUIRE(errorDecreaseOnNew > 14.0 * errorIncreaseOnOld); } BOOST_AUTO_TEST_CASE(testImbalancedClasses) { // Test we get similar per class precision and recall with unbalanced training // data when using the calculated decision threshold to assign to class one. 
test::CRandomNumbers rng; std::size_t trainRows{2000}; std::size_t classes[]{1, 0, 1, 0}; std::size_t classesRowCounts[]{1600, 400, 100, 100}; std::size_t cols{3}; TDoubleVecVec x; TDoubleVec means{0.0, 3.0}; TDoubleVec variances{6.0, 6.0}; for (std::size_t i = 0; i < std::size(classes); ++i) { TDoubleVecVec xi; double mean{means[classes[i]]}; double variance{variances[classes[i]]}; rng.generateMultivariateNormalSamples( {mean, mean}, {{variance, 0.0}, {0.0, variance}}, classesRowCounts[i], xi); x.insert(x.end(), xi.begin(), xi.end()); } auto frame = core::makeMainStorageDataFrame(cols).first; frame->categoricalColumns(TBoolVec{false, false, true}); for (std::size_t i = 0, index = 0; i < std::size(classes); ++i) { for (std::size_t j = 0; j < classesRowCounts[i]; ++j, ++index) { frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) { for (std::size_t k = 0; k < cols - 1; ++k, ++column) { *column = x[index][k]; } *column = index < trainRows ? static_cast<double>(i) : core::CDataFrame::valueOfMissing(); }); } } frame->finishWritingRows(); auto classifier = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CBinomialLogisticLoss>()) .buildForTrain(*frame, cols - 1); classifier->train(); classifier->predict(); TDoubleVec precisions; TDoubleVec recalls; { TDoubleVec truePositives(2, 0.0); TDoubleVec trueNegatives(2, 0.0); TDoubleVec falsePositives(2, 0.0); TDoubleVec falseNegatives(2, 0.0); frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { auto scores = classifier->adjustedPrediction(*row); std::size_t prediction(scores[1] < scores[0] ? 0 : 1); if (row->index() >= trainRows && row->index() < trainRows + classesRowCounts[2]) { // Actual is zero. (prediction == 0 ? truePositives[0] : falseNegatives[0]) += 1.0; (prediction == 0 ? 
trueNegatives[1] : falsePositives[1]) += 1.0; } else if (row->index() >= trainRows + classesRowCounts[2]) { // Actual is one. (prediction == 1 ? truePositives[1] : falseNegatives[1]) += 1.0; (prediction == 1 ? trueNegatives[0] : falsePositives[0]) += 1.0; } } }); precisions.push_back(truePositives[0] / (truePositives[0] + falsePositives[0])); precisions.push_back(truePositives[1] / (truePositives[1] + falsePositives[1])); recalls.push_back(truePositives[0] / (truePositives[0] + falseNegatives[0])); recalls.push_back(truePositives[1] / (truePositives[1] + falseNegatives[1])); } LOG_DEBUG(<< "precisions = " << precisions); LOG_DEBUG(<< "recalls = " << recalls); BOOST_TEST_REQUIRE(std::fabs(precisions[0] - precisions[1]) < 0.1); BOOST_TEST_REQUIRE(std::fabs(recalls[0] - recalls[1]) < 0.16); } BOOST_AUTO_TEST_CASE(testClassificationWeightsOverride) { // Test that the overrides are reflected in the classification weights used. test::CRandomNumbers rng; TSizeVec classes{1, 0}; TSizeVec classesRowCounts{50, 50}; std::size_t cols{3}; TDoubleVecVec x; TDoubleVec means{0.0, 3.0}; TDoubleVec variances{6.0, 6.0}; for (std::size_t i = 0; i < classes.size(); ++i) { TDoubleVecVec xi; double mean{means[classes[i]]}; double variance{variances[classes[i]]}; rng.generateMultivariateNormalSamples( {mean, mean}, {{variance, 0.0}, {0.0, variance}}, classesRowCounts[i], xi); x.insert(x.end(), xi.begin(), xi.end()); } auto frame = core::makeMainStorageDataFrame(cols).first; frame->categoricalColumns(TBoolVec{false, false, true}); TStrVec classValues{"foo", "bar"}; TStrVec row(cols); for (std::size_t i = 0, index = 0; i < classes.size(); ++i) { for (std::size_t j = 0; j < classesRowCounts[i]; ++j, ++index) { row[0] = std::to_string(x[index][0]); row[1] = std::to_string(x[index][1]); row[2] = classValues[i]; frame->parseAndWriteRow({row, 0, cols}); } } frame->finishWritingRows(); auto classifier = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, 
std::make_unique<maths::analytics::boosted_tree::CBinomialLogisticLoss>()) .classAssignmentObjective(maths::analytics::CBoostedTree::E_Custom) .classificationWeights({{"foo", 0.8}, {"bar", 0.2}}) .buildForTrain(*frame, cols - 1); classifier->train(); BOOST_TEST_REQUIRE("[0.8, 0.2]", core::CContainerPrinter::print( classifier->classificationWeights())); } BOOST_AUTO_TEST_CASE(testMultinomialLogisticRegression) { // The idea of this test is to create a random linear relationship between // the feature values and the logit, i.e. logit_i = W * x_i for matrix W is // some fixed weight matrix and x_i denoted the i'th feature vector. // // We try to recover this relationship in logistic regression by observing // the actual labels and want to test that we've roughly correctly estimated // the linear function. Because we target the cross-entropy we're effectively // targeting relative error in the estimated probabilities. However, we aren't // interested in accurate characterisation of very small probabilities and so // threshold the KL divergence which is also minimized targeting cross-entropy. 
maths::common::CPRNG::CXorOShiro128Plus rng; test::CRandomNumbers testRng; std::size_t trainRows{1000}; std::size_t rows{1200}; std::size_t cols{4}; int numberClasses{3}; int numberFeatures{static_cast<int>(cols - 1)}; TMeanAccumulator meanKLDivergence; TDoubleVec weights; TDoubleVec noise; TDoubleVec uniform01; for (std::size_t test = 0; test < 3; ++test) { testRng.generateUniformSamples(-2.0, 2.0, numberClasses * numberFeatures, weights); testRng.generateNormalSamples(0.0, 1.0, numberFeatures * rows, noise); testRng.generateUniformSamples(0.0, 1.0, rows, uniform01); auto probability = [&](const TRowRef& row) { TMemoryMappedMatrix W(weights.data(), numberClasses, numberFeatures); TVector x(numberFeatures); TVector n{numberClasses}; for (int i = 0; i < numberFeatures; ++i) { x(i) = row[i]; n(i) = noise[numberFeatures * row.index() + i]; } TVector result{W * x + n}; maths::common::CTools::inplaceSoftmax(result); return result; }; auto target = [&](const TRowRef& row) { TDoubleVec probabilities{probability(row).to<TDoubleVec>()}; return static_cast<double>( maths::common::CSampling::categoricalSample(rng, probabilities)); }; TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { testRng.generateUniformSamples(0.0, 4.0, rows, x[i]); } auto frame = core::makeMainStorageDataFrame(cols).first; fillDataFrame(trainRows, rows - trainRows, cols, {false, false, false, true}, x, TDoubleVec(rows, 0.0), target, *frame); auto classifier = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMultinomialLogisticLoss>(numberClasses)) .buildForTrain(*frame, cols - 1); classifier->train(); classifier->predict(); TMeanAccumulator klDivergence; frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { if (row->index() >= trainRows) { TVector expectedProbability{probability(*row)}; TVector actualProbability{ 
TVector::fromSmallVector(classifier->prediction(*row))}; klDivergence.add( (expectedProbability.array() * (expectedProbability.array() / actualProbability.array()) .log()) .sum()); } } }); LOG_DEBUG(<< "KL divergence = " << maths::common::CBasicStatistics::mean(klDivergence)); BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(klDivergence) < 0.21); meanKLDivergence.add(maths::common::CBasicStatistics::mean(klDivergence)); } LOG_DEBUG(<< "mean KL divergence = " << maths::common::CBasicStatistics::mean(meanKLDivergence)); BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanKLDivergence) < 0.12); } BOOST_AUTO_TEST_CASE(testMultinomialLogisticRegressionForManyClasses, *boost::unit_test::disabled()) { // This is too slow to run as part of regular testing. maths::common::CPRNG::CXorOShiro128Plus rng; test::CRandomNumbers testRng; std::size_t trainRows{1800}; std::size_t rows{2000}; std::size_t cols{20}; int numberClasses{30}; int numberFeatures{static_cast<int>(cols - 1)}; TDoubleVec weights; TDoubleVec noise; testRng.generateUniformSamples(-2.0, 2.0, numberClasses * numberFeatures, weights); testRng.generateNormalSamples(0.0, 0.2, numberFeatures * rows, noise); auto probability = [&](const TRowRef& row) { TMemoryMappedMatrix W(weights.data(), numberClasses, numberFeatures); TVector x(numberFeatures); TVector n{numberClasses}; for (int i = 0; i < numberFeatures; ++i) { x(i) = row[i]; n(i) = noise[numberFeatures * row.index() + i]; } TVector result{W * x + n}; maths::common::CTools::inplaceSoftmax(result); return result; }; auto target = [&](const TRowRef& row) { TDoubleVec probabilities{probability(row).to<TDoubleVec>()}; return static_cast<double>( maths::common::CSampling::categoricalSample(rng, probabilities)); }; TDoubleVecVec x(cols - 1); for (std::size_t i = 0; i < cols - 1; ++i) { testRng.generateUniformSamples(0.0, 4.0, rows, x[i]); } auto frame = core::makeMainStorageDataFrame(cols).first; TBoolVec categoricalColumns(cols, false); 
categoricalColumns[cols - 1] = true; fillDataFrame(trainRows, rows - trainRows, cols, categoricalColumns, x, TDoubleVec(rows, 0.0), target, *frame); auto classifier = maths::analytics::CBoostedTreeFactory::constructFromParameters( 1, std::make_unique<maths::analytics::boosted_tree::CMultinomialLogisticLoss>(numberClasses)) .buildForTrain(*frame, cols - 1); classifier->train(); classifier->predict(); TMeanAccumulator klDivergence; frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) { for (auto row = beginRows; row != endRows; ++row) { if (row->index() >= trainRows) { TVector expectedProbability{probability(*row)}; TVector actualProbability{ TVector::fromSmallVector(classifier->prediction(*row))}; klDivergence.add( (expectedProbability.array() * (expectedProbability.array() / actualProbability.array()).log()) .sum()); } } }); LOG_DEBUG(<< "KL divergence = " << maths::common::CBasicStatistics::mean(klDivergence)); BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(klDivergence) < 0.27); } BOOST_AUTO_TEST_CASE(testEstimateMemory) { // Test estimation of the memory used training a model. Note that we don't // test circuit breaking because that is managed from the api library. 
// NOTE(review): continuation of a memory-estimation test whose
// BOOST_AUTO_TEST_CASE header is above this chunk. For three progressively
// larger test sets it checks that the instrumented memory high water mark of
// each operation (encode, train, train incremental, predict) never exceeds
// the corresponding upfront estimate.
test::CRandomNumbers rng;

std::size_t rows{1000};
std::size_t cols{6};
std::size_t categoricalCols{1};
std::size_t capacity{600};
// Train memory estimates should strictly decrease as the number of training
// rows decreases across iterations.
std::int64_t previousEstimatedMemory{std::numeric_limits<std::int64_t>::max()};

for (std::size_t test = 0; test < 3; ++test) {

    TDoubleVecVec x(cols - 1);
    std::size_t numTestRows{((test + 1) * 100)};
    for (std::size_t i = 0; i < cols - 1; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }

    // The regression target for row i is simply the sum of its feature values.
    auto target = [&](std::size_t i) {
        double result{0.0};
        for (std::size_t j = 0; j < cols - 1; ++j) {
            result += x[j][i];
        }
        return result;
    };

    // Builds a fresh frame: the first categoricalCols columns hold floored
    // (categorical) values and the first numTestRows rows have a missing target.
    auto makeDataFrame = [&] {
        auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
        TBoolVec categoricalColumns(categoricalCols, true);
        categoricalColumns.resize(cols, false);
        frame->categoricalColumns(std::move(categoricalColumns));
        for (std::size_t i = 0; i < rows; ++i) {
            frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
                for (std::size_t j = 0; j < categoricalCols; ++j, ++column) {
                    *column = std::floor(x[j][i]);
                }
                for (std::size_t j = categoricalCols; j < cols - 1; ++j, ++column) {
                    *column = x[j][i];
                }
                *column = i < numTestRows ? core::CDataFrame::valueOfMissing()
                                          : target(i);
            });
        }
        frame->finishWritingRows();
        return frame;
    };

    double percentTrainingRows{1.0 - static_cast<double>(numTestRows) /
                                         static_cast<double>(rows)};
    std::size_t trainRows{static_cast<std::size_t>(
        static_cast<double>(rows) * percentTrainingRows)};
    LOG_DEBUG(<< "percent training rows = " << percentTrainingRows);

    LOG_DEBUG(<< "Encode...");
    std::size_t extraCols{
        maths::analytics::CBoostedTreeFactory::estimateExtraColumnsForEncode()};
    // Estimate = frame storage (with extra working columns) + encoder working memory.
    std::int64_t estimatedMemory(
        core::CDataFrame::estimateMemoryUsage(true, rows, cols + extraCols,
                                              core::CAlignment::E_Aligned16) +
        maths::analytics::CBoostedTreeFactory::constructFromParameters(
            1, std::make_unique<maths::analytics::boosted_tree::CMse>())
            .estimateMemoryUsageForEncode(trainRows, cols, categoricalCols));
    CTestInstrumentation encodeInstrumentation;
    auto frame = makeDataFrame();
    encodeInstrumentation.updateMemoryUsage(frame->memoryUsage());
    auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                          1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                          .analysisInstrumentation(encodeInstrumentation)
                          .buildForEncode(*frame, cols - 1);
    LOG_DEBUG(<< "estimated memory usage = " << estimatedMemory);
    LOG_DEBUG(<< "high water mark = " << encodeInstrumentation.maxMemoryUsage());
    BOOST_TEST_REQUIRE(encodeInstrumentation.maxMemoryUsage() <= estimatedMemory);

    LOG_DEBUG(<< "Train...");
    extraCols = maths::analytics::CBoostedTreeFactory::estimateExtraColumnsForTrain(
        cols, 1 /*dimension predictions*/, 1 /*dimension gradients*/);
    estimatedMemory =
        core::CDataFrame::estimateMemoryUsage(true, rows, cols + extraCols,
                                              core::CAlignment::E_Aligned16) +
        maths::analytics::CBoostedTreeFactory::constructFromParameters(
            1, std::make_unique<maths::analytics::boosted_tree::CMse>())
            .estimateMemoryUsageForTrain(trainRows, cols);
    BOOST_TEST_REQUIRE(previousEstimatedMemory > estimatedMemory);
    previousEstimatedMemory = estimatedMemory;
    CTestInstrumentation trainInstrumentation;
    frame = makeDataFrame();
    trainInstrumentation.updateMemoryUsage(frame->memoryUsage());
    regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                     1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                     .analysisInstrumentation(trainInstrumentation)
                     .buildForTrain(*frame, cols - 1);
    regression->train();
    LOG_DEBUG(<< "estimated memory usage = " << estimatedMemory);
    LOG_DEBUG(<< "high water mark = " << trainInstrumentation.maxMemoryUsage());
    BOOST_TEST_REQUIRE(trainInstrumentation.maxMemoryUsage() < estimatedMemory);

    LOG_DEBUG(<< "Train incremental...");
    extraCols = maths::analytics::CBoostedTreeFactory::estimateExtraColumnsForTrainIncremental(
        cols, 1 /*dimension predictions*/, 1 /*dimension gradients*/);
    // Incremental training also holds the previously trained forest in memory.
    estimatedMemory =
        core::CDataFrame::estimateMemoryUsage(true, rows, cols + extraCols,
                                              core::CAlignment::E_Aligned16) +
        core::memory::dynamicSize(regression->trainedModel()) +
        maths::analytics::CBoostedTreeFactory::constructFromParameters(
            1, std::make_unique<maths::analytics::boosted_tree::CMse>())
            .estimateMemoryUsageForTrainIncremental(rows, cols);
    CTestInstrumentation trainIncrementalInstrumentation;
    auto newFrame = makeDataFrame();
    trainIncrementalInstrumentation.updateMemoryUsage(newFrame->memoryUsage());
    regression = maths::analytics::CBoostedTreeFactory::constructFromModel(
                     std::move(regression))
                     .analysisInstrumentation(trainIncrementalInstrumentation)
                     .buildForTrainIncremental(*newFrame, cols - 1);
    regression->trainIncremental();
    LOG_DEBUG(<< "estimated memory usage = " << estimatedMemory);
    LOG_DEBUG(<< "high water mark = " << trainIncrementalInstrumentation.maxMemoryUsage());
    BOOST_TEST_REQUIRE(trainIncrementalInstrumentation.maxMemoryUsage() <= estimatedMemory);

    LOG_DEBUG(<< "Predict...");
    extraCols = maths::analytics::CBoostedTreeFactory::estimateExtraColumnsForPredict(1);
    estimatedMemory =
        core::CDataFrame::estimateMemoryUsage(true, rows, cols + extraCols,
                                              core::CAlignment::E_Aligned16) +
        core::memory::dynamicSize(&regression->impl()) +
        maths::analytics::CBoostedTreeFactory::constructFromParameters(
            1, std::make_unique<maths::analytics::boosted_tree::CMse>())
            .estimateMemoryUsageForPredict(rows, cols);
    CTestInstrumentation predictInstrumentation;
    newFrame = makeDataFrame();
    predictInstrumentation.updateMemoryUsage(newFrame->memoryUsage());
    regression = maths::analytics::CBoostedTreeFactory::constructFromModel(
                     std::move(regression))
                     .analysisInstrumentation(predictInstrumentation)
                     .buildForPredict(*newFrame, cols - 1);
    LOG_DEBUG(<< "estimated memory usage = " << estimatedMemory);
    LOG_DEBUG(<< "high water mark = " << predictInstrumentation.maxMemoryUsage());
    BOOST_TEST_REQUIRE(predictInstrumentation.maxMemoryUsage() <= estimatedMemory);
}
}

BOOST_AUTO_TEST_CASE(testDeployedMemoryLimiting) {

    // Test we always produce a model which meets our memory constraint.

    test::CRandomNumbers rng;

    double noiseVariance{10.0};
    std::size_t trainRows{800};
    std::size_t testRows{200};
    std::size_t rows{trainRows + testRows};
    std::size_t cols{8};
    std::size_t capacity{250};
    std::size_t maximumDeployedSize{20000};

    // The target is a random affine function of the features; the immediately
    // invoked lambda captures the random coefficients by value.
    auto target = [&] {
        TDoubleVec m;
        TDoubleVec s;
        rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
        rng.generateUniformSamples(-10.0, 10.0, cols - 1, s);
        return [=](const TRowRef& row) {
            double result{0.0};
            for (std::size_t i = 0; i < cols - 1; ++i) {
                result += m[i] + s[i] * row[i];
            }
            return result;
        };
    }();

    TDoubleVecVec x(cols - 1);
    for (std::size_t i = 0; i < cols - 1; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }

    TDoubleVec noise;
    rng.generateNormalSamples(0.0, noiseVariance, rows, noise);

    // Train twice on identical data: once with the deployed size constraint...
    auto frameConstrained = core::makeMainStorageDataFrame(cols, capacity).first;
    fillDataFrame(trainRows, testRows, cols, x, noise, target, *frameConstrained);
    auto regressionConstrained =
        maths::analytics::CBoostedTreeFactory::constructFromParameters(
            1, std::make_unique<maths::analytics::boosted_tree::CMse>())
            .maximumDeployedSize(maximumDeployedSize)
            .numberFolds(2)
            .buildForTrain(*frameConstrained, cols - 1);
    regressionConstrained->train();
    regressionConstrained->predict();

    // ...and once without.
    auto frameUnconstrained = core::makeMainStorageDataFrame(cols, capacity).first;
    fillDataFrame(trainRows, testRows, cols, x, noise, target, *frameUnconstrained);
    auto regressionUnconstrained =
        maths::analytics::CBoostedTreeFactory::constructFromParameters(
            1, std::make_unique<maths::analytics::boosted_tree::CMse>())
            .numberFolds(2)
            .buildForTrain(*frameUnconstrained, cols - 1);
    regressionUnconstrained->train();
    regressionUnconstrained->predict();

    // Sum the deployed size of every node in every tree of each forest.
    std::size_t deployedSizeConstrained{std::accumulate(
        regressionConstrained->trainedModel().begin(),
        regressionConstrained->trainedModel().end(), std::size_t{0},
        [](std::size_t sum, const auto& tree) {
            for (const auto& node : tree) {
                sum += node.deployedSize();
            }
            return sum;
        })};
    std::size_t deployedSizeUnconstrained{std::accumulate(
        regressionUnconstrained->trainedModel().begin(),
        regressionUnconstrained->trainedModel().end(), std::size_t{0},
        [](std::size_t sum, const auto& tree) {
            for (const auto& node : tree) {
                sum += node.deployedSize();
            }
            return sum;
        })};

    auto [modelBiasConstrained, modelRSquaredConstrained] = computeEvaluationMetrics(
        *frameConstrained, trainRows, rows,
        [&](const TRowRef& row) { return regressionConstrained->prediction(row)[0]; },
        target, 0.0);
    auto [modelBiasUnconstrained, modelRSquaredUnconstrained] = computeEvaluationMetrics(
        *frameUnconstrained, trainRows, rows,
        [&](const TRowRef& row) { return regressionUnconstrained->prediction(row)[0]; },
        target, 0.0);

    LOG_DEBUG(<< "deployedSizeConstrained = " << deployedSizeConstrained
              << ", modelBiasConstrained = " << modelBiasConstrained
              << ", modelRSquaredConstrained = " << modelRSquaredConstrained);
    LOG_DEBUG(<< "deployedSizeUnconstrained = " << deployedSizeUnconstrained
              << ", modelBiasUnconstrained = " << modelBiasUnconstrained
              << ", modelRSquaredUnconstrained = " << modelRSquaredUnconstrained);

    // We don't hurt model accuracy by more than 3%.
    BOOST_TEST_REQUIRE(modelRSquaredConstrained > 0.97 * modelRSquaredUnconstrained);
    // We need to constrain vs the optimum for this data and we are within
    // the memory budget.
    BOOST_TEST_REQUIRE(deployedSizeConstrained < maximumDeployedSize);
    BOOST_TEST_REQUIRE(deployedSizeConstrained < deployedSizeUnconstrained);
}

BOOST_AUTO_TEST_CASE(testProgressMonitoring) {

    // Test progress monitoring invariants.

    test::CRandomNumbers rng;

    std::size_t rows{500};
    std::size_t cols{6};
    std::size_t capacity{600};

    TDoubleVecVec x(cols);
    for (std::size_t i = 0; i < cols; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }
    auto target = [](const TRowRef& row) { return row[0] + 3.0 * row[3]; };

    // Run once with the serial executor and once with the parallel executor.
    core::stopDefaultAsyncExecutor();

    TStrVec tests{"serial", "parallel"};

    for (std::size_t threads : {1, 2}) {

        LOG_DEBUG(<< tests[threads == 1 ? 0 : 1]);

        auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
        fillDataFrame(rows, 0, cols, x, TDoubleVec(rows, 0.0), target, *frame);

        CTestInstrumentation instrumentation;
        // NOTE(review): `finished` is written by the worker but never read in
        // this chunk; the worker is joined immediately below.
        std::atomic_bool finished{false};

        std::thread worker{[&]() {
            auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                                  threads, std::make_unique<maths::analytics::boosted_tree::CMse>())
                                  .analysisInstrumentation(instrumentation)
                                  .stopHyperparameterOptimizationEarly(false)
                                  .buildForTrain(*frame, cols - 1);
            regression->train();
            finished.store(true);
        }};

        worker.join();

        // Flush the last task...
        instrumentation.startNewProgressMonitoredTask("");

        for (const auto& task : instrumentation.taskProgress()) {
            LOG_DEBUG(<< "task = " << task.s_Name);
            LOG_DEBUG(<< "monotonic = " << task.s_Monotonic);
            LOG_DEBUG(<< "progress points = " << task.s_TenPercentProgressPoints);
            if (task.s_Name == maths::analytics::CBoostedTreeFactory::FEATURE_SELECTION) {
                // We don't do feature selection (we have enough data to use all of them).
            } else if (task.s_Name ==
                       maths::analytics::CBoostedTreeFactory::COARSE_PARAMETER_SEARCH) {
                // We don't have accurate upfront estimate of the number of steps so
                // we only get progress up to 80% or 90% depending on the compiler and
                // platform. In non-test code we always pass 100% when the task is
                // complete.
                BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.size() >= 9);
                BOOST_TEST_REQUIRE(
                    std::is_sorted(task.s_TenPercentProgressPoints.begin(),
                                   task.s_TenPercentProgressPoints.end()));
                BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.front() == 0);
                BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.back() >= 80);
            } else if (task.s_Name ==
                       maths::analytics::CBoostedTreeFactory::FINE_TUNING_PARAMETERS) {
                BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.size() >= 10);
                BOOST_TEST_REQUIRE(
                    std::is_sorted(task.s_TenPercentProgressPoints.begin(),
                                   task.s_TenPercentProgressPoints.end()));
                BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.front() == 0);
                BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.back() >= 90);
            } else if (task.s_Name == maths::analytics::CBoostedTreeFactory::FINAL_TRAINING) {
                // Progress might be 90% or 100% depending on whether the final
                // progress update registered.
                BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.size() >= 10);
                BOOST_TEST_REQUIRE(
                    std::is_sorted(task.s_TenPercentProgressPoints.begin(),
                                   task.s_TenPercentProgressPoints.end()));
                BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.front() == 0);
                BOOST_TEST_REQUIRE(task.s_TenPercentProgressPoints.back() >= 90);
            }
            BOOST_TEST_REQUIRE(task.s_Monotonic);
        }

        // Re-enable the async executor for the next (parallel) iteration.
        core::startDefaultAsyncExecutor();
    }

    core::stopDefaultAsyncExecutor();
}

BOOST_AUTO_TEST_CASE(testMissingFeatures) {

    // Test censoring, i.e. data missing is correlated with the target variable.

    std::size_t rows{1000};
    std::size_t cols{4};

    test::CRandomNumbers rng;

    auto frame = core::makeMainStorageDataFrame(cols).first;

    frame->categoricalColumns(TBoolVec{false, false, false, false});
    // Bulk rows: regressors above 9.0 are censored (written as missing) but
    // still contribute to the target.
    for (std::size_t i = 0; i < rows - 4; ++i) {
        frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
            TDoubleVec regressors;
            rng.generateUniformSamples(0.0, 10.0, cols - 1, regressors);
            double target{0.0};
            for (auto regressor : regressors) {
                *(column++) = regressor > 9.0 ? core::CDataFrame::valueOfMissing()
                                              : regressor;
                target += regressor;
            }
            *column = target;
        });
    }
    // Four hand-written rows with a missing target to predict on; each has one
    // or two missing regressors.
    frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
        *(column++) = core::CDataFrame::valueOfMissing();
        *(column++) = 2.0;
        *(column++) = 6.0;
        *column = core::CDataFrame::valueOfMissing();
    });
    frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
        *(column++) = 2.0;
        *(column++) = core::CDataFrame::valueOfMissing();
        *(column++) = 6.0;
        *column = core::CDataFrame::valueOfMissing();
    });
    frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
        *(column++) = 2.0;
        *(column++) = 6.0;
        *(column++) = core::CDataFrame::valueOfMissing();
        *column = core::CDataFrame::valueOfMissing();
    });
    frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
        *(column++) = core::CDataFrame::valueOfMissing();
        *(column++) = 3.0;
        *(column++) = core::CDataFrame::valueOfMissing();
        *column = core::CDataFrame::valueOfMissing();
    });
    frame->finishWritingRows();

    auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                          1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                          .buildForTrain(*frame, cols - 1);

    regression->train();
    regression->predict();

    // For each missing value given we censor for regression variables greater
    // than 9.0, target = sum_i{R_i} for regressors R_i and R_i ~ U([0,10]) so
    // we expect a n * E[U([0, 10]) | U([0, 10]) > 9.0] = 9.5 * n contribution
    // to the target for n equal to the number of missing variables.
    TDoubleVec expectedPredictions{17.5, 17.5, 17.5, 22.0};
    TDoubleVec actualPredictions;
    frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) {
        for (auto row = beginRows; row != endRows; ++row) {
            if (maths::analytics::CDataFrameUtils::isMissing((*row)[cols - 1])) {
                actualPredictions.push_back(regression->prediction(*row)[0]);
            }
        }
    });

    BOOST_REQUIRE_EQUAL(expectedPredictions.size(), actualPredictions.size());
    for (std::size_t i = 0; i < expectedPredictions.size(); ++i) {
        BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedPredictions[i], actualPredictions[i], 0.9);
    }
}

BOOST_AUTO_TEST_CASE(testHyperparameterOverrides) {

    // Test hyperparameter overrides are respected.

    test::CRandomNumbers rng;

    std::size_t rows{300};
    std::size_t cols{6};
    std::size_t capacity{600};

    TDoubleVecVec x(cols);
    for (std::size_t i = 0; i < cols; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }
    auto target = [](const TRowRef& row) { return row[0] + 3.0 * row[3]; };

    auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
    fillDataFrame(rows, 0, cols, x, TDoubleVec(rows, 0.0), target, *frame);

    CTestInstrumentation instrumentation;
    {
        auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                              1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                              .analysisInstrumentation(instrumentation)
                              .maximumNumberTrees(10)
                              .treeSizePenaltyMultiplier({0.1})
                              .leafWeightPenaltyMultiplier({0.01})
                              .buildForTrain(*frame, cols - 1);
        regression->train();
        // We use a single leaf to centre the data so end up with *at most* limit + 1 trees.
        // NOTE(review): both of the next two assertions read the same
        // maximumNumberTrees() accessor, so the "<= 11" check is redundant given
        // "== 10" below; presumably the first was meant to check the trained
        // forest size — verify against upstream.
        BOOST_TEST_REQUIRE(regression->hyperparameters().maximumNumberTrees().value() <= 11);
        BOOST_REQUIRE_EQUAL(
            10, regression->hyperparameters().maximumNumberTrees().value());
        BOOST_REQUIRE_EQUAL(
            0.1, regression->hyperparameters().treeSizePenaltyMultiplier().value());
        BOOST_REQUIRE_EQUAL(
            0.01, regression->hyperparameters().leafWeightPenaltyMultiplier().value());
    }
    {
        auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                              1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                              .analysisInstrumentation(instrumentation)
                              .eta({0.2})
                              .softTreeDepthLimit({2.0})
                              .softTreeDepthTolerance({0.1})
                              .buildForTrain(*frame, cols - 1);
        regression->train();
        BOOST_REQUIRE_EQUAL(0.2, regression->hyperparameters().eta().value());
        BOOST_REQUIRE_EQUAL(
            2.0, regression->hyperparameters().softTreeDepthLimit().value());
        BOOST_REQUIRE_EQUAL(
            0.1, regression->hyperparameters().softTreeDepthTolerance().value());
    }
    {
        auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                              1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                              .analysisInstrumentation(instrumentation)
                              .depthPenaltyMultiplier({1.0})
                              .featureBagFraction({0.4})
                              .downsampleFactor({0.6})
                              .buildForTrain(*frame, cols - 1);
        regression->train();
        BOOST_REQUIRE_EQUAL(
            1.0, regression->hyperparameters().depthPenaltyMultiplier().value());
        BOOST_REQUIRE_EQUAL(
            0.4, regression->hyperparameters().featureBagFraction().value());
        BOOST_REQUIRE_EQUAL(
            0.6, regression->hyperparameters().downsampleFactor().value());
    }
    {
        auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                              1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                              .analysisInstrumentation(instrumentation)
                              .depthPenaltyMultiplier({1.0})
                              .softTreeDepthLimit({3.0})
                              .softTreeDepthTolerance({0.1})
                              .featureBagFraction({0.4})
                              .downsampleFactor({0.6})
                              .etaGrowthRatePerTree({1.1})
                              .buildForTrain(*frame, cols - 1);
        regression->train();
        BOOST_REQUIRE_EQUAL(
            1.0, regression->hyperparameters().depthPenaltyMultiplier().value());
        BOOST_REQUIRE_EQUAL(
            3.0, regression->hyperparameters().softTreeDepthLimit().value());
        BOOST_REQUIRE_EQUAL(
            0.1, regression->hyperparameters().softTreeDepthTolerance().value());
        BOOST_REQUIRE_EQUAL(
            0.4, regression->hyperparameters().featureBagFraction().value());
        BOOST_REQUIRE_EQUAL(
            1.1, regression->hyperparameters().etaGrowthRatePerTree().value());
    }
}

BOOST_AUTO_TEST_CASE(testPersistRestore) {

    // Check persist + restore is idempotent.

    std::size_t rows{50};
    std::size_t cols{3};
    std::size_t capacity{50};
    test::CRandomNumbers rng;

    std::stringstream persistOnceState;
    std::stringstream persistTwiceState;

    // Generate completely random data.
    TDoubleVecVec x(cols);
    for (std::size_t i = 0; i < cols; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }

    auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
    for (std::size_t i = 0; i < rows; ++i) {
        frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
            for (std::size_t j = 0; j < cols; ++j, ++column) {
                *column = x[j][i];
            }
        });
    }
    frame->finishWritingRows();

    // Persist.
    {
        auto boostedTree = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                               1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                               .numberFolds(2)
                               .maximumNumberTrees(2)
                               .maximumOptimisationRoundsPerHyperparameter(3)
                               .buildForTrain(*frame, cols - 1);
        boostedTree->train();
        core::CJsonStatePersistInserter inserter(persistOnceState);
        boostedTree->acceptPersistInserter(inserter);
        persistOnceState.flush();
    }

    // Restore.
    auto boostedTree =
        maths::analytics::CBoostedTreeFactory::constructFromString(persistOnceState)
            .restoreFor(*frame, cols - 1);
    {
        // We need to call the inserter destructor to finish writing the state.
        core::CJsonStatePersistInserter inserter(persistTwiceState);
        boostedTree->acceptPersistInserter(inserter);
        persistTwiceState.flush();
    }

    LOG_DEBUG(<< "State " << persistOnceState.str());
    LOG_DEBUG(<< "State after restore " << persistTwiceState.str());
    // Persist -> restore -> persist must reproduce identical state.
    BOOST_REQUIRE_EQUAL(persistOnceState.str(), persistTwiceState.str());
}

BOOST_AUTO_TEST_CASE(testPersistRestoreDuringInitialization) {

    // Grab checkpoints during initialization and check they all produce the
    // same initialized tree after restoring.

    using TSStreamVec = std::vector<std::stringstream>;

    std::size_t rows{50};
    std::size_t cols{3};
    std::size_t capacity{50};
    test::CRandomNumbers rng;

    // Generate completely random data.
    TDoubleVecVec x(cols);
    for (std::size_t i = 0; i < cols; ++i) {
        rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
    }

    auto makeFrame = [&] {
        auto frame = core::makeMainStorageDataFrame(cols, capacity).first;
        for (std::size_t i = 0; i < rows; ++i) {
            frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
                for (std::size_t j = 0; j < cols; ++j, ++column) {
                    *column = x[j][i];
                }
            });
        }
        frame->finishWritingRows();
        return frame;
    };

    TSStreamVec checkpoints;
    // Training-state callback: serializes each checkpoint into its own stream.
    auto writeCheckpoint = [&checkpoints](maths::analytics::CBoostedTree::TPersistFunc persist) {
        std::stringstream state;
        {
            // We need to call the inserter destructor to finish writing the state.
            core::CJsonStatePersistInserter inserter(state);
            persist(inserter);
            state.flush();
        }
        checkpoints.push_back(std::move(state));
    };

    std::ostringstream expectedState;
    {
        auto frame = makeFrame();
        auto boostedTree = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                               1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                               .numberFolds(2)
                               .maximumNumberTrees(2)
                               .maximumOptimisationRoundsPerHyperparameter(3)
                               .trainingStateCallback(writeCheckpoint)
                               .buildForTrain(*frame, cols - 1);
        core::CJsonStatePersistInserter inserter(expectedState);
        boostedTree->acceptPersistInserter(inserter);
        expectedState.flush();
    }

    // Make sure this test fails if there aren't any checkpoints.
    BOOST_TEST_REQUIRE(checkpoints.size() > 0);

    for (auto& checkpoint : checkpoints) {
        auto frame = makeFrame();
        auto boostedTree =
            maths::analytics::CBoostedTreeFactory::constructFromString(checkpoint)
                .restoreFor(*frame, cols - 1);
        std::ostringstream actualState;
        {
            // We need to call the inserter destructor to finish writing the state.
            core::CJsonStatePersistInserter inserter(actualState);
            boostedTree->acceptPersistInserter(inserter);
            actualState.flush();
        }
        BOOST_REQUIRE_EQUAL(expectedState.str(), actualState.str());
    }
}

BOOST_AUTO_TEST_CASE(testRestoreErrorHandling) {

    // Turn fatal errors into exceptions so we can assert on them.
    auto errorHandler = [](std::string error) {
        throw std::runtime_error{error};
    };

    core::CLogger::CScopeSetFatalErrorHandler scope{errorHandler};

    auto stream = boost::make_shared<std::ostringstream>();
    // Redirect logging to a string stream so we can check errors generated.
    BOOST_TEST_REQUIRE(core::CLogger::instance().reconfigure(stream));

    std::size_t cols{3};
    std::size_t capacity{50};

    auto frame = core::makeMainStorageDataFrame(cols, capacity).first;

    std::stringstream errorInBayesianOptimisationState;
    readFileToStream("testfiles/error_bayesian_optimisation_state.json",
                     errorInBayesianOptimisationState);
    errorInBayesianOptimisationState.flush();

    bool throwsExceptions{false};
    try {
        auto boostedTree = maths::analytics::CBoostedTreeFactory::constructFromString(
                               errorInBayesianOptimisationState)
                               .restoreFor(*frame, 2);
    } catch (const std::exception& e) {
        std::cout << "got = " << e.what() << std::endl;
        std::cout << "Logging = " << stream->str() << std::endl;
        throwsExceptions = true;
        core::CRegex re;
        re.init("Input error:.*");
        BOOST_TEST_REQUIRE(re.matches(e.what()));
        BOOST_TEST_REQUIRE(stream->str().find("Failed to restore MAX_BOUNDARY_TAG") !=
                           std::string::npos);
    }
    BOOST_TEST_REQUIRE(throwsExceptions);

    std::stringstream errorInBoostedTreeImplState;
    readFileToStream("testfiles/error_boosted_tree_impl_state.json",
                     errorInBoostedTreeImplState);
    errorInBoostedTreeImplState.flush();

    throwsExceptions = false;
    // NOTE(review): ostringstream::clear() resets the stream's error-state
    // flags only; it does NOT discard the buffered log text, so earlier
    // messages remain visible to the find() checks below — confirm intended.
    stream->clear();
    try {
        auto boostedTree =
            maths::analytics::CBoostedTreeFactory::constructFromString(errorInBoostedTreeImplState)
                .restoreFor(*frame, 2);
    } catch (const std::exception& e) {
        std::cout << "got = " << e.what() << std::endl;
        std::cout << "Logging = " << stream->str() << std::endl;
        throwsExceptions = true;
        core::CRegex re;
        re.init("Input error:.*");
        BOOST_TEST_REQUIRE(re.matches(e.what()));
        BOOST_TEST_REQUIRE(stream->str().find("Failed to restore NUMBER_FOLDS_TAG") !=
                           std::string::npos);
    }
    BOOST_TEST_REQUIRE(throwsExceptions);

    std::stringstream errorInStateVersion;
    readFileToStream("testfiles/error_no_version_state.json", errorInStateVersion);
    errorInStateVersion.flush();

    throwsExceptions = false;
    stream->clear();
    try {
        // NOTE(review): this restores from errorInBoostedTreeImplState again
        // rather than the errorInStateVersion stream read just above —
        // presumably a copy/paste slip; verify against upstream before changing.
        auto boostedTree =
            maths::analytics::CBoostedTreeFactory::constructFromString(errorInBoostedTreeImplState)
                .restoreFor(*frame, 2);
    } catch (const std::exception& e) {
        std::cout << "got = " << e.what() << std::endl;
        std::cout << "Logging = " << stream->str() << std::endl;
        throwsExceptions = true;
        core::CRegex re;
        re.init("Input error:.*");
        BOOST_TEST_REQUIRE(re.matches(e.what()));
        BOOST_TEST_REQUIRE(stream->str().find("unsupported state serialization version.") !=
                           std::string::npos);
    }
    BOOST_TEST_REQUIRE(throwsExceptions);

    ml::core::CLogger::instance().reset();
}

BOOST_AUTO_TEST_CASE(testWorstCaseMemoryCorrection) {
    // Spot-check the corrected (down-scaled) worst-case memory estimates at a
    // range of raw estimates.
    // test for 15mb
    BOOST_REQUIRE_CLOSE(CBoostedTreeImplForTest::correctedMemoryUsage(15.0 * BYTES_IN_MB) / BYTES_IN_MB,
                        15.0, 1.0);
    // test for 50mb
    BOOST_REQUIRE_CLOSE(CBoostedTreeImplForTest::correctedMemoryUsage(50.0 * BYTES_IN_MB) / BYTES_IN_MB,
                        24.76, 1.0);
    // test for 300mb
    BOOST_REQUIRE_CLOSE(CBoostedTreeImplForTest::correctedMemoryUsage(300.0 * BYTES_IN_MB) / BYTES_IN_MB,
                        64.4, 1.0);
    // test for 1024mb
    BOOST_REQUIRE_CLOSE(CBoostedTreeImplForTest::correctedMemoryUsage(1024.0 * BYTES_IN_MB) / BYTES_IN_MB,
                        179.2, 1.0);
    // test for 18000mb
    BOOST_REQUIRE_CLOSE(CBoostedTreeImplForTest::correctedMemoryUsage(18000.0 * BYTES_IN_MB) / BYTES_IN_MB,
                        1355.75, 1.0);

    // test for monotonicity
    std::size_t numberSamples{1000};
    TDoubleVec lhs;
    lhs.reserve(numberSamples);
    TDoubleVec rhs;
    rhs.reserve(numberSamples);
    test::CRandomNumbers rng;
    rng.generateUniformSamples(0.0, 20000.0, numberSamples, lhs);
    rng.generateUniformSamples(0.0, 20000.0, numberSamples, rhs);
    for (std::size_t i = 0; i < numberSamples; ++i) {
        // Order must be preserved by the correction, i.e. it is monotonic.
        BOOST_TEST_REQUIRE((lhs[i] <= rhs[i]) ==
                           (CBoostedTreeImplForTest::correctedMemoryUsage(lhs[i]) <=
                            CBoostedTreeImplForTest::correctedMemoryUsage(rhs[i])));
    }
}

BOOST_AUTO_TEST_CASE(testEncodingSeparately) {

    // Test that we can do encoding separately from training.

    TDoubleVec categoryValue{-15.0, 20.0, 0.0};
    // Target mixes a per-category offset with a linear function of the two
    // metric features.
    auto target = [&](const TDoubleVecVec& features, std::size_t row) {
        return categoryValue[static_cast<std::size_t>(std::min(features[0][row], 2.0))] +
               2.8 * features[1][row] - 5.3 * features[2][row];
    };

    test::CRandomNumbers rng;

    std::size_t rows{300};
    std::size_t cols{4};
    double numberCategories{5.0};
    std::uint64_t seed{1000};

    TDoubleVecVec features(cols - 1);
    rng.generateUniformSamples(0.0, numberCategories, rows, features[0]);
    rng.generateNormalSamples(0.0, 4.0, rows, features[1]);
    rng.generateNormalSamples(2.0, 2.0, rows, features[2]);

    auto frame = core::makeMainStorageDataFrame(cols, 2 * rows).first;
    frame->categoricalColumns(TBoolVec{true, false, false, false});
    for (std::size_t i = 0; i < rows; ++i) {
        frame->writeRow([&](core::CDataFrame::TFloatVecItr column, std::int32_t&) {
            *(column++) = std::floor(features[0][i]);
            for (std::size_t j = 1; j + 1 < cols; ++j, ++column) {
                *column = features[j][i];
            }
            *column = target(features, i);
        });
    }
    frame->finishWritingRows();

    // Encode and persist
    std::stringstream persistState;
    {
        auto boostedTree = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                               1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                               .seed(seed)
                               .buildForEncode(*frame, cols - 1);
        core::CJsonStatePersistInserter inserter(persistState);
        boostedTree->acceptPersistInserter(inserter);
        persistState.flush();
    }

    // Restore from encoded state.
    auto encodedFirstModel =
        maths::analytics::CBoostedTreeFactory::constructFromString(persistState)
            .restoreFor(*frame, cols - 1);
    encodedFirstModel->train();
    encodedFirstModel->predict();

    TDoubleVec encodedFirstPredictions;
    frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) {
        for (auto row = beginRows; row != endRows; ++row) {
            encodedFirstPredictions.push_back(encodedFirstModel->prediction(*row)[0]);
        }
    });

    // Resize data frame back to the original column number
    frame->resizeColumns(1, cols);

    auto trainedFromScratchModel =
        maths::analytics::CBoostedTreeFactory::constructFromParameters(
            1, std::make_unique<maths::analytics::boosted_tree::CMse>())
            .seed(seed)
            .buildForTrain(*frame, cols - 1);
    trainedFromScratchModel->train();
    trainedFromScratchModel->predict();

    TDoubleVec trainedFromScratchPredictions;
    frame->readRows(1, [&](const TRowItr& beginRows, const TRowItr& endRows) {
        for (auto row = beginRows; row != endRows; ++row) {
            trainedFromScratchPredictions.push_back(
                trainedFromScratchModel->prediction(*row)[0]);
        }
    });

    // With the same seed the two routes should produce (nearly) identical models.
    BOOST_REQUIRE_EQUAL(encodedFirstPredictions.size(),
                        trainedFromScratchPredictions.size());
    for (std::size_t i = 0; i < encodedFirstPredictions.size(); ++i) {
        BOOST_REQUIRE_CLOSE_ABSOLUTE(encodedFirstPredictions[i],
                                     trainedFromScratchPredictions[i], 0.01);
    }
}

BOOST_AUTO_TEST_CASE(testStopAfterCoarseParameterTuning) {

    // Check that we stop without fine tuning if we can make little progress
    // on the optimisation objective.

    test::CRandomNumbers rng;

    std::size_t rows{2500};
    std::size_t cols{4};
    std::size_t numberHoldoutRows{1500};

    // Returns whether the fine tune search is still outstanding after build.
    // NOTE(review): only buildForTrain is called (train() is not); presumably
    // the decision to skip fine tuning is made during build — verify.
    auto verify = [&](double noiseVariance) {
        auto target = [&] {
            TDoubleVec m;
            TDoubleVec s;
            rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
            rng.generateUniformSamples(-10.0, 10.0, cols - 1, s);
            return [=](const TRowRef& row) {
                double result{0.0};
                for (std::size_t i = 0; i < cols - 1; ++i) {
                    result += m[i] + s[i] * row[i];
                }
                return result;
            };
        }();

        TDoubleVecVec x(cols - 1);
        for (std::size_t i = 0; i < cols - 1; ++i) {
            rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
        }

        TDoubleVec noise;
        rng.generateNormalSamples(0.0, noiseVariance, rows, noise);

        auto frame = core::makeMainStorageDataFrame(cols, rows).first;
        fillDataFrame(rows, 0, cols, x, noise, target, *frame);
        auto regression = maths::analytics::CBoostedTreeFactory::constructFromParameters(
                              1, std::make_unique<maths::analytics::boosted_tree::CMse>())
                              .numberHoldoutRows(numberHoldoutRows)
                              .buildForTrain(*frame, cols - 1);

        return regression->hyperparameters().fineTuneSearchNotFinished();
    };

    BOOST_REQUIRE_EQUAL(verify(0.0), true);
    BOOST_REQUIRE_EQUAL(verify(1000.0), false);
}

BOOST_AUTO_TEST_CASE(testEarlyStoppingAccuracy) {

    // Test we don't hurt accuracy too much if we stop early.

    test::CRandomNumbers rng;

    std::size_t rows{1000};
    std::size_t cols{3};
    double noiseVariance{100.0};
    std::size_t numberHoldoutRows{200};

    // Trains on identical data (rng reseeded) with/without early stopping and
    // returns (bias, R^2) on the full frame.
    auto computeMetrics = [&](bool earlyStoppingEnabled) -> std::pair<double, double> {
        rng.seed(100);
        auto target = [&] {
            TDoubleVec m;
            TDoubleVec s;
            rng.generateUniformSamples(0.0, 10.0, cols - 1, m);
            rng.generateUniformSamples(-10.0, 10.0, cols - 1, s);
            return [=](const TRowRef& row) {
                double result{0.0};
                for (std::size_t i = 0; i < cols - 1; ++i) {
                    result += m[i] + s[i] * row[i];
                }
                return result;
            };
        }();

        TDoubleVecVec x(cols - 1);
        for (std::size_t i = 0; i < cols - 1; ++i) {
            rng.generateUniformSamples(0.0, 10.0, rows, x[i]);
        }

        TDoubleVec noise;
        rng.generateNormalSamples(0.0, noiseVariance, rows, noise);

        auto frame = core::makeMainStorageDataFrame(cols, rows).first;
        fillDataFrame(rows, 0, cols, x, noise, target, *frame);
        auto factory = maths::analytics::CBoostedTreeFactory::constructFromParameters(
            1, std::make_unique<maths::analytics::boosted_tree::CMse>());
        factory.numberHoldoutRows(numberHoldoutRows)
            .stopHyperparameterOptimizationEarly(earlyStoppingEnabled);
        auto regression = factory.buildForTrain(*frame, cols - 1);

        regression->train();
        regression->predict();

        // Make sure early stopping was triggered if it was enabled.
        BOOST_REQUIRE_EQUAL(regression->hyperparameters().optimisationMakingNoProgress(),
                            earlyStoppingEnabled);

        double bias;
        double rSquared;
        std::tie(bias, rSquared) = computeEvaluationMetrics(
            *frame, 0, rows,
            [&](const TRowRef& row) { return regression->prediction(row)[0]; },
            target, noiseVariance / static_cast<double>(rows));

        return {bias, rSquared};
    };

    double biasStopEarly;
    double rSquaredStopEarly;
    double biasNotStopEarly;
    double rSquaredNotStopEarly;
    std::tie(biasStopEarly, rSquaredStopEarly) = computeMetrics(true);
    std::tie(biasNotStopEarly, rSquaredNotStopEarly) = computeMetrics(false);

    LOG_DEBUG(<< "biasStopEarly = " << biasStopEarly
              << " rSquaredStopEarly = " << rSquaredStopEarly);
    LOG_DEBUG(<< "biasNotStopEarly = " << biasNotStopEarly
              << " rSquaredNotStopEarly = " << rSquaredNotStopEarly);

    // Both variants should be unbiased to within the standard error of the noise.
    BOOST_REQUIRE_CLOSE_ABSOLUTE(
        0.0, biasStopEarly,
        std::sqrt(noiseVariance / static_cast<double>(rows - numberHoldoutRows)));
    BOOST_REQUIRE_CLOSE_ABSOLUTE(
        0.0, biasNotStopEarly,
        std::sqrt(noiseVariance / static_cast<double>(rows - numberHoldoutRows)));
    BOOST_TEST_REQUIRE(rSquaredNotStopEarly > 0.98);
    BOOST_TEST_REQUIRE(rSquaredStopEarly > 0.92);
    BOOST_REQUIRE_CLOSE(rSquaredNotStopEarly, rSquaredStopEarly, 10);
}

BOOST_AUTO_TEST_SUITE_END()