lib/maths/common/unittest/CLogNormalMeanPrecConjugateTest.cc

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the following additional limitation. Functionality enabled by the
 * files subject to the Elastic License 2.0 may only be used in production when
 * invoked by an Elasticsearch process with a license key installed that permits
 * use of machine learning features. You may not use this file except in
 * compliance with the Elastic License 2.0 and the foregoing additional
 * limitation.
 */

#include <core/CJsonStatePersistInserter.h>
#include <core/CJsonStateRestoreTraverser.h>
#include <core/CLogger.h>

#include <maths/common/CBasicStatistics.h>
#include <maths/common/CLogNormalMeanPrecConjugate.h>
#include <maths/common/CRestoreParams.h>
#include <maths/common/CTools.h>

#include <test/BoostTestCloseAbsolute.h>
#include <test/CRandomNumbers.h>

#include "TestUtils.h"

#include <boost/math/constants/constants.hpp>
#include <boost/math/distributions/lognormal.hpp>
#include <boost/math/distributions/normal.hpp>
#include <boost/test/unit_test.hpp>

#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include <iterator>
#include <limits>

BOOST_AUTO_TEST_SUITE(CLogNormalMeanPrecConjugateTest)

using namespace ml;
using namespace handy_typedefs;

namespace {

using TDoubleVec = std::vector<double>;
using TDoubleDoublePr = std::pair<double, double>;
using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanVarAccumulator = maths::common::CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using CLogNormalMeanPrecConjugate =
    CPriorTestInterfaceMixin<maths::common::CLogNormalMeanPrecConjugate>;
using TWeightFunc = maths_t::TDoubleWeightsAry (*)(double);

CLogNormalMeanPrecConjugate makePrior(maths_t::EDataType dataType = maths_t::E_ContinuousData,
                                      const double& offset = 0.0,
                                      const double& decayRate = 0.0) {
    return CLogNormalMeanPrecConjugate::nonInformativePrior(dataType, offset, decayRate, 0.0);
}
}

BOOST_AUTO_TEST_CASE(testMultipleUpdate) {
    // Test that we get the same result updating once with a vector of 100
    // samples of an R.V. versus updating individually 100 times.

    using TEqual = maths::common::CEqualWithTolerance<double>;

    const maths_t::EDataType dataTypes[] = {maths_t::E_IntegerData, maths_t::E_ContinuousData};
    const double location = std::log(10.0);
    const double squareScale = 3.0;

    test::CRandomNumbers rng;

    TDoubleVec samples;
    rng.generateLogNormalSamples(location, squareScale, 100, samples);

    for (std::size_t i = 0; i < std::size(dataTypes); ++i) {
        CLogNormalMeanPrecConjugate filter1(makePrior(dataTypes[i]));
        CLogNormalMeanPrecConjugate filter2(filter1);

        for (std::size_t j = 0; j < samples.size(); ++j) {
            filter1.addSamples(TDouble1Vec{samples[j]});
        }
        filter2.addSamples(samples);

        TEqual equal(maths::common::CToleranceTypes::E_AbsoluteTolerance, 1e-2);
        BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
    }

    // Test with variance scale.
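    // For reference, a sketch of the moment matching used below: if
    // X ~ LogNormal(m, s^2) then E[X] = exp(m + s^2 / 2) and
    // Var[X] = E[X]^2 * (exp(s^2) - 1). To double the variance while keeping
    // the mean fixed we therefore solve
    //   exp(s'^2) - 1 = 2 * Var[X] / E[X]^2  =>  s'^2 = log(1 + 2 * Var[X] / E[X]^2)
    //   m' = log(E[X]) - s'^2 / 2
    // which is how scaledSquareScale and scaledLocation are computed.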
    double mean = std::exp(location + squareScale / 2.0);
    double variance = mean * mean * (std::exp(squareScale) - 1.0);
    LOG_DEBUG(<< "mean = " << mean << " variance = " << variance);

    double scaledSquareScale = std::log(1.0 + 2.0 * variance / mean / mean);
    double scaledLocation = std::log(mean) - scaledSquareScale / 2.0;
    double scaledMean = std::exp(scaledLocation + scaledSquareScale / 2.0);
    double scaledVariance = scaledMean * scaledMean * (std::exp(scaledSquareScale) - 1.0);
    LOG_DEBUG(<< "scaled mean = " << scaledMean << " scaled variance = " << scaledVariance);

    TDoubleVec scaledSamples;
    rng.generateLogNormalSamples(scaledLocation, scaledSquareScale, 100, scaledSamples);

    for (size_t i = 0; i < std::size(dataTypes); ++i) {
        CLogNormalMeanPrecConjugate filter1(makePrior(dataTypes[i]));
        filter1.addSamples(samples);
        CLogNormalMeanPrecConjugate filter2(filter1);

        maths_t::TDoubleWeightsAry1Vec weights;
        weights.resize(samples.size(), maths_t::countVarianceScaleWeight(2.0));
        for (std::size_t j = 0; j < scaledSamples.size(); ++j) {
            filter1.addSamples({scaledSamples[j]}, {weights[j]});
        }
        filter2.addSamples(scaledSamples, weights);

        LOG_DEBUG(<< filter1.print() << "\nvs" << filter2.print());
        TEqual equal(maths::common::CToleranceTypes::E_RelativeTolerance, 0.015);
        BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
    }

    // Test the count weight is equivalent to adding repeated samples.

    for (size_t i = 0; i < std::size(dataTypes); ++i) {
        CLogNormalMeanPrecConjugate filter1(makePrior(dataTypes[i]));
        CLogNormalMeanPrecConjugate filter2(filter1);

        double x = 3.0;
        std::size_t count = 10;
        for (std::size_t j = 0; j < count; ++j) {
            filter1.addSamples(TDouble1Vec{x});
        }
        filter2.addSamples({x}, {maths_t::countWeight(static_cast<double>(count))});

        LOG_DEBUG(<< filter1.print() << "\nvs" << filter2.print());
        TEqual equal(maths::common::CToleranceTypes::E_AbsoluteTolerance, 1e-3);
        BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
    }
}

BOOST_AUTO_TEST_CASE(testPropagation) {
    // Test that propagation doesn't affect the expected values
    // of likelihood mean and precision.

    const double eps = 1e-12;

    test::CRandomNumbers rng;

    TDoubleVec samples;
    rng.generateLogNormalSamples(1.0, 0.3, 500, samples);

    CLogNormalMeanPrecConjugate filter(makePrior(maths_t::E_ContinuousData, 0.1));

    for (std::size_t i = 0; i < samples.size(); ++i) {
        filter.addSamples(TDouble1Vec(1, static_cast<double>(samples[i])));
    }

    double mean = filter.normalMean();
    double precision = filter.normalPrecision();

    filter.propagateForwardsByTime(5.0);

    double propagatedMean = filter.normalMean();
    double propagatedPrecision = filter.normalPrecision();

    LOG_DEBUG(<< "mean = " << mean << ", precision = " << precision
              << ", propagatedMean = " << propagatedMean
              << ", propagatedPrecision = " << propagatedPrecision);

    BOOST_REQUIRE_CLOSE_ABSOLUTE(mean, propagatedMean, eps);
    BOOST_REQUIRE_CLOSE_ABSOLUTE(precision, propagatedPrecision, eps);
}

BOOST_AUTO_TEST_CASE(testMeanEstimation) {
    // We are going to test that we correctly estimate the distribution
    // for the mean of the exponentiated Gaussian of a log-normal process
    // by checking that the true mean lies in various confidence intervals
    // the correct percentage of the time.
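    // For example, with a two-sided 90% confidence interval the true location
    // should fall outside the interval in roughly 10% of the nTests trials, so
    // the error counts accumulated below should come out close to
    // (100 - testIntervals[j]) percent, or below it when the decay rate is
    // non-zero, in which case the intervals are only an upper bound.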
    const double decayRates[] = {0.0, 0.001, 0.01};

    const unsigned int nTests = 500;
    const double testIntervals[] = {50.0, 60.0, 70.0, 80.0, 85.0, 90.0, 95.0, 99.0};

    for (size_t i = 0; i < std::size(decayRates); ++i) {
        test::CRandomNumbers rng;

        unsigned int errors[] = {0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u};

        for (unsigned int test = 0; test < nTests; ++test) {
            double location = std::log(0.5 * (test + 1));
            double squareScale = 4.0;

            TDoubleVec samples;
            rng.generateLogNormalSamples(location, squareScale, 500, samples);

            CLogNormalMeanPrecConjugate filter(
                makePrior(maths_t::E_ContinuousData, 0.0, decayRates[i]));

            for (std::size_t j = 0; j < samples.size(); ++j) {
                filter.addSamples(TDouble1Vec(1, samples[j]));
                filter.propagateForwardsByTime(1.0);
            }

            for (size_t j = 0; j < std::size(testIntervals); ++j) {
                TDoubleDoublePr confidenceInterval =
                    filter.confidenceIntervalNormalMean(testIntervals[j]);
                if (location < confidenceInterval.first ||
                    location > confidenceInterval.second) {
                    ++errors[j];
                }
            }
        }

        for (size_t j = 0; j < std::size(testIntervals); ++j) {
            double interval = 100.0 * errors[j] / static_cast<double>(nTests);

            LOG_TRACE(<< "interval = " << interval
                      << ", expectedInterval = " << (100.0 - testIntervals[j]));

            // If the decay rate is zero the intervals should be accurate.
            // Otherwise, they should be an upper bound.
            if (decayRates[i] == 0.0) {
                BOOST_REQUIRE_CLOSE_ABSOLUTE(interval, (100.0 - testIntervals[j]), 4.0);
            } else {
                BOOST_TEST_REQUIRE(interval <= (100.0 - testIntervals[j]));
            }
        }
    }
}

BOOST_AUTO_TEST_CASE(testPrecisionEstimation) {
    // We are going to test that we correctly estimate a distribution for
    // the precision of the exponentiated Gaussian of a log-normal process by
    // checking that the true precision lies in various confidence intervals
    // the correct percentage of the time.

    const double decayRates[] = {0.0, 0.001, 0.01};

    const unsigned int nTests = 500;
    const double testIntervals[] = {50.0, 60.0, 70.0, 80.0, 85.0, 90.0, 95.0, 99.0};

    for (size_t i = 0; i < std::size(decayRates); ++i) {
        test::CRandomNumbers rng;

        unsigned int errors[] = {0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u};

        for (unsigned int test = 0; test < nTests; ++test) {
            double location = 1.0;
            double squareScale = 0.002 * static_cast<double>(test + 1);
            double precision = 1 / squareScale;

            TDoubleVec samples;
            rng.generateLogNormalSamples(location, squareScale, 500, samples);

            CLogNormalMeanPrecConjugate filter(
                makePrior(maths_t::E_ContinuousData, 0.0, decayRates[i]));

            for (std::size_t j = 0; j < samples.size(); ++j) {
                filter.addSamples(TDouble1Vec(1, samples[j]));
                filter.propagateForwardsByTime(1.0);
            }

            for (size_t j = 0; j < std::size(testIntervals); ++j) {
                TDoubleDoublePr confidenceInterval =
                    filter.confidenceIntervalNormalPrecision(testIntervals[j]);
                if (precision < confidenceInterval.first ||
                    precision > confidenceInterval.second) {
                    ++errors[j];
                }
            }
        }

        for (size_t j = 0; j < std::size(testIntervals); ++j) {
            double interval = 100.0 * errors[j] / static_cast<double>(nTests);

            LOG_TRACE(<< "interval = " << interval
                      << ", expectedInterval = " << (100.0 - testIntervals[j]));

            // If the decay rate is zero the intervals should be accurate.
            // Otherwise, they should be an upper bound.
            if (decayRates[i] == 0.0) {
                BOOST_REQUIRE_CLOSE_ABSOLUTE(interval, (100.0 - testIntervals[j]), 4.0);
            } else {
                BOOST_TEST_REQUIRE(interval <= (100.0 - testIntervals[j]));
            }
        }
    }
}

BOOST_AUTO_TEST_CASE(testMarginalLikelihood) {
    // Check that the c.d.f. <= 1 at extreme.
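    // Note that requiring the lower and upper bounds of -log(c.d.f.) to be
    // non-negative at a very large sample value is the same as requiring that
    // the c.d.f. itself does not exceed one there, since -log(F) >= 0 iff F <= 1.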
    maths_t::EDataType dataTypes[] = {maths_t::E_ContinuousData, maths_t::E_IntegerData};
    for (std::size_t t = 0; t < std::size(dataTypes); ++t) {
        CLogNormalMeanPrecConjugate filter(makePrior(dataTypes[t]));

        const double location = 1.0;
        const double squareScale = 1.0;

        test::CRandomNumbers rng;

        TDoubleVec samples;
        rng.generateLogNormalSamples(location, squareScale, 200, samples);
        filter.addSamples(samples);

        TWeightFunc weightsFuncs[]{static_cast<TWeightFunc>(maths_t::countWeight),
                                   static_cast<TWeightFunc>(maths_t::outlierWeight)};
        double weights[]{0.1, 1.0, 10.0};

        for (std::size_t i = 0; i < std::size(weightsFuncs); ++i) {
            for (std::size_t j = 0; j < std::size(weights); ++j) {
                double lb, ub;
                filter.minusLogJointCdf({10000.0}, {weightsFuncs[i](weights[j])}, lb, ub);
                LOG_DEBUG(<< "-log(c.d.f) = " << (lb + ub) / 2.0);
                BOOST_TEST_REQUIRE(lb >= 0.0);
                BOOST_TEST_REQUIRE(ub >= 0.0);
            }
        }
    }

    // Check that the marginal likelihood and c.d.f. agree for some
    // test data and that the c.d.f. <= 1 and that the expected value
    // of the log likelihood tends to the differential entropy.

    const double decayRates[] = {0.0, 0.001, 0.01};

    const double location = 3.0;
    const double squareScale = 1.0;

    unsigned int numberSamples[] = {2u, 10u, 500u};
    const double tolerance = 1e-3;

    test::CRandomNumbers rng;

    for (size_t i = 0; i < std::size(numberSamples); ++i) {
        TDoubleVec samples;
        rng.generateLogNormalSamples(location, squareScale, numberSamples[i], samples);

        for (size_t j = 0; j < std::size(decayRates); ++j) {
            CLogNormalMeanPrecConjugate filter(
                makePrior(maths_t::E_ContinuousData, 0.0, decayRates[j]));

            for (std::size_t k = 0; k < samples.size(); ++k) {
                filter.addSamples(TDouble1Vec(1, samples[k]));
                filter.propagateForwardsByTime(1.0);
            }

            // We'll check that the p.d.f. is close to the derivative of the
            // c.d.f. at a range of deltas from the true location.

            const double eps = 1e-4;
            double deltas[] = {-5.0, -4.0, -3.0, -2.0, -1.0, -0.5, 0.0,
                               0.5,  1.0,  2.0,  3.0,  4.0,  5.0};

            for (size_t k = 0; k < std::size(deltas); ++k) {
                double x = std::exp(location + deltas[k] * std::sqrt(squareScale));
                TDouble1Vec sample(1, x);

                LOG_DEBUG(<< "number = " << numberSamples[i] << ", sample = " << sample[0]);

                double logLikelihood = 0.0;
                BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
                                    filter.jointLogMarginalLikelihood(sample, logLikelihood));
                double pdf = std::exp(logLikelihood);

                double lowerBound = 0.0, upperBound = 0.0;
                sample[0] -= eps;
                BOOST_TEST_REQUIRE(filter.minusLogJointCdf(sample, lowerBound, upperBound));
                BOOST_REQUIRE_EQUAL(lowerBound, upperBound);
                double minusLogCdf = (lowerBound + upperBound) / 2.0;
                double cdfAtMinusEps = std::exp(-minusLogCdf);
                BOOST_TEST_REQUIRE(minusLogCdf >= 0.0);

                sample[0] += 2.0 * eps;
                BOOST_TEST_REQUIRE(filter.minusLogJointCdf(sample, lowerBound, upperBound));
                BOOST_REQUIRE_EQUAL(lowerBound, upperBound);
                minusLogCdf = (lowerBound + upperBound) / 2.0;
                double cdfAtPlusEps = std::exp(-minusLogCdf);
                BOOST_TEST_REQUIRE(minusLogCdf >= 0.0);

                double dcdfdx = (cdfAtPlusEps - cdfAtMinusEps) / 2.0 / eps;

                LOG_DEBUG(<< "pdf(x) = " << pdf << ", d(cdf)/dx = " << dcdfdx);

                BOOST_REQUIRE_CLOSE_ABSOLUTE(pdf, dcdfdx, tolerance);
            }
        }
    }

    {
        // Test that the sample mean of the log likelihood tends to the
        // expected log likelihood for a log-normal distribution (uniform
        // law of large numbers), which is just the differential entropy
        // of a log-normal R.V.
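        // For X ~ LogNormal(m, s^2) the differential entropy has the closed
        // form h(X) = m + 0.5 * log(2 * pi * e * s^2); with m = 3 and s^2 = 1
        // that is roughly 4.42, so the average of -log(likelihood) over many
        // samples should converge to about this value (numbers given here are
        // purely illustrative).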
        boost::math::lognormal_distribution<> logNormal(location, std::sqrt(squareScale));
        double expectedDifferentialEntropy =
            maths::common::CTools::differentialEntropy(logNormal);

        CLogNormalMeanPrecConjugate filter(makePrior());

        double differentialEntropy = 0.0;

        TDoubleVec seedSamples;
        rng.generateLogNormalSamples(location, squareScale, 100, seedSamples);
        filter.addSamples(seedSamples);

        TDoubleVec samples;
        rng.generateLogNormalSamples(location, squareScale, 100000, samples);
        for (std::size_t i = 0; i < samples.size(); ++i) {
            TDouble1Vec sample(1, samples[i]);
            filter.addSamples(sample);
            double logLikelihood = 0.0;
            BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
                                filter.jointLogMarginalLikelihood(sample, logLikelihood));
            differentialEntropy -= logLikelihood;
        }

        differentialEntropy /= static_cast<double>(samples.size());

        LOG_DEBUG(<< "differentialEntropy = " << differentialEntropy
                  << ", expectedDifferentialEntropy = " << expectedDifferentialEntropy);

        BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedDifferentialEntropy, differentialEntropy, 5e-3);
    }

    {
        const double varianceScales[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
                                         0.7, 0.8, 0.9, 1.0, 1.2, 1.5,
                                         2.0, 2.5, 3.0, 4.0, 5.0};

        boost::math::lognormal_distribution<> logNormal(location, std::sqrt(squareScale));

        CLogNormalMeanPrecConjugate filter(makePrior());

        TDoubleVec samples;
        rng.generateLogNormalSamples(location, squareScale, 1000, samples);
        filter.addSamples(samples);

        const double percentages[] = {5.0,  10.0, 20.0, 30.0, 40.0,
                                      50.0, 60.0, 70.0, 80.0, 95.0};

        {
            // Test that marginal likelihood confidence intervals are
            // what we'd expect for various variance scales.

            TMeanAccumulator error;
            for (std::size_t i = 0; i < std::size(percentages); ++i) {
                double q1, q2;
                filter.marginalLikelihoodQuantileForTest(50.0 - percentages[i] / 2.0, 1e-3, q1);
                filter.marginalLikelihoodQuantileForTest(50.0 + percentages[i] / 2.0, 1e-3, q2);
                TDoubleDoublePr interval =
                    filter.marginalLikelihoodConfidenceInterval(percentages[i]);
                LOG_TRACE(<< "[q1, q2] = [" << q1 << ", " << q2 << "]"
                          << ", interval = " << interval);
                BOOST_REQUIRE_CLOSE_ABSOLUTE(q1, interval.first, 1e-3);
                BOOST_REQUIRE_CLOSE_ABSOLUTE(q2, interval.second, 1e-3);
                error.add(std::fabs(interval.first - q1));
                error.add(std::fabs(interval.second - q2));
            }
            LOG_DEBUG(<< "error = " << maths::common::CBasicStatistics::mean(error));
            BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(error) < 1e-3);
        }
        {
            TMeanAccumulator totalError;
            for (std::size_t i = 0; i < std::size(varianceScales); ++i) {
                TMeanAccumulator error;
                double vs = varianceScales[i];
                double shift = std::log(1.0 + vs * (std::exp(squareScale) - 1.0)) - squareScale;
                double shiftedLocation = location - 0.5 * shift;
                double shiftedSquareScale = squareScale + shift;
                boost::math::lognormal_distribution<> scaledLogNormal(
                    shiftedLocation, std::sqrt(shiftedSquareScale));
                LOG_DEBUG(<< "*** vs = "
                          << boost::math::variance(scaledLogNormal) /
                                 boost::math::variance(logNormal)
                          << " ***");
                for (std::size_t j = 0; j < std::size(percentages); ++j) {
                    double q1 = boost::math::quantile(
                        scaledLogNormal, (50.0 - percentages[j] / 2.0) / 100.0);
                    double q2 = boost::math::quantile(
                        scaledLogNormal, (50.0 + percentages[j] / 2.0) / 100.0);
                    TDoubleDoublePr interval = filter.marginalLikelihoodConfidenceInterval(
                        percentages[j], maths_t::countVarianceScaleWeight(vs));
                    LOG_TRACE(<< "[q1, q2] = [" << q1 << ", " << q2 << "]"
                              << ", interval = " << interval);
                    BOOST_REQUIRE_CLOSE_ABSOLUTE(q1, interval.first, std::max(0.5, 0.2 * q1));
                    BOOST_REQUIRE_CLOSE_ABSOLUTE(q2, interval.second, 0.1 * q2);
                    error.add(std::fabs(interval.first - q1) / q1);
                    error.add(std::fabs(interval.second - q2) / q2);
                }
                LOG_DEBUG(<< "error = " << maths::common::CBasicStatistics::mean(error));
                BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(error) < 0.07);
                totalError += error;
            }
            LOG_DEBUG(<< "totalError = " << maths::common::CBasicStatistics::mean(totalError));
            BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(totalError) < 0.06);
        }
    }
}

BOOST_AUTO_TEST_CASE(testMarginalLikelihoodMean) {
    // Test that the expectation of the marginal likelihood matches
    // the expected mean of the marginal likelihood.

    const double locations[] = {0.1, 1.0, 3.0};
    const double squareScales[] = {0.1, 1.0, 3.0};

    test::CRandomNumbers rng;

    for (std::size_t i = 0; i < std::size(locations); ++i) {
        for (std::size_t j = 0; j < std::size(squareScales); ++j) {
            LOG_DEBUG(<< "*** location = " << locations[i]
                      << ", squareScale = " << squareScales[j] << " ***");

            CLogNormalMeanPrecConjugate filter(makePrior());

            TDoubleVec seedSamples;
            rng.generateLogNormalSamples(locations[i], squareScales[j], 10, seedSamples);
            filter.addSamples(seedSamples);

            TDoubleVec samples;
            rng.generateLogNormalSamples(locations[i], squareScales[j], 100, samples);

            TMeanAccumulator relativeError;
            for (std::size_t k = 0; k < samples.size(); ++k) {
                filter.addSamples(TDouble1Vec(1, samples[k]));

                double expectedMean;
                BOOST_TEST_REQUIRE(filter.marginalLikelihoodMeanForTest(expectedMean));

                LOG_TRACE(<< "marginalLikelihoodMean = " << filter.marginalLikelihoodMean()
                          << ", expectedMean = " << expectedMean);

                BOOST_REQUIRE_CLOSE_ABSOLUTE(
                    expectedMean, filter.marginalLikelihoodMean(), 0.35 * expectedMean);

                relativeError.add(std::fabs(filter.marginalLikelihoodMean() - expectedMean) /
                                  expectedMean);
            }

            LOG_DEBUG(<< "relativeError = "
                      << maths::common::CBasicStatistics::mean(relativeError));
            BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(relativeError) < 0.07);
        }
    }
}

BOOST_AUTO_TEST_CASE(testMarginalLikelihoodMode) {
    // Test that the marginal likelihood mode is what we'd expect
    // with various variance scales.
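    // A sketch of the scaling used below: to multiply the variance of
    // LogNormal(m, s^2) by vs while keeping its mean fixed, set
    //   shift = log(1 + vs * (exp(s^2) - 1)) - s^2,
    //   m' = m - shift / 2 and s'^2 = s^2 + shift.
    // Then exp(m' + s'^2 / 2) = exp(m + s^2 / 2), i.e. the mean is unchanged,
    // and (exp(s'^2) - 1) / (exp(s^2) - 1) = vs, i.e. the variance is scaled
    // by vs. The expected mode is then that of LogNormal(m', s'^2), which is
    // exp(m' - s'^2).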
    const double locations[] = {0.1, 1.0, 3.0};
    const double squareScales[] = {0.1, 1.0, 3.0};
    const double varianceScales[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
                                     0.7, 0.8, 0.9, 1.0, 1.2, 1.5,
                                     2.0, 2.5, 3.0, 4.0, 5.0};

    test::CRandomNumbers rng;

    for (std::size_t i = 0; i < std::size(locations); ++i) {
        for (std::size_t j = 0; j < std::size(squareScales); ++j) {
            LOG_DEBUG(<< "*** location = " << locations[i]
                      << ", squareScale = " << squareScales[j] << " ***");

            boost::math::lognormal_distribution<> logNormal(locations[i],
                                                            std::sqrt(squareScales[j]));

            CLogNormalMeanPrecConjugate filter(makePrior());

            TDoubleVec samples;
            rng.generateLogNormalSamples(locations[i], squareScales[j], 1000, samples);
            filter.addSamples(samples);

            maths_t::TDoubleWeightsAry weight(maths_t::CUnitWeights::UNIT);
            TMeanAccumulator error;
            for (std::size_t k = 0; k < std::size(varianceScales); ++k) {
                double vs = varianceScales[k];
                maths_t::setCountVarianceScale(vs, weight);
                double shift = std::log(1.0 + vs * (std::exp(squareScales[j]) - 1.0)) -
                               squareScales[j];
                double shiftedLocation = locations[i] - 0.5 * shift;
                double shiftedSquareScale = squareScales[j] + shift;
                boost::math::lognormal_distribution<> scaledLogNormal(
                    shiftedLocation, std::sqrt(shiftedSquareScale));
                double expectedMode = boost::math::mode(scaledLogNormal);
                LOG_DEBUG(<< "dm = "
                          << boost::math::mean(scaledLogNormal) - boost::math::mean(logNormal)
                          << ", vs = "
                          << boost::math::variance(scaledLogNormal) /
                                 boost::math::variance(logNormal)
                          << ", marginalLikelihoodMode = " << filter.marginalLikelihoodMode(weight)
                          << ", expectedMode = " << expectedMode);
                BOOST_REQUIRE_CLOSE_ABSOLUTE(
                    expectedMode, filter.marginalLikelihoodMode(weight), 1.0);
                error.add(std::fabs(filter.marginalLikelihoodMode(weight) - expectedMode));
            }
            LOG_DEBUG(<< "error = " << maths::common::CBasicStatistics::mean(error));
            BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(error) < 0.26);
        }
    }
}

BOOST_AUTO_TEST_CASE(testMarginalLikelihoodVariance) {
    // Test that the expectation of the residual from the mean for
    // the marginal likelihood matches the expected variance of the
    // marginal likelihood.
    const double locations[] = {0.1, 1.0, 3.0};
    const double squareScales[] = {0.1, 1.0, 3.0};

    test::CRandomNumbers rng;

    for (std::size_t i = 0; i < std::size(locations); ++i) {
        for (std::size_t j = 0; j < std::size(squareScales); ++j) {
            LOG_DEBUG(<< "*** location = " << locations[i]
                      << ", squareScale = " << squareScales[j] << " ***");

            CLogNormalMeanPrecConjugate filter(makePrior());

            TDoubleVec seedSamples;
            rng.generateLogNormalSamples(locations[i], squareScales[j], 10, seedSamples);
            filter.addSamples(seedSamples);

            TDoubleVec samples;
            rng.generateLogNormalSamples(locations[i], squareScales[j], 200, samples);

            TMeanAccumulator relativeError;
            for (std::size_t k = 0; k < samples.size(); ++k) {
                filter.addSamples(TDouble1Vec(1, samples[k]));

                double expectedVariance;
                BOOST_TEST_REQUIRE(filter.marginalLikelihoodVarianceForTest(expectedVariance));

                LOG_TRACE(<< "marginalLikelihoodVariance = " << filter.marginalLikelihoodVariance()
                          << ", expectedVariance = " << expectedVariance);

                relativeError.add(std::fabs(filter.marginalLikelihoodVariance() - expectedVariance) /
                                  expectedVariance);
            }

            LOG_DEBUG(<< "relativeError = "
                      << maths::common::CBasicStatistics::mean(relativeError));
            BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(relativeError) < 0.23);
        }
    }
}

BOOST_AUTO_TEST_CASE(testSampleMarginalLikelihood) {
    // We're going to test two properties of the sampling:
    //   1) That the sample mean is equal to the marginal
    //      likelihood mean.
    //   2) That the sample percentiles match the distribution
    //      percentiles.
    // We cross-check these against the implementations of
    // jointLogMarginalLikelihood and minusLogJointCdf, which are used
    // to compute the mean and percentiles.

    const double mean = 0.9;
    const double squareScale = 1.2;

    const double eps = 1e-3;

    test::CRandomNumbers rng;

    TDoubleVec samples;
    rng.generateLogNormalSamples(mean, squareScale, 50, samples);

    CLogNormalMeanPrecConjugate filter(makePrior());

    TDouble1Vec sampled;

    for (std::size_t i = 0; i < 1; ++i) {
        filter.addSamples(TDouble1Vec(1, samples[i]));
        sampled.clear();
        filter.sampleMarginalLikelihood(10, sampled);
        BOOST_REQUIRE_EQUAL(i + 1, sampled.size());
        BOOST_REQUIRE_CLOSE_ABSOLUTE(samples[i], sampled[i], eps);
    }

    TMeanAccumulator meanMeanError;

    std::size_t numberSampled = 20;
    for (std::size_t i = 1; i < samples.size(); ++i) {
        filter.addSamples(TDouble1Vec(1, samples[i]));
        sampled.clear();
        filter.sampleMarginalLikelihood(numberSampled, sampled);

        // The error is due to the approximation of the likelihood
        // function by a moment matched log-normal. This becomes
        // increasingly accurate as the number of updates increases.
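        // For illustration, a moment matched log-normal for a target mean M
        // and variance V (here the marginal likelihood mean and variance)
        // has parameters
        //   s^2 = log(1 + V / M^2) and m = log(M) - s^2 / 2,
        // so the sampled moments checked below should approach M and V as the
        // posterior concentrates.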
        if (i >= 10u) {
            TMeanVarAccumulator sampledMoments;
            sampledMoments = std::for_each(sampled.begin(), sampled.end(), sampledMoments);
            BOOST_REQUIRE_EQUAL(numberSampled, sampled.size());
            LOG_TRACE(<< "expectedMean = " << filter.marginalLikelihoodMean() << ", sampledMean = "
                      << maths::common::CBasicStatistics::mean(sampledMoments));
            LOG_TRACE(<< "expectedVar = " << filter.marginalLikelihoodVariance() << ", sampledVar = "
                      << maths::common::CBasicStatistics::variance(sampledMoments));
            BOOST_REQUIRE_CLOSE_ABSOLUTE(
                filter.marginalLikelihoodMean(),
                maths::common::CBasicStatistics::mean(sampledMoments), 0.8);
            meanMeanError.add(std::fabs(filter.marginalLikelihoodMean() -
                                        maths::common::CBasicStatistics::mean(sampledMoments)));
        }

        std::sort(sampled.begin(), sampled.end());
        for (std::size_t j = 1; j < sampled.size(); ++j) {
            double q = 100.0 * static_cast<double>(j) / static_cast<double>(numberSampled);

            double expectedQuantile;
            BOOST_TEST_REQUIRE(filter.marginalLikelihoodQuantileForTest(q, eps, expectedQuantile));

            LOG_TRACE(<< "quantile = " << q << ", x_quantile = " << expectedQuantile
                      << ", quantile range = [" << sampled[j - 1] << "," << sampled[j] << "]");

            BOOST_TEST_REQUIRE(expectedQuantile >=
                               sampled[j - 1] - 0.2 * std::max(6.0 - static_cast<double>(i), 0.0));
            BOOST_TEST_REQUIRE(expectedQuantile <=
                               sampled[j] + 1.2 * std::max(6.0 - static_cast<double>(i), 0.0));
        }
    }

    LOG_DEBUG(<< "mean mean error = " << maths::common::CBasicStatistics::mean(meanMeanError));
    BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanMeanError) < 0.25);
}

BOOST_AUTO_TEST_CASE(testCdf) {
    // Test error cases.
    //
    // Test some invariants:
    //   "cdf" + "cdf complement" = 1
    //   cdf x for x < 0 = 0
    //   cdf complement x for x < 0 = 1

    const double location = 2.0;
    const double squareScale = 0.8;
    const std::size_t n[] = {20u, 80u};

    test::CRandomNumbers rng;

    CLogNormalMeanPrecConjugate filter(makePrior());

    for (std::size_t i = 0; i < std::size(n); ++i) {
        TDoubleVec samples;
        rng.generateLogNormalSamples(location, squareScale, n[i], samples);
        filter.addSamples(samples);

        double lowerBound;
        double upperBound;
        BOOST_TEST_REQUIRE(!filter.minusLogJointCdf(TDouble1Vec(), lowerBound, upperBound));
        BOOST_TEST_REQUIRE(!filter.minusLogJointCdfComplement(TDouble1Vec(), lowerBound, upperBound));

        BOOST_TEST_REQUIRE(filter.minusLogJointCdf(TDouble1Vec(1, -1.0), lowerBound, upperBound));
        double f = (lowerBound + upperBound) / 2.0;
        BOOST_TEST_REQUIRE(filter.minusLogJointCdfComplement(TDouble1Vec(1, -1.0), lowerBound, upperBound));
        double fComplement = (lowerBound + upperBound) / 2.0;
        LOG_DEBUG(<< "log(F(x)) = " << -f << ", log(1 - F(x)) = " << fComplement);
        BOOST_REQUIRE_CLOSE_ABSOLUTE(std::log(std::numeric_limits<double>::min()), -f, 1e-10);
        BOOST_REQUIRE_EQUAL(1.0, std::exp(-fComplement));

        for (std::size_t j = 1; j < 500; ++j) {
            double x = static_cast<double>(j) / 2.0;

            BOOST_TEST_REQUIRE(filter.minusLogJointCdf(TDouble1Vec(1, x), lowerBound, upperBound));
            f = (lowerBound + upperBound) / 2.0;
            BOOST_TEST_REQUIRE(filter.minusLogJointCdfComplement(TDouble1Vec(1, x), lowerBound, upperBound));
            fComplement = (lowerBound + upperBound) / 2.0;

            LOG_TRACE(<< "log(F(x)) = " << (f == 0.0 ? f : -f)
                      << ", log(1 - F(x)) = " << (fComplement == 0.0 ?
                                                      fComplement : -fComplement));
            BOOST_REQUIRE_CLOSE_ABSOLUTE(1.0, std::exp(-f) + std::exp(-fComplement), 1e-10);
        }
    }
}

BOOST_AUTO_TEST_CASE(testProbabilityOfLessLikelySamples) {
    // We test that the probability of less likely samples calculation
    // agrees with the chance of seeing a sample with lower marginal
    // likelihood, up to the sampling error.
    //
    // We also check that the tail calculation attributes samples to
    // the appropriate tail of the distribution.

    const double means[] = {0.1, 1.5, 3.0};
    const double squareScales[] = {0.2, 0.4, 1.5};
    const double vs[] = {0.5, 1.0, 2.0};

    test::CRandomNumbers rng;

    TMeanAccumulator meanError;

    for (size_t i = 0; i < std::size(means); ++i) {
        for (size_t j = 0; j < std::size(squareScales); ++j) {
            LOG_DEBUG(<< "means = " << means[i]
                      << ", scale = " << std::sqrt(squareScales[j]));

            TDoubleVec samples;
            rng.generateLogNormalSamples(means[i], squareScales[j], 1000, samples);

            CLogNormalMeanPrecConjugate filter(makePrior());
            filter.addSamples(samples);

            double location = filter.normalMean();
            double scale = std::sqrt(1.0 / filter.normalPrecision());

            TDoubleVec likelihoods;
            for (std::size_t k = 0; k < samples.size(); ++k) {
                double likelihood;
                filter.jointLogMarginalLikelihood(TDouble1Vec(1, samples[k]), likelihood);
                likelihoods.push_back(likelihood);
            }
            std::sort(likelihoods.begin(), likelihoods.end());

            boost::math::lognormal_distribution<> lognormal(location, scale);
            for (std::size_t k = 1; k < 10; ++k) {
                double x = boost::math::quantile(lognormal, static_cast<double>(k) / 10.0);

                TDouble1Vec sample(1, x);
                double fx;
                filter.jointLogMarginalLikelihood(sample, fx);

                double px = static_cast<double>(std::lower_bound(likelihoods.begin(),
                                                                 likelihoods.end(), fx) -
                                                likelihoods.begin()) /
                            static_cast<double>(likelihoods.size());

                double lb, ub;
                filter.probabilityOfLessLikelySamples(maths_t::E_TwoSided, sample, lb, ub);

                double ssd = std::sqrt(px * (1.0 - px) / static_cast<double>(samples.size()));

                LOG_DEBUG(<< "expected P(x) = " << px << ", actual P(x) = " << (lb + ub) / 2.0
                          << " sample sd = " << ssd);

                BOOST_REQUIRE_CLOSE_ABSOLUTE(px, (lb + ub) / 2.0, 3.0 * ssd);

                meanError.add(std::fabs(px - (lb + ub) / 2.0));
            }

            for (std::size_t k = 0; k < std::size(vs); ++k) {
                double mode = filter.marginalLikelihoodMode(
                    maths_t::countVarianceScaleWeight(vs[k]));
                double ss[] = {0.9 * mode, 1.1 * mode};

                LOG_DEBUG(<< "vs = " << vs[k] << ", mode = " << mode);

                double lb, ub;
                maths_t::ETail tail;
                {
                    filter.probabilityOfLessLikelySamples(
                        maths_t::E_TwoSided, {ss[0]},
                        {maths_t::countVarianceScaleWeight(vs[k])}, lb, ub, tail);
                    BOOST_REQUIRE_EQUAL(maths_t::E_LeftTail, tail);
                    if (mode > 0.0) {
                        filter.probabilityOfLessLikelySamples(
                            maths_t::E_TwoSided, TDouble1Vec(ss, ss + 2),
                            maths_t::TDoubleWeightsAry1Vec(
                                2, maths_t::countVarianceScaleWeight(vs[k])),
                            lb, ub, tail);
                        BOOST_REQUIRE_EQUAL(maths_t::E_MixedOrNeitherTail, tail);
                        filter.probabilityOfLessLikelySamples(
                            maths_t::E_OneSidedBelow, TDouble1Vec(ss, ss + 2),
                            maths_t::TDoubleWeightsAry1Vec(
                                2, maths_t::countVarianceScaleWeight(vs[k])),
                            lb, ub, tail);
                        BOOST_REQUIRE_EQUAL(maths_t::E_LeftTail, tail);
                        filter.probabilityOfLessLikelySamples(
                            maths_t::E_OneSidedAbove, TDouble1Vec(ss, ss + 2),
                            maths_t::TDoubleWeightsAry1Vec(
                                2, maths_t::countVarianceScaleWeight(vs[k])),
                            lb, ub, tail);
                        BOOST_REQUIRE_EQUAL(maths_t::E_RightTail, tail);
                    }
                }
                if (mode > 0.0) {
                    filter.probabilityOfLessLikelySamples(
                        maths_t::E_TwoSided, {ss[1]},
                        {maths_t::countVarianceScaleWeight(vs[k])}, lb, ub, tail);
                    BOOST_REQUIRE_EQUAL(maths_t::E_RightTail, tail);
                    filter.probabilityOfLessLikelySamples(
                        maths_t::E_TwoSided, TDouble1Vec(ss, ss + 2),
                        maths_t::TDoubleWeightsAry1Vec(
                            2, maths_t::countVarianceScaleWeight(vs[k])),
                        lb, ub, tail);
                    BOOST_REQUIRE_EQUAL(maths_t::E_MixedOrNeitherTail, tail);
                    filter.probabilityOfLessLikelySamples(
                        maths_t::E_OneSidedBelow, TDouble1Vec(ss, ss + 2),
                        maths_t::TDoubleWeightsAry1Vec(
                            2, maths_t::countVarianceScaleWeight(vs[k])),
                        lb, ub, tail);
                    BOOST_REQUIRE_EQUAL(maths_t::E_LeftTail, tail);
                    filter.probabilityOfLessLikelySamples(
                        maths_t::E_OneSidedAbove, TDouble1Vec(ss, ss + 2),
                        maths_t::TDoubleWeightsAry1Vec(
                            2, maths_t::countVarianceScaleWeight(vs[k])),
                        lb, ub, tail);
                    BOOST_REQUIRE_EQUAL(maths_t::E_RightTail, tail);
                }
            }
        }
    }

    LOG_DEBUG(<< "mean error = " << maths::common::CBasicStatistics::mean(meanError));
    BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanError) < 0.01);
}

BOOST_AUTO_TEST_CASE(testAnomalyScore) {
    // This test pushes 500 samples through the filter and adds in
    // anomalous signals in the bins at 30, 120, 300 and 420 with
    // magnitude 4, 5, 10 and 15 standard deviations, respectively,
    // and checks the anomaly score has:
    //   1) high probability of detecting the anomalies, and
    //   2) a very low rate of false positives.

    using TUIntVec = std::vector<unsigned int>;

    const double decayRates[] = {0.0, 0.001, 0.01};

    const double means[] = {0.1, 1.5, 3.0};
    const double squareScales[] = {0.2, 0.4, 1.5};

    const double threshold = 0.02;

    const unsigned int anomalyTimes[] = {30u, 120u, 300u, 420u};
    const double anomalies[] = {4.0, 5.0, 10.0, 15.0, 0.0};

    test::CRandomNumbers rng;
    unsigned int test = 0;

    //std::ofstream file;
    //file.open("results.m");

    double totalFalsePositiveRate = 0.0;
    std::size_t totalPositives[] = {0u, 0u, 0u};

    for (size_t i = 0; i < std::size(means); ++i) {
        for (size_t j = 0; j < std::size(squareScales); ++j) {
            LOG_DEBUG(<< "mean = " << means[i]
                      << ", scale = " << std::sqrt(squareScales[j]));

            boost::math::lognormal_distribution<> logNormal(means[i],
                                                            std::sqrt(squareScales[j]));

            TDoubleVec samples;
            rng.generateLogNormalSamples(means[i], squareScales[j], 500, samples);

            for (size_t k = 0; k < std::size(decayRates); ++k) {
                CLogNormalMeanPrecConjugate filter(
                    makePrior(maths_t::E_ContinuousData, 0.0, decayRates[k]));

                ++test;

                //std::ostringstream x;
                //std::ostringstream scores;
                //x << "x" << test << " = [";
                //scores << "score" << test << " = [";

                TUIntVec candidateAnomalies;
                for (unsigned int time = 0; time < samples.size(); ++time) {
                    double anomaly =
                        anomalies[std::find(std::begin(anomalyTimes),
                                            std::end(anomalyTimes), time) -
                                  std::begin(anomalyTimes)] *
                        boost::math::standard_deviation(logNormal);
                    double sample = samples[time] + anomaly;

                    TDouble1Vec sampleVec(1, sample);
                    filter.addSamples(sampleVec);

                    double score;
                    filter.anomalyScore(maths_t::E_TwoSided, sampleVec, score);
                    if (score > threshold) {
                        candidateAnomalies.push_back(time);
                    }

                    filter.propagateForwardsByTime(1.0);

                    //x << time << " ";
                    //scores << score << " ";
                }

                //x << "];\n";
                //scores << "];\n";
                //file << x.str() << scores.str() << "plot(x" << test << ", score"
                //     << test << ");\n"
                //     << "input(\"Hit any key for next test\");\n\n";

                TUIntVec falsePositives;
                std::set_difference(candidateAnomalies.begin(), candidateAnomalies.end(),
                                    std::begin(anomalyTimes), std::end(anomalyTimes),
                                    std::back_inserter(falsePositives));

                double falsePositiveRate = static_cast<double>(falsePositives.size()) /
                                           static_cast<double>(samples.size());

                totalFalsePositiveRate += falsePositiveRate;

                TUIntVec positives;
                std::set_intersection(candidateAnomalies.begin(),
                                      candidateAnomalies.end(),
                                      std::begin(anomalyTimes), std::end(anomalyTimes),
                                      std::back_inserter(positives));

                LOG_DEBUG(<< "falsePositiveRate = " << falsePositiveRate
                          << ", positives = " << positives.size());

                // False alarm rate should be less than 1%.
                BOOST_TEST_REQUIRE(falsePositiveRate <= 0.01);

                // Should detect at least the two big anomalies.
                BOOST_TEST_REQUIRE(positives.size() >= 2u);

                totalPositives[k] += positives.size();
            }
        }
    }

    totalFalsePositiveRate /= static_cast<double>(test);

    LOG_DEBUG(<< "totalFalsePositiveRate = " << totalFalsePositiveRate);

    for (size_t i = 0; i < std::size(totalPositives); ++i) {
        LOG_DEBUG(<< "positives = " << totalPositives[i]);
        BOOST_TEST_REQUIRE(totalPositives[i] >= 20u);
    }

    // Total false alarm rate should be less than 0.4%.
    BOOST_TEST_REQUIRE(totalFalsePositiveRate < 0.004);
}

BOOST_AUTO_TEST_CASE(testOffset) {
    // The idea of this test is to check that the offset correctly cancels
    // out a translation applied to a log-normally distributed data set.

    const maths_t::EDataType dataTypes[] = {maths_t::E_IntegerData, maths_t::E_ContinuousData};
    const double offsets[] = {-0.5, 0.5};
    const double decayRates[] = {0.0, 0.001, 0.01};

    const double location = 3.0;
    const double squareScale = 1.0;

    const double eps = 1e-8;

    test::CRandomNumbers rng;

    TDoubleVec samples;
    rng.generateLogNormalSamples(location, squareScale, 100, samples);

    for (size_t i = 0; i < std::size(dataTypes); ++i) {
        for (size_t j = 0; j < std::size(offsets); ++j) {
            for (size_t k = 0; k < std::size(decayRates); ++k) {
                CLogNormalMeanPrecConjugate filter1(
                    makePrior(dataTypes[i], offsets[j], decayRates[k]));
                CLogNormalMeanPrecConjugate filter2(
                    makePrior(dataTypes[i], 0.0, decayRates[k]));

                for (std::size_t l = 0; l < samples.size(); ++l) {
                    double offsetSample = samples[l] - offsets[j];
                    TDouble1Vec offsetSampleVec(1, offsetSample);
                    filter1.addSamples(offsetSampleVec);
                    filter1.propagateForwardsByTime(1.0);

                    double x = samples[l];
                    TDouble1Vec sample(1, x);
                    filter2.addSamples(sample);
                    filter2.propagateForwardsByTime(1.0);

                    double likelihood1;
                    filter1.jointLogMarginalLikelihood(offsetSampleVec, likelihood1);
                    double lowerBound1, upperBound1;
                    filter1.probabilityOfLessLikelySamples(
                        maths_t::E_TwoSided, offsetSampleVec, lowerBound1, upperBound1);
                    BOOST_REQUIRE_EQUAL(lowerBound1, upperBound1);
                    double probability1 = (lowerBound1 + upperBound1) / 2.0;

                    double likelihood2;
                    filter2.jointLogMarginalLikelihood(sample, likelihood2);
                    double lowerBound2, upperBound2;
                    filter2.probabilityOfLessLikelySamples(
                        maths_t::E_TwoSided, sample, lowerBound2, upperBound2);
                    BOOST_REQUIRE_EQUAL(lowerBound2, upperBound2);
                    double probability2 = (lowerBound2 + upperBound2) / 2.0;

                    BOOST_REQUIRE_CLOSE_ABSOLUTE(likelihood1, likelihood2, eps);
                    BOOST_REQUIRE_CLOSE_ABSOLUTE(probability1, probability2, eps);
                }

                using TEqual = maths::common::CEqualWithTolerance<double>;
                TEqual equal(maths::common::CToleranceTypes::E_AbsoluteTolerance, eps);
                BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
            }
        }
    }
}

BOOST_AUTO_TEST_CASE(testIntegerData) {
    // If the data are discrete then we approximate the discrete distribution
    // by saying it is uniform on the intervals [n, n+1] for each integral n.
    // The idea of this test is to check that the inferred model agrees in the
    // limit (large n) with the model inferred from such data.
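    // Concretely, an observed integer n is treated as if it had been drawn
    // uniformly from [n, n+1), so filter1 (integer data, fed floor(x)) and
    // filter2 (continuous data, fed floor(x) + U(0,1)) should converge to
    // essentially the same posterior; for example an observed 3 is modelled
    // as a value somewhere in [3, 4).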
    const double locations[] = {0.2, 1.0, 1.5};
    const double squareScales[] = {0.5, 2.0};

    const std::size_t nSamples = 100000;

    for (std::size_t i = 0; i < std::size(locations); ++i) {
        for (std::size_t j = 0; j < std::size(squareScales); ++j) {
            test::CRandomNumbers rng;

            TDoubleVec samples;
            rng.generateLogNormalSamples(locations[i], squareScales[j], nSamples, samples);

            TDoubleVec uniform;
            rng.generateUniformSamples(0.0, 1.0, nSamples, uniform);

            CLogNormalMeanPrecConjugate filter1(makePrior(maths_t::E_IntegerData, 0.1));
            CLogNormalMeanPrecConjugate filter2(makePrior(maths_t::E_ContinuousData, 0.1));

            for (std::size_t k = 0; k < nSamples; ++k) {
                double x = std::floor(samples[k]);

                TDouble1Vec sample(1, x);
                filter1.addSamples(sample);

                sample[0] += uniform[k];
                filter2.addSamples(sample);
            }

            using TEqual = maths::common::CEqualWithTolerance<double>;
            TEqual equal(maths::common::CToleranceTypes::E_RelativeTolerance, 0.01);
            BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));

            TMeanAccumulator meanLogLikelihood1;
            TMeanAccumulator meanLogLikelihood2;
            for (std::size_t k = 0; k < nSamples; ++k) {
                double x = std::floor(samples[k]);

                TDouble1Vec sample(1, x);
                double logLikelihood1;
                filter1.jointLogMarginalLikelihood(sample, logLikelihood1);
                meanLogLikelihood1.add(-logLikelihood1);

                sample[0] += uniform[k];
                double logLikelihood2;
                filter2.jointLogMarginalLikelihood(sample, logLikelihood2);
                meanLogLikelihood2.add(-logLikelihood2);
            }

            LOG_DEBUG(<< "meanLogLikelihood1 = "
                      << maths::common::CBasicStatistics::mean(meanLogLikelihood1)
                      << ", meanLogLikelihood2 = "
                      << maths::common::CBasicStatistics::mean(meanLogLikelihood2));

            BOOST_REQUIRE_CLOSE_ABSOLUTE(
                maths::common::CBasicStatistics::mean(meanLogLikelihood1),
                maths::common::CBasicStatistics::mean(meanLogLikelihood2), 0.05);
        }
    }

    TMeanAccumulator meanError;

    for (size_t i = 0; i < std::size(locations); ++i) {
        for (std::size_t j = 0; j < std::size(squareScales); ++j) {
            test::CRandomNumbers rng;

            TDoubleVec seedSamples;
            rng.generateLogNormalSamples(locations[i], squareScales[j], 200, seedSamples);
            CLogNormalMeanPrecConjugate filter1(makePrior(maths_t::E_IntegerData, 0.1));
            filter1.addSamples(seedSamples);

            CLogNormalMeanPrecConjugate filter2 = filter1;
            filter2.dataType(maths_t::E_ContinuousData);

            TDoubleVec samples;
            rng.generateLogNormalSamples(locations[i], squareScales[j], nSamples, samples);

            TDoubleVec uniform;
            rng.generateUniformSamples(0.0, 1.0, nSamples, uniform);

            TMeanAccumulator meanProbability1;
            TMeanAccumulator meanProbability2;

            for (std::size_t k = 0; k < nSamples; ++k) {
                double x = std::floor(samples[k]);

                TDouble1Vec sample(1, x);
                double l1, u1;
                BOOST_TEST_REQUIRE(filter1.probabilityOfLessLikelySamples(
                    maths_t::E_TwoSided, sample, l1, u1));
                BOOST_REQUIRE_EQUAL(l1, u1);
                double p1 = (l1 + u1) / 2.0;
                meanProbability1.add(p1);

                sample[0] += uniform[k];
                double l2, u2;
                BOOST_TEST_REQUIRE(filter2.probabilityOfLessLikelySamples(
                    maths_t::E_TwoSided, sample, l2, u2));
                BOOST_REQUIRE_EQUAL(l2, u2);
                double p2 = (l2 + u2) / 2.0;
                meanProbability2.add(p2);
            }

            double p1 = maths::common::CBasicStatistics::mean(meanProbability1);
            double p2 = maths::common::CBasicStatistics::mean(meanProbability2);
            LOG_DEBUG(<< "location = " << locations[i] << ", p1 = " << p1 << ", p2 = " << p2);

            BOOST_REQUIRE_CLOSE_ABSOLUTE(p1, p2, 0.05 * p1);
            meanError.add(std::fabs(p1 - p2));
        }
    }

    LOG_DEBUG(<< "mean error = " << maths::common::CBasicStatistics::mean(meanError));
    BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanError) < 0.005);
}

BOOST_AUTO_TEST_CASE(testLowVariationData) {
    {
        CLogNormalMeanPrecConjugate filter(makePrior(maths_t::E_IntegerData));
        for (std::size_t i = 0; i < 100; ++i) {
            filter.addSamples(TDouble1Vec(1, 430.0));
        }

        TDoubleDoublePr interval = filter.marginalLikelihoodConfidenceInterval(68.0);
        double sigma = (interval.second - interval.first) / 2.0;
        LOG_DEBUG(<< "68% confidence interval " << interval
                  << ", approximate variance = " << sigma * sigma);

        BOOST_REQUIRE_CLOSE_ABSOLUTE(12.0, 1.0 / (sigma * sigma), 0.15);
    }
    {
        CLogNormalMeanPrecConjugate filter(makePrior(maths_t::E_ContinuousData));
        for (std::size_t i = 0; i < 100; ++i) {
            filter.addSamples(TDouble1Vec(1, 430.0));
        }

        TDoubleDoublePr interval = filter.marginalLikelihoodConfidenceInterval(68.0);
        double sigma = (interval.second - interval.first) / 2.0;
        LOG_DEBUG(<< "68% confidence interval " << interval
                  << ", approximate s.t.d. = " << sigma);

        BOOST_REQUIRE_CLOSE_ABSOLUTE(1e-4, sigma / 430.5, 5e-5);
    }
}

BOOST_AUTO_TEST_CASE(testPersist) {
    const double location = std::log(10.0);
    const double squareScale = 3.0;

    test::CRandomNumbers rng;

    TDoubleVec samples;
    rng.generateLogNormalSamples(location, squareScale, 100, samples);

    maths::common::CLogNormalMeanPrecConjugate origFilter(makePrior());
    for (std::size_t i = 0; i < samples.size(); ++i) {
        origFilter.addSamples({samples[i]}, maths_t::CUnitWeights::SINGLE_UNIT);
    }
    double decayRate = origFilter.decayRate();
    std::uint64_t checksum = origFilter.checksum();

    std::ostringstream origJson;
    core::CJsonStatePersistInserter::persist(
        origJson, std::bind_front(&maths::common::CLogNormalMeanPrecConjugate::acceptPersistInserter,
                                  &origFilter));

    LOG_DEBUG(<< "Log normal mean conjugate JSON representation:\n" << origJson.str());

    // Restore the JSON into a new filter
    std::istringstream origJsonStrm{"{\"topLevel\" : " + origJson.str() + "}"};
    core::CJsonStateRestoreTraverser traverser(origJsonStrm);

    maths::common::SDistributionRestoreParams params(
        maths_t::E_ContinuousData, decayRate + 0.1,
        maths::common::MINIMUM_CLUSTER_SPLIT_FRACTION,
        maths::common::MINIMUM_CLUSTER_SPLIT_COUNT, maths::common::MINIMUM_CATEGORY_COUNT);
    maths::common::CLogNormalMeanPrecConjugate restoredFilter(params, traverser);

    LOG_DEBUG(<< "orig checksum = " << checksum
              << " restored checksum = " << restoredFilter.checksum());
    BOOST_REQUIRE_EQUAL(checksum, restoredFilter.checksum());

    // The JSON representation of the new filter should be the same as the original
    std::ostringstream newJson;
    core::CJsonStatePersistInserter::persist(
        newJson, std::bind_front(&maths::common::CLogNormalMeanPrecConjugate::acceptPersistInserter,
                                 &restoredFilter));
    BOOST_REQUIRE_EQUAL(origJson.str(), newJson.str());
}

BOOST_AUTO_TEST_CASE(testVarianceScale) {
    // The strategy for this test is to check we correctly account
    // for variance scaling by scaling the variance of a collection
    // of samples and then checking that the percentiles for those
    // samples' "probability of less likely sample" are correct. In
    // particular, we expect that the calculation correctly predicts
    // the fraction of samples which have lower probability for all
    // percentiles (if the variance scaling was off we'd either get
    // too few or too many).
    //
    // We use the same idea for testing the variance scaling for
    // the marginal likelihood. The sample mean of the scaled log
    // likelihood tends to the expected log likelihood for a scaled
    // log-normal distribution (uniform law of large numbers), which
    // is just the differential entropy of a scaled log-normal R.V.
    //
    // Finally, we test that updating with scaled samples produces the
    // correct posterior.
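    // The percentile check relies on the fact that, under a correctly
    // specified model, the probability assigned to a randomly drawn sample is
    // approximately uniform on [0, 1]: after sorting the probabilities of the
    // nScaledSamples samples, the value at the q-th percentile index should
    // therefore be close to q / 100 (roughly 30% of samples should receive a
    // probability below 0.3, and so on).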
    TWeightFunc weightsFuncs[]{static_cast<TWeightFunc>(maths_t::seasonalVarianceScaleWeight),
                               static_cast<TWeightFunc>(maths_t::countVarianceScaleWeight)};

    for (std::size_t s = 0; s < std::size(weightsFuncs); ++s) {
        const double location = 2.0;
        const double squareScale = 1.5;
        {
            boost::math::lognormal_distribution<> logNormal(location, std::sqrt(squareScale));
            LOG_DEBUG(<< "mean = " << boost::math::mean(logNormal)
                      << ", variance = " << boost::math::variance(logNormal));
        }

        const double varianceScales[] = {0.20, 0.50, 0.75, 1.50, 2.00, 5.00};

        LOG_DEBUG(<< "");
        LOG_DEBUG(<< "****** probabilityOfLessLikelySamples ******");

        const double percentiles[] = {10.0, 20.0, 30.0, 40.0, 50.0,
                                      60.0, 70.0, 80.0, 90.0};

        const std::size_t nSamples[] = {10u, 20u, 40u, 80u, 1000u};
        const std::size_t nScaledSamples = 50000;

        double percentileErrorTolerance = 0.08;
        double meanPercentileErrorTolerance = 0.055;
        double totalMeanPercentileErrorTolerance = 0.005;

        double totalUnscaledMeanPercentileError = 0.0;
        double totalMeanPercentileError = 0.0;
        double trials = 0.0;

        for (size_t i = 0; i < std::size(nSamples); ++i) {
            LOG_DEBUG(<< "**** nSamples = " << nSamples[i] << " ****");

            test::CRandomNumbers rng;

            TDoubleVec samples;
            rng.generateLogNormalSamples(location, squareScale, nSamples[i], samples);

            CLogNormalMeanPrecConjugate filter(makePrior());
            filter.addSamples(samples);
            filter.checksum();

            double unscaledMeanPercentileError = 0.0;
            TDoubleVec unscaledPercentileErrors;
            {
                TDoubleVec unscaledSamples;
                rng.generateLogNormalSamples(location, squareScale, nScaledSamples, unscaledSamples);

                TDoubleVec probabilities;
                probabilities.reserve(nScaledSamples);
                for (std::size_t j = 0; j < unscaledSamples.size(); ++j) {
                    TDouble1Vec sample(1, unscaledSamples[j]);

                    double lowerBound, upperBound;
                    BOOST_TEST_REQUIRE(filter.probabilityOfLessLikelySamples(
                        maths_t::E_TwoSided, sample, lowerBound, upperBound));
                    BOOST_REQUIRE_EQUAL(lowerBound, upperBound);

                    double probability = (lowerBound + upperBound) / 2.0;
                    probabilities.push_back(probability);
                }
                std::sort(probabilities.begin(), probabilities.end());

                for (size_t j = 0; j < std::size(percentiles); ++j) {
                    std::size_t index = static_cast<std::size_t>(
                        static_cast<double>(nScaledSamples) * percentiles[j] / 100.0);
                    double error = std::fabs(probabilities[index] - percentiles[j] / 100.0);
                    unscaledPercentileErrors.push_back(error);
                    unscaledMeanPercentileError += error;
                }
                unscaledMeanPercentileError /= static_cast<double>(std::size(percentiles));
            }

            for (size_t j = 0; j < std::size(varianceScales); ++j) {
                LOG_TRACE(<< "**** variance scale = " << varianceScales[j] << " ****");

                double ss = std::log(1.0 + varianceScales[j] * (std::exp(squareScale) - 1.0));
                double shiftedLocation = location + (squareScale - ss) / 2.0;
                {
                    boost::math::lognormal_distribution<> logNormal(shiftedLocation, std::sqrt(ss));
                    LOG_TRACE(<< "mean = " << boost::math::mean(logNormal)
                              << ", variance = " << boost::math::variance(logNormal));
                }

                TDoubleVec scaledSamples;
                rng.generateLogNormalSamples(shiftedLocation, ss, nScaledSamples, scaledSamples);

                TDoubleVec probabilities;
                probabilities.reserve(nScaledSamples);
                for (std::size_t k = 0; k < scaledSamples.size(); ++k) {
                    double lowerBound, upperBound;
                    maths_t::ETail tail;
                    BOOST_TEST_REQUIRE(filter.probabilityOfLessLikelySamples(
                        maths_t::E_TwoSided, {scaledSamples[k]},
                        {weightsFuncs[s](varianceScales[j])}, lowerBound, upperBound, tail));
                    BOOST_REQUIRE_EQUAL(lowerBound, upperBound);

                    double probability = (lowerBound + upperBound) / 2.0;
                    probabilities.push_back(probability);
                }
                std::sort(probabilities.begin(), probabilities.end());

                double meanPercentileError = 0.0;
                for (size_t k = 0; k < std::size(percentiles); ++k) {
                    std::size_t index = static_cast<std::size_t>(
                        static_cast<double>(nScaledSamples) * percentiles[k] / 100.0);
                    double error = std::fabs(probabilities[index] - percentiles[k] / 100.0);
                    meanPercentileError += error;
                    double threshold = percentileErrorTolerance + unscaledPercentileErrors[k];
                    LOG_TRACE(<< "percentile = " << percentiles[k]
                              << ", probability = " << probabilities[index]
                              << ", error = " << error << ", error threshold = " << threshold);
                    BOOST_TEST_REQUIRE(error < threshold);
                }
                meanPercentileError /= static_cast<double>(std::size(percentiles));
                double threshold = meanPercentileErrorTolerance + unscaledMeanPercentileError;
                LOG_TRACE(<< "mean error = " << meanPercentileError
                          << ", mean error threshold = " << threshold);
                BOOST_TEST_REQUIRE(meanPercentileError < threshold);

                totalMeanPercentileError += meanPercentileError;
                totalUnscaledMeanPercentileError += unscaledMeanPercentileError;
                trials += 1.0;
            }
        }

        totalMeanPercentileError /= trials;
        totalUnscaledMeanPercentileError /= trials;

        {
            double threshold = totalMeanPercentileErrorTolerance + totalUnscaledMeanPercentileError;
            LOG_DEBUG(<< "total unscaled mean error = " << totalUnscaledMeanPercentileError);
            LOG_DEBUG(<< "total mean error = " << totalMeanPercentileError
                      << ", total mean error threshold = " << threshold);
            BOOST_TEST_REQUIRE(totalMeanPercentileError < threshold);
        }

        LOG_DEBUG(<< "****** jointLogMarginalLikelihood ******");

        test::CRandomNumbers rng;

        for (size_t i = 0; i < std::size(varianceScales); ++i) {
            LOG_DEBUG(<< "**** variance scale = " << varianceScales[i] << " ****");

            double ss = std::log(1.0 + varianceScales[i] * (std::exp(squareScale) - 1.0));
            double shiftedLocation = location + (squareScale - ss) / 2.0;
            boost::math::lognormal_distribution<> logNormal(shiftedLocation, std::sqrt(ss));
            LOG_DEBUG(<< "mean = " << boost::math::mean(logNormal)
                      << ", variance = " << boost::math::variance(logNormal));
            double expectedDifferentialEntropy =
                maths::common::CTools::differentialEntropy(logNormal);

            CLogNormalMeanPrecConjugate filter(makePrior());

            double differentialEntropy = 0.0;

            TDoubleVec seedSamples;
            rng.generateLogNormalSamples(location, squareScale, 100, seedSamples);
            filter.addSamples(seedSamples);

            TDoubleVec scaledSamples;
            rng.generateLogNormalSamples(shiftedLocation, ss, 100000, scaledSamples);

            for (std::size_t j = 0; j < scaledSamples.size(); ++j) {
                double logLikelihood = 0.0;
                BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
                                    filter.jointLogMarginalLikelihood(
                                        {scaledSamples[j]},
                                        {weightsFuncs[s](varianceScales[i])}, logLikelihood));
                differentialEntropy -= logLikelihood;
            }

            differentialEntropy /= static_cast<double>(scaledSamples.size());

            LOG_DEBUG(<< "differentialEntropy = " << differentialEntropy
                      << ", expectedDifferentialEntropy = " << expectedDifferentialEntropy);

            BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedDifferentialEntropy, differentialEntropy, 0.5);
        }
    }

    const maths_t::EDataType dataTypes[] = {maths_t::E_IntegerData, maths_t::E_ContinuousData};

    const double maximumMeanError[] = {0.5, 0.5};
    const double maximumVarianceError[] = {1.4, 1.0};
    const double maximumMeanMeanError[] = {0.02, 0.01};
    const double maximumMeanVarianceError[] = {0.18, 0.1};

    for (std::size_t s = 0; s < std::size(weightsFuncs); ++s) {
        for (std::size_t t = 0; t < std::size(dataTypes); ++t) {
            const double means[] = {0.1, 1.0, 10.0, 100.0, 1000.0, 100000.0, 1000000.0};
            const double variances[] = {0.1, 1.0, 10.0, 100.0, 1000.0,
                                        100000.0, 1000000.0};
            const double varianceScales[] = {0.1, 0.5, 1.0, 2.0, 10.0, 100.0};

            TDoubleVec samples;
            maths_t::TDoubleWeightsAry1Vec weights;

            test::CRandomNumbers rng;

            TMeanAccumulator meanMeanError;
            TMeanAccumulator meanVarianceError;

            for (std::size_t i = 0; i < std::size(means); ++i) {
                for (std::size_t j = 0; j < std::size(variances); ++j) {
                    double mean = means[i];
                    double variance = variances[j];

                    // We don't include very skewed distributions because they
                    // are hard to estimate accurately even without scaling due
                    // to relatively frequent large outliers.
                    if (mean <= 0.1 * variance) {
                        continue;
                    }

                    // We purposely don't estimate true variance in this case.
                    if (std::sqrt(variance) < mean * maths::common::MINIMUM_COEFFICIENT_OF_VARIATION) {
                        continue;
                    }

                    double squareScale = std::log(1.0 + variance / (mean * mean));
                    double location = std::log(mean) - squareScale / 2.0;
                    {
                        boost::math::lognormal_distribution<> logNormal(location,
                                                                        std::sqrt(squareScale));
                        LOG_TRACE(<< "****** mean = " << boost::math::mean(logNormal)
                                  << ", variance = " << boost::math::variance(logNormal)
                                  << " ******");
                        LOG_TRACE(<< "location = " << location);
                    }

                    for (std::size_t k = 0; k < std::size(varianceScales); ++k) {
                        double scale = varianceScales[k];
                        if (scale * variance >= 100.0 * mean) {
                            continue;
                        }
                        LOG_TRACE(<< "*** scale = " << scale << " ***");

                        double scaledSquareScale = std::log(1.0 + variance * scale / (mean * mean));
                        double scaledLocation = std::log(mean) - scaledSquareScale / 2.0;
                        {
                            boost::math::lognormal_distribution<> logNormal(
                                scaledLocation, std::sqrt(scaledSquareScale));
                            LOG_TRACE(<< "scaled mean = " << boost::math::mean(logNormal)
                                      << ", scaled variance = " << boost::math::variance(logNormal));
                            LOG_TRACE(<< "scaled location = " << scaledLocation);
                        }

                        TMeanAccumulator meanError;
                        TMeanAccumulator varianceError;
                        for (unsigned int test = 0; test < 5; ++test) {
                            CLogNormalMeanPrecConjugate filter(makePrior(dataTypes[t]));

                            rng.generateLogNormalSamples(location, squareScale, 200, samples);
                            weights.clear();
                            weights.resize(samples.size(), maths_t::CUnitWeights::UNIT);
                            filter.addSamples(samples, weights);

                            rng.generateLogNormalSamples(scaledLocation, scaledSquareScale, 200, samples);
                            weights.clear();
                            weights.resize(samples.size(), weightsFuncs[s](scale));
                            filter.addSamples(samples, weights);

                            boost::math::lognormal_distribution<> logNormal(
                                filter.normalMean(), std::sqrt(1.0 / filter.normalPrecision()));

                            double dm = (dataTypes[t] == maths_t::E_IntegerData ? 0.5 : 0.0);
                            double dv = (dataTypes[t] == maths_t::E_IntegerData ?
                                             1.0 / 12.0 : 0.0);
                            double trialMeanError = std::fabs(boost::math::mean(logNormal) - (mean + dm)) /
                                                    std::max(1.0, mean);
                            double trialVarianceError =
                                std::fabs(boost::math::variance(logNormal) - (variance + dv)) /
                                std::max(1.0, variance);

                            LOG_TRACE(<< "trial mean error = " << trialMeanError);
                            LOG_TRACE(<< "trial variance error = " << trialVarianceError);

                            meanError.add(trialMeanError);
                            varianceError.add(trialVarianceError);
                        }

                        LOG_TRACE(<< "mean error = "
                                  << maths::common::CBasicStatistics::mean(meanError));
                        LOG_TRACE(<< "variance error = "
                                  << maths::common::CBasicStatistics::mean(varianceError));

                        BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanError) <
                                           maximumMeanError[t]);
                        BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(varianceError) <
                                           maximumVarianceError[t]);

                        meanMeanError += meanError;
                        meanVarianceError += varianceError;
                    }
                }
            }

            LOG_DEBUG(<< "mean mean error = "
                      << maths::common::CBasicStatistics::mean(meanMeanError));
            LOG_DEBUG(<< "mean variance error = "
                      << maths::common::CBasicStatistics::mean(meanVarianceError));

            BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanMeanError) <
                               maximumMeanMeanError[t]);
            BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanVarianceError) <
                               maximumMeanVarianceError[t]);
        }
    }
}

BOOST_AUTO_TEST_CASE(testNegativeSample) {
    // Test that we recover roughly the same distribution after adjusting
    // the offset. The idea of this test is to run two priors side by side,
    // one with a large enough offset that it never needs to adjust it and
    // one which will need to adjust, and check that we end up with broadly
    // similar distributions.

    const double location = std::log(2.0);
    const double squareScale = 1.0;

    test::CRandomNumbers rng;

    TDoubleVec samples;
    rng.generateLogNormalSamples(location, squareScale, 100, samples);

    CLogNormalMeanPrecConjugate filter1 = CLogNormalMeanPrecConjugate::nonInformativePrior(
        maths_t::E_ContinuousData, 0.0, 0.0, 0.2);
    CLogNormalMeanPrecConjugate filter2 = CLogNormalMeanPrecConjugate::nonInformativePrior(
        maths_t::E_ContinuousData, 1.74524, 0.0, 0.2);

    filter1.addSamples(samples);
    filter2.addSamples(samples);

    TDouble1Vec negative(1, -0.29);
    filter1.addSamples(negative);
    filter2.addSamples(negative);

    BOOST_REQUIRE_EQUAL(filter1.numberSamples(), filter2.numberSamples());

    using TEqual = maths::common::CEqualWithTolerance<double>;
    TEqual equal(maths::common::CToleranceTypes::E_RelativeTolerance, 0.1);
    BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
}

BOOST_AUTO_TEST_CASE(testBadValues) {
    test::CRandomNumbers rng;

    TDoubleVec samples;
    rng.generateLogNormalSamples(std::log(2.0), 1.0, 100, samples);

    CLogNormalMeanPrecConjugate filter =
        CLogNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData);
    filter.addSamples(samples);

    std::uint64_t checksum{filter.checksum()};

    TDouble1Vec badSample{std::numeric_limits<double>::quiet_NaN()};
    auto badWeights = maths_t::CUnitWeights::SINGLE_UNIT;
    maths_t::setSeasonalVarianceScale(std::numeric_limits<double>::quiet_NaN(), badWeights[0]);

    filter.addSamples(badSample, maths_t::CUnitWeights::SINGLE_UNIT);
    filter.addSamples({1.0}, badWeights);

    // We've just ignored the bad input.
    BOOST_REQUIRE_EQUAL(checksum, filter.checksum());

    double lowerBound;
    double upperBound;
    maths_t::ETail tail;
    BOOST_TEST_REQUIRE(!filter.probabilityOfLessLikelySamples(
        maths_t::E_TwoSided, badSample, maths_t::CUnitWeights::SINGLE_UNIT,
        lowerBound, upperBound, tail));
    BOOST_REQUIRE_EQUAL(0.0, lowerBound);
    BOOST_REQUIRE_EQUAL(1.0, upperBound);
    BOOST_REQUIRE_EQUAL(maths_t::E_UndeterminedTail, tail);

    BOOST_TEST_REQUIRE(!filter.probabilityOfLessLikelySamples(
        maths_t::E_TwoSided, {1.0}, badWeights, lowerBound, upperBound, tail));
    BOOST_REQUIRE_EQUAL(0.0, lowerBound);
    BOOST_REQUIRE_EQUAL(1.0, upperBound);
    BOOST_REQUIRE_EQUAL(maths_t::E_UndeterminedTail, tail);
}

BOOST_AUTO_TEST_SUITE_END()