// lib/maths/common/unittest/CGammaRateConjugateTest.cc
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <core/CJsonStatePersistInserter.h>
#include <core/CJsonStateRestoreTraverser.h>
#include <core/CLogger.h>
#include <maths/common/CBasicStatistics.h>
#include <maths/common/CGammaRateConjugate.h>
#include <maths/common/CRestoreParams.h>
#include <maths/common/CTools.h>
#include <test/BoostTestCloseAbsolute.h>
#include <test/CRandomNumbers.h>
#include "TestUtils.h"
#include <boost/math/distributions/gamma.hpp>
#include <boost/test/unit_test.hpp>
#include <algorithm>
#include <cmath>
#include <fstream>
#include <functional>
#include <iterator>
#include <vector>
BOOST_AUTO_TEST_SUITE(CGammaRateConjugateTest)
using namespace ml;
using namespace handy_typedefs;
namespace {
using TDoubleVec = std::vector<double>;
using TDoubleDoublePr = std::pair<double, double>;
using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanVarAccumulator = maths::common::CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using CGammaRateConjugate = CPriorTestInterfaceMixin<maths::common::CGammaRateConjugate>;
using TWeightFunc = maths_t::TDoubleWeightsAry (*)(double);
CGammaRateConjugate makePrior(maths_t::EDataType dataType = maths_t::E_ContinuousData,
const double& offset = 0.0,
const double& decayRate = 0.0) {
return CGammaRateConjugate::nonInformativePrior(dataType, offset, decayRate, 0.0);
}
}
BOOST_AUTO_TEST_CASE(testMultipleUpdate) {
// Test that we get the same result updating once with a vector of 100
// samples of an R.V. versus updating individually 100 times.
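// For a conjugate prior the posterior depends on the data only through its
// sufficient statistics, so batching and ordering of the updates should not
// matter (up to the numerical tolerance used below).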
const maths_t::EDataType dataTypes[]{maths_t::E_IntegerData, maths_t::E_ContinuousData};
const double shape = 2.0;
const double scale = 3.0;
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 100, samples);
for (size_t i = 0; i < std::size(dataTypes); ++i) {
CGammaRateConjugate filter1(makePrior(dataTypes[i]));
CGammaRateConjugate filter2(filter1);
for (std::size_t j = 0; j < samples.size(); ++j) {
filter1.addSamples(TDouble1Vec{samples[j]});
}
filter2.addSamples(samples);
using TEqual = maths::common::CEqualWithTolerance<double>;
TEqual equal(maths::common::CToleranceTypes::E_RelativeTolerance, 1e-3);
BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
}
// Test with variance scale.
TDoubleVec scaledSamples;
rng.generateGammaSamples(shape / 2.0, 2.0 * scale, 100, scaledSamples);
for (size_t i = 0; i < std::size(dataTypes); ++i) {
CGammaRateConjugate filter1(makePrior(dataTypes[i]));
filter1.addSamples(samples);
CGammaRateConjugate filter2(filter1);
for (std::size_t j = 0; j < scaledSamples.size(); ++j) {
filter1.addSamples({scaledSamples[j]},
{ml::maths_t::countVarianceScaleWeight(2.0)});
}
filter2.addSamples(scaledSamples, maths_t::TDoubleWeightsAry1Vec(
scaledSamples.size(),
maths_t::countVarianceScaleWeight(2.0)));
using TEqual = maths::common::CEqualWithTolerance<double>;
TEqual equal(maths::common::CToleranceTypes::E_RelativeTolerance, 0.03);
BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
}
// Test the count weight is equivalent to adding repeated samples.
for (size_t i = 0; i < std::size(dataTypes); ++i) {
CGammaRateConjugate filter1(makePrior(dataTypes[i]));
CGammaRateConjugate filter2(filter1);
double x = 3.0;
std::size_t count = 10;
for (std::size_t j = 0; j < count; ++j) {
filter1.addSamples(TDouble1Vec{x});
}
filter2.addSamples({x}, {maths_t::countWeight(static_cast<double>(count))});
using TEqual = maths::common::CEqualWithTolerance<double>;
TEqual equal(maths::common::CToleranceTypes::E_RelativeTolerance, 0.01);
BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
}
}
BOOST_AUTO_TEST_CASE(testPropagation) {
// The quantities are preserved up to the solving tolerance given that
// the updated count is still relatively large so the digamma function
// is very nearly equal to the log function.
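// (Recall the asymptotic expansion digamma(x) ~ log(x) - 1/(2x) for large x,
// so the difference between the two vanishes as the count grows.)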
const double eps = 1e-3;
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(1.0, 3.0, 500, samples);
CGammaRateConjugate filter(makePrior(maths_t::E_ContinuousData, 0.1));
for (std::size_t i = 0; i < samples.size(); ++i) {
filter.addSamples(TDouble1Vec(1, samples[i]));
}
double shape = filter.likelihoodShape();
double rate = filter.likelihoodRate();
filter.propagateForwardsByTime(5.0);
double propagatedShape = filter.likelihoodShape();
double propagatedRate = filter.likelihoodRate();
LOG_DEBUG(<< "shape = " << shape << ", rate = " << rate << ", propagatedShape = "
<< propagatedShape << ", propagatedRate = " << propagatedRate);
BOOST_REQUIRE_CLOSE_ABSOLUTE(shape, propagatedShape, eps);
BOOST_REQUIRE_CLOSE_ABSOLUTE(rate, propagatedRate, eps);
}
BOOST_AUTO_TEST_CASE(testShapeEstimation) {
// The idea here is to check that the likelihood shape estimate converges
// to the correct value for a range of distribution parameters. We do not
// use any explicit bounds on the convergence rates so simply check that
// we do get closer as the number of samples increases.
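// Each of the 50 filters below sees an independent stream of samples, so
// averaging their shape estimates should reduce the variance of the aggregate
// estimate by roughly a factor of 50.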
const double decayRates[] = {0.0, 0.001, 0.01};
for (size_t i = 0; i < std::size(decayRates); ++i) {
test::CRandomNumbers rng;
double tests = 0.0;
double errorIncreased = 0.0;
for (unsigned int test = 0; test < 100; ++test) {
double shape = 0.5 * (test + 1.0);
double scale = 2.0;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 5050, samples);
using TGammaRateConjugateVec = std::vector<CGammaRateConjugate>;
unsigned int nAggregate = 50;
TGammaRateConjugateVec filters(
nAggregate, makePrior(maths_t::E_ContinuousData, 0.0, decayRates[i]));
double previousError = std::numeric_limits<double>::max();
double averageShape = 0.0;
for (std::size_t j = 0; j < samples.size() / nAggregate; ++j) {
double error = 0.0;
averageShape = 0.0;
for (std::size_t k = 0; k < nAggregate; ++k) {
filters[k].addSamples(TDouble1Vec(1, samples[nAggregate * j + k]));
filters[k].propagateForwardsByTime(1.0);
error += std::fabs(shape - filters[k].likelihoodShape());
averageShape += filters[k].likelihoodShape();
}
error /= static_cast<double>(nAggregate);
averageShape /= static_cast<double>(nAggregate);
if (j > 0u && j % 20u == 0u) {
if (error > previousError) {
errorIncreased += 1.0;
}
tests += 1.0;
previousError = error;
}
}
LOG_TRACE(<< "shape = " << shape << ", averageShape = " << averageShape);
// Average error after 100 updates should be less than 8%.
BOOST_REQUIRE_CLOSE_ABSOLUTE(shape, averageShape, 0.08 * shape);
}
// Error should only increase in at most 7% of measurements.
BOOST_REQUIRE_CLOSE_ABSOLUTE(0.0, errorIncreased, 0.07 * tests);
}
}
BOOST_AUTO_TEST_CASE(testRateEstimation) {
// We are going to test that we correctly estimate a distribution
// for the rate of the gamma process by checking that the true
// rate of a gamma process lies in various confidence intervals
// the correct percentage of the times.
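// If the intervals are calibrated correctly then the number of misses in
// nTests trials is Binomial(nTests, 1 - p), so we expect roughly
// (1 - p) * nTests misses and allow a small amount of slop on top.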
const double decayRates[] = {0.0, 0.001, 0.01};
const unsigned int nTests = 100;
const double testIntervals[] = {50.0, 60.0, 70.0, 80.0,
85.0, 90.0, 95.0, 99.0};
for (size_t i = 0; i < std::size(decayRates); ++i) {
test::CRandomNumbers rng;
unsigned int errors[] = {0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u};
for (unsigned int test = 0; test < nTests; ++test) {
double shape = 2.0;
double scale = 0.2 * (test + 1.0);
double rate = 1.0 / scale;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 100, samples);
CGammaRateConjugate filter(makePrior(maths_t::E_ContinuousData, 0.0, decayRates[i]));
for (std::size_t j = 0; j < samples.size(); ++j) {
filter.addSamples(TDouble1Vec(1, samples[j]));
filter.propagateForwardsByTime(1.0);
}
for (size_t j = 0; j < std::size(testIntervals); ++j) {
TDoubleDoublePr confidenceInterval =
filter.confidenceIntervalRate(testIntervals[j]);
if (rate < confidenceInterval.first || rate > confidenceInterval.second) {
++errors[j];
}
}
}
for (size_t j = 0; j < std::size(testIntervals); ++j) {
// The number of errors should be inside the percentile bounds.
unsigned int maximumErrors = static_cast<unsigned int>(
std::ceil((1.0 - testIntervals[j] / 100.0) * nTests));
LOG_TRACE(<< "errors = " << errors[j] << ", maximumErrors = " << maximumErrors);
BOOST_TEST_REQUIRE(errors[j] <= maximumErrors + 2);
}
}
}
BOOST_AUTO_TEST_CASE(testMarginalLikelihood) {
// Check that the c.d.f. <= 1 at extreme.
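// Since the interface returns -log(F(x)), checking F(x) <= 1 amounts to
// checking that both bounds on -log(F(x)) are non-negative.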
maths_t::EDataType dataTypes[] = {maths_t::E_ContinuousData, maths_t::E_IntegerData};
for (std::size_t t = 0; t < std::size(dataTypes); ++t) {
CGammaRateConjugate filter(makePrior());
const double shape = 1.0;
const double scale = 1.0;
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 200, samples);
filter.addSamples(samples);
TWeightFunc weightsFuncs[]{static_cast<TWeightFunc>(maths_t::countWeight),
static_cast<TWeightFunc>(maths_t::outlierWeight)};
double weights[]{0.1, 0.9, 10.0};
for (std::size_t i = 0; i < std::size(weightsFuncs); ++i) {
for (std::size_t j = 0; j < std::size(weights); ++j) {
double lb, ub;
filter.minusLogJointCdf({1000.0}, {weightsFuncs[i](weights[j])}, lb, ub);
LOG_DEBUG(<< "-log(c.d.f) = " << (lb + ub) / 2.0);
BOOST_TEST_REQUIRE(lb >= 0.0);
BOOST_TEST_REQUIRE(ub >= 0.0);
}
}
}
// Check that the marginal likelihood and c.d.f. agree for some
// test data and that the c.d.f. <= 1.
const double decayRates[] = {0.0, 0.001, 0.01};
const double shape = 5.0;
const double scale = 1.0;
boost::math::gamma_distribution<> gamma(shape, scale);
double mean = boost::math::mean(gamma);
double variance = boost::math::variance(gamma);
test::CRandomNumbers rng;
unsigned int numberSamples[] = {4u, 10u, 300u, 500u};
const double tolerances[] = {1e-8, 1e-8, 0.01, 0.001};
for (size_t i = 0; i < std::size(numberSamples); ++i) {
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, numberSamples[i], samples);
for (size_t j = 0; j < std::size(decayRates); ++j) {
CGammaRateConjugate filter(
makePrior(maths_t::E_ContinuousData, 0.0, decayRates[j]));
for (std::size_t k = 0; k < samples.size(); ++k) {
filter.addSamples(TDouble1Vec(1, samples[k]));
filter.propagateForwardsByTime(1.0);
}
// We'll check that the p.d.f. is close to the derivative
// of the c.d.f. at a range of deltas from the true mean.
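// Specifically, we use the central difference
// dF/dx ~ (F(x + eps) - F(x - eps)) / (2 * eps) and compare it with
// exp(log-likelihood) at x.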
const double eps = 1e-4;
double deltas[] = {-2.0, -1.6, -1.2, -0.8, -0.4, -0.2, 0.0,
0.5, 1.0, 2.0, 3.0, 4.0, 5.0};
for (size_t k = 0; k < std::size(deltas); ++k) {
double x = mean + deltas[k] * std::sqrt(variance);
TDouble1Vec sample(1, x);
LOG_TRACE(<< "number = " << numberSamples[i] << ", sample = " << sample[0]);
double logLikelihood = 0.0;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter.jointLogMarginalLikelihood(sample, logLikelihood));
double pdf = std::exp(logLikelihood);
double lowerBound = 0.0, upperBound = 0.0;
sample[0] -= eps;
BOOST_TEST_REQUIRE(filter.minusLogJointCdf(sample, lowerBound, upperBound));
BOOST_REQUIRE_EQUAL(lowerBound, upperBound);
double minusLogCdf = (lowerBound + upperBound) / 2.0;
double cdfAtMinusEps = std::exp(-minusLogCdf);
BOOST_TEST_REQUIRE(minusLogCdf >= 0.0);
sample[0] += 2.0 * eps;
BOOST_TEST_REQUIRE(filter.minusLogJointCdf(sample, lowerBound, upperBound));
BOOST_REQUIRE_EQUAL(lowerBound, upperBound);
minusLogCdf = (lowerBound + upperBound) / 2.0;
double cdfAtPlusEps = std::exp(-minusLogCdf);
BOOST_TEST_REQUIRE(minusLogCdf >= 0.0);
double dcdfdx = (cdfAtPlusEps - cdfAtMinusEps) / 2.0 / eps;
LOG_TRACE(<< "pdf(x) = " << pdf << ", d(cdf)/dx = " << dcdfdx);
BOOST_REQUIRE_CLOSE_ABSOLUTE(pdf, dcdfdx, tolerances[i]);
}
}
}
{
// Test that the sample expectation of the log likelihood tends to
// the expected log likelihood for a gamma distribution (uniform law
// of large numbers), which is just the differential entropy of a
// gamma R.V.
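// For a gamma distribution with shape k and scale s the differential entropy
// is k + log(s) + log(Gamma(k)) + (1 - k) * digamma(k).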
double expectedDifferentialEntropy =
maths::common::CTools::differentialEntropy(gamma);
CGammaRateConjugate filter(makePrior());
double differentialEntropy = 0.0;
TDoubleVec seedSamples;
rng.generateGammaSamples(shape, scale, 100, seedSamples);
filter.addSamples(seedSamples);
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 100000, samples);
for (std::size_t i = 0; i < samples.size(); ++i) {
TDouble1Vec sample(1, samples[i]);
filter.addSamples(sample);
double logLikelihood = 0.0;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter.jointLogMarginalLikelihood(sample, logLikelihood));
differentialEntropy -= logLikelihood;
}
differentialEntropy /= static_cast<double>(samples.size());
LOG_DEBUG(<< "differentialEntropy = " << differentialEntropy
<< ", expectedDifferentialEntropy = " << expectedDifferentialEntropy);
BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedDifferentialEntropy, differentialEntropy, 0.0025);
}
const double varianceScales[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
0.7, 0.8, 0.9, 1.0, 1.2, 1.5,
2.0, 2.5, 3.0, 4.0, 5.0};
CGammaRateConjugate filter(makePrior());
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 1000, samples);
filter.addSamples(samples);
const double percentages[] = {5.0, 10.0, 20.0, 30.0, 40.0,
50.0, 60.0, 70.0, 80.0, 95.0};
{
// Test that marginal likelihood confidence intervals are
// what we'd expect for various variance scales.
TMeanAccumulator error;
for (std::size_t i = 0; i < std::size(percentages); ++i) {
double q1, q2;
filter.marginalLikelihoodQuantileForTest(50.0 - percentages[i] / 2.0, 1e-3, q1);
filter.marginalLikelihoodQuantileForTest(50.0 + percentages[i] / 2.0, 1e-3, q2);
TDoubleDoublePr interval =
filter.marginalLikelihoodConfidenceInterval(percentages[i]);
LOG_TRACE(<< "[q1, q2] = [" << q1 << ", " << q2 << "]"
<< ", interval = " << interval);
BOOST_REQUIRE_CLOSE_ABSOLUTE(q1, interval.first, 0.02);
BOOST_REQUIRE_CLOSE_ABSOLUTE(q2, interval.second, 0.02);
error.add(std::fabs(interval.first - q1));
error.add(std::fabs(interval.second - q2));
}
LOG_DEBUG(<< "error = " << maths::common::CBasicStatistics::mean(error));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(error) < 4e-3);
}
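// Repeat with a variance scale applied. Scaling the variance by vs while
// keeping the mean fixed corresponds, by moment matching, to a
// gamma(shape / vs, vs * scale) distribution, whose quantiles we compare
// against below.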
{
maths_t::TDoubleWeightsAry weight(maths_t::CUnitWeights::UNIT);
TMeanAccumulator totalError;
for (std::size_t i = 0; i < std::size(varianceScales); ++i) {
TMeanAccumulator error;
double vs = varianceScales[i];
maths_t::setCountVarianceScale(vs, weight);
LOG_DEBUG(<< "*** vs = " << vs << " ***");
for (std::size_t j = 0; j < std::size(percentages); ++j) {
boost::math::gamma_distribution<> scaledGamma(shape / vs, vs * scale);
double q1 = boost::math::quantile(
scaledGamma, (50.0 - percentages[j] / 2.0) / 100.0);
double q2 = boost::math::quantile(
scaledGamma, (50.0 + percentages[j] / 2.0) / 100.0);
TDoubleDoublePr interval =
filter.marginalLikelihoodConfidenceInterval(percentages[j], weight);
LOG_TRACE(<< "[q1, q2] = [" << q1 << ", " << q2 << "]"
<< ", interval = " << interval);
BOOST_REQUIRE_CLOSE_ABSOLUTE(q1, interval.first, 0.4);
BOOST_REQUIRE_CLOSE_ABSOLUTE(q2, interval.second, 0.4);
error.add(std::fabs(interval.first - q1));
error.add(std::fabs(interval.second - q2));
}
LOG_DEBUG(<< "error = " << maths::common::CBasicStatistics::mean(error));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(error) < 0.09);
totalError += error;
}
LOG_DEBUG(<< "totalError = " << maths::common::CBasicStatistics::mean(totalError));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(totalError) < 0.042);
}
}
BOOST_AUTO_TEST_CASE(testMarginalLikelihoodMean) {
// Test that the expectation of the marginal likelihood matches
// the expected mean of the marginal likelihood.
const double shapes[] = {5.0, 20.0, 40.0};
const double scales[] = {1.0, 10.0, 20.0};
test::CRandomNumbers rng;
for (std::size_t i = 0; i < std::size(shapes); ++i) {
for (std::size_t j = 0; j < std::size(scales); ++j) {
LOG_DEBUG(<< "*** shape = " << shapes[i] << ", scale = " << scales[j] << " ***");
CGammaRateConjugate filter(makePrior());
TDoubleVec seedSamples;
rng.generateGammaSamples(shapes[i], scales[j], 3, seedSamples);
filter.addSamples(seedSamples);
TDoubleVec samples;
rng.generateGammaSamples(shapes[i], scales[j], 100, samples);
for (std::size_t k = 0; k < samples.size(); ++k) {
filter.addSamples(TDouble1Vec(1, samples[k]));
double expectedMean;
BOOST_TEST_REQUIRE(filter.marginalLikelihoodMeanForTest(expectedMean));
LOG_TRACE(<< "marginalLikelihoodMean = " << filter.marginalLikelihoodMean()
<< ", expectedMean = " << expectedMean);
// The error is mainly due to the truncation in the
// integration range used to compute the expected mean.
BOOST_REQUIRE_CLOSE_ABSOLUTE(
expectedMean, filter.marginalLikelihoodMean(), 1e-3 * expectedMean);
}
}
}
}
BOOST_AUTO_TEST_CASE(testMarginalLikelihoodMode) {
// Test that the marginal likelihood mode is what we'd expect
// with various variance scales.
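// Note that boost::math::mode returns (k - 1) * s for a gamma distribution
// with shape k >= 1 and scale s, here applied to the moment-matched scaled
// distribution gamma(shape / vs, vs * scale).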
const double shapes[] = {5.0, 20.0, 40.0};
const double scales[] = {1.0, 10.0, 20.0};
const double varianceScales[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
0.9, 1.0, 1.2, 1.5, 2.0, 2.5, 3.0, 4.0};
test::CRandomNumbers rng;
for (std::size_t i = 0; i < std::size(shapes); ++i) {
for (std::size_t j = 0; j < std::size(scales); ++j) {
LOG_DEBUG(<< "*** shape = " << shapes[i] << ", scale = " << scales[j] << " ***");
CGammaRateConjugate filter(makePrior());
TDoubleVec samples;
rng.generateGammaSamples(shapes[i], scales[j], 1000, samples);
filter.addSamples(samples);
TMeanAccumulator relativeError;
maths_t::TDoubleWeightsAry weight(maths_t::CUnitWeights::UNIT);
for (std::size_t k = 0; k < std::size(varianceScales); ++k) {
double vs = varianceScales[k];
maths_t::setCountVarianceScale(vs, weight);
boost::math::gamma_distribution<> scaledGamma(shapes[i] / vs,
vs * scales[j]);
double expectedMode = boost::math::mode(scaledGamma);
LOG_TRACE(<< "marginalLikelihoodMode = " << filter.marginalLikelihoodMode(weight)
<< ", expectedMode = " << expectedMode);
BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedMode,
filter.marginalLikelihoodMode(weight),
0.28 * expectedMode + 0.3);
double error = std::fabs(filter.marginalLikelihoodMode(weight) - expectedMode);
relativeError.add(error == 0.0 ? 0.0 : error / expectedMode);
}
LOG_DEBUG(<< "relativeError = "
<< maths::common::CBasicStatistics::mean(relativeError));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(relativeError) < 0.08);
}
}
}
BOOST_AUTO_TEST_CASE(testMarginalLikelihoodVariance) {
// Test that the expectation of the residual from the mean for
// the marginal likelihood matches the expected variance of the
// marginal likelihood.
const double shapes[] = {5.0, 20.0, 40.0};
const double scales[] = {1.0, 10.0, 20.0};
test::CRandomNumbers rng;
for (std::size_t i = 0; i < std::size(shapes); ++i) {
for (std::size_t j = 0; j < std::size(scales); ++j) {
LOG_DEBUG(<< "*** shape = " << shapes[i] << ", scale = " << scales[j] << " ***");
CGammaRateConjugate filter(makePrior());
TDoubleVec seedSamples;
rng.generateGammaSamples(shapes[i], scales[j], 10, seedSamples);
filter.addSamples(seedSamples);
TDoubleVec samples;
rng.generateGammaSamples(shapes[i], scales[j], 100, samples);
TMeanAccumulator relativeError;
for (std::size_t k = 0; k < samples.size(); ++k) {
filter.addSamples(TDouble1Vec(1, samples[k]));
double expectedVariance;
BOOST_TEST_REQUIRE(filter.marginalLikelihoodVarianceForTest(expectedVariance));
LOG_TRACE(<< "marginalLikelihoodVariance = " << filter.marginalLikelihoodVariance()
<< ", expectedVariance = " << expectedVariance);
// The error is mainly due to the truncation in the
// integration range used to compute the expected mean.
BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedVariance,
filter.marginalLikelihoodVariance(),
0.01 * expectedVariance);
relativeError.add(std::fabs(expectedVariance -
filter.marginalLikelihoodVariance()) /
expectedVariance);
}
LOG_DEBUG(<< "relativeError = "
<< maths::common::CBasicStatistics::mean(relativeError));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(relativeError) < 0.0012);
}
}
}
BOOST_AUTO_TEST_CASE(testSampleMarginalLikelihood) {
// We're going to test two properties of the sampling:
// 1) That the sample mean is equal to the marginal
// likelihood mean.
// 2) That the sample percentiles match the distribution
// percentiles.
// I want to cross check these with the implementations of the
// jointLogMarginalLikelihood and minusLogJointCdf so use these
// to compute the mean and percentiles.
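// With numberSampled samples the sorted sample values should interleave the
// marginal likelihood quantiles, i.e. the j / numberSampled quantile should
// lie between the j'th and (j + 1)'th smallest samples.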
const double shape = 5.0;
const double scale = 1.0;
const double eps = 1e-3;
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 50, samples);
CGammaRateConjugate filter(makePrior());
TDouble1Vec sampled;
TMeanVarAccumulator sampleMeanVar;
for (std::size_t i = 0; i < 3; ++i) {
sampleMeanVar.add(samples[i]);
filter.addSamples(TDouble1Vec(1, samples[i]));
sampled.clear();
filter.sampleMarginalLikelihood(10, sampled);
TMeanVarAccumulator sampledMeanVar;
sampledMeanVar = std::for_each(sampled.begin(), sampled.end(), sampledMeanVar);
BOOST_REQUIRE_EQUAL(i + 1, sampled.size());
BOOST_REQUIRE_CLOSE_ABSOLUTE(
maths::common::CBasicStatistics::mean(sampleMeanVar),
maths::common::CBasicStatistics::mean(sampledMeanVar), eps);
BOOST_REQUIRE_CLOSE_ABSOLUTE(
maths::common::CBasicStatistics::variance(sampleMeanVar),
maths::common::CBasicStatistics::variance(sampledMeanVar), eps);
}
TMeanAccumulator meanVarError;
std::size_t numberSampled = 20;
for (std::size_t i = 3; i < samples.size(); ++i) {
filter.addSamples(TDouble1Vec(1, samples[i]));
sampled.clear();
filter.sampleMarginalLikelihood(numberSampled, sampled);
BOOST_REQUIRE_EQUAL(numberSampled, sampled.size());
TMeanVarAccumulator sampledMoments;
sampledMoments = std::for_each(sampled.begin(), sampled.end(), sampledMoments);
LOG_TRACE(<< "expectedMean = " << filter.marginalLikelihoodMean() << ", sampledMean = "
<< maths::common::CBasicStatistics::mean(sampledMoments));
LOG_TRACE(<< "expectedVar = " << filter.marginalLikelihoodVariance() << ", sampledVar = "
<< maths::common::CBasicStatistics::variance(sampledMoments));
BOOST_REQUIRE_CLOSE_ABSOLUTE(
filter.marginalLikelihoodMean(),
maths::common::CBasicStatistics::mean(sampledMoments), 1e-8);
BOOST_REQUIRE_CLOSE_ABSOLUTE(filter.marginalLikelihoodVariance(),
maths::common::CBasicStatistics::variance(sampledMoments),
0.25 * filter.marginalLikelihoodVariance());
meanVarError.add(std::fabs(filter.marginalLikelihoodVariance() -
maths::common::CBasicStatistics::variance(sampledMoments)) /
filter.marginalLikelihoodVariance());
std::sort(sampled.begin(), sampled.end());
for (std::size_t j = 1; j < sampled.size(); ++j) {
double q = 100.0 * static_cast<double>(j) / static_cast<double>(numberSampled);
double expectedQuantile;
BOOST_TEST_REQUIRE(filter.marginalLikelihoodQuantileForTest(q, eps, expectedQuantile));
LOG_TRACE(<< "quantile = " << q << ", x_quantile = " << expectedQuantile << ", quantile range = ["
<< sampled[j - 1u] << "," << sampled[j] << "]");
BOOST_TEST_REQUIRE(expectedQuantile >= sampled[j - 1u]);
BOOST_TEST_REQUIRE(expectedQuantile <= sampled[j]);
}
}
LOG_DEBUG(<< "mean variance error = "
<< maths::common::CBasicStatistics::mean(meanVarError));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanVarError) < 0.025);
}
BOOST_AUTO_TEST_CASE(testCdf) {
// Test error cases.
//
// Test some invariants:
// "cdf" + "cdf complement" = 1
// cdf of x for x < 0 is (effectively) 0
// cdf complement of x for x < 0 is 1
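// Note that the interface works with -log(F(x)) and -log(1 - F(x)), so the
// first invariant is checked below as exp(-f) + exp(-fComplement) = 1.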
double shape = 5.0;
double scale = 0.5;
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 100, samples);
CGammaRateConjugate filter(makePrior());
filter.addSamples(samples);
double lowerBound;
double upperBound;
BOOST_TEST_REQUIRE(!filter.minusLogJointCdf(TDouble1Vec(), lowerBound, upperBound));
BOOST_TEST_REQUIRE(!filter.minusLogJointCdfComplement(TDouble1Vec(), lowerBound, upperBound));
BOOST_TEST_REQUIRE(filter.minusLogJointCdf(TDouble1Vec(1, -1.0), lowerBound, upperBound));
double f = (lowerBound + upperBound) / 2.0;
BOOST_TEST_REQUIRE(filter.minusLogJointCdfComplement(TDouble1Vec(1, -1.0),
lowerBound, upperBound));
double fComplement = (lowerBound + upperBound) / 2.0;
LOG_DEBUG(<< "log(F(x)) = " << -f << ", log(1 - F(x)) = " << fComplement);
BOOST_REQUIRE_CLOSE_ABSOLUTE(std::log(std::numeric_limits<double>::min()), -f, 1e-10);
BOOST_REQUIRE_EQUAL(1.0, std::exp(-fComplement));
for (std::size_t i = 1; i < 500; ++i) {
double x = static_cast<double>(i) / 5.0;
BOOST_TEST_REQUIRE(filter.minusLogJointCdf(TDouble1Vec(1, x), lowerBound, upperBound));
f = (lowerBound + upperBound) / 2.0;
BOOST_TEST_REQUIRE(filter.minusLogJointCdfComplement(
TDouble1Vec(1, x), lowerBound, upperBound));
fComplement = (lowerBound + upperBound) / 2.0;
LOG_TRACE(<< "log(F(x)) = " << (f == 0.0 ? f : -f) << ", log(1 - F(x)) = "
<< (fComplement == 0.0 ? fComplement : -fComplement));
BOOST_REQUIRE_CLOSE_ABSOLUTE(1.0, std::exp(-f) + std::exp(-fComplement), 1e-10);
}
}
BOOST_AUTO_TEST_CASE(testProbabilityOfLessLikelySamples) {
// We test that the probability of less likely samples calculation
// agrees with the chance of seeing a sample with lower marginal
// likelihood, up to the sampling error.
//
// We also check that the tail calculation attributes samples to
// the appropriate tail of the distribution.
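// The empirical estimate of P(x) from n samples has standard error
// sqrt(P(x) * (1 - P(x)) / n), so we require agreement to within three
// standard errors.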
const double shapes[] = {0.4, 10.0, 200.0};
const double scales[] = {0.1, 5.0, 50.0};
const double vs[] = {0.5, 1.0, 2.0};
test::CRandomNumbers rng;
TMeanAccumulator meanError;
for (size_t i = 0; i < std::size(shapes); ++i) {
for (size_t j = 0; j < std::size(scales); ++j) {
LOG_DEBUG(<< "shape = " << shapes[i] << ", scale = " << scales[j]);
TDoubleVec samples;
rng.generateGammaSamples(shapes[i], scales[j], 1000, samples);
CGammaRateConjugate filter(makePrior());
filter.addSamples(samples);
double shape_ = filter.likelihoodShape();
double rate_ = filter.likelihoodRate();
TDoubleVec likelihoods;
for (std::size_t k = 0; k < samples.size(); ++k) {
double likelihood;
filter.jointLogMarginalLikelihood(TDouble1Vec(1, samples[k]), likelihood);
likelihoods.push_back(likelihood);
}
std::sort(likelihoods.begin(), likelihoods.end());
boost::math::gamma_distribution<> gamma(shape_, 1.0 / rate_);
for (std::size_t k = 1; k < 10; ++k) {
double x = boost::math::quantile(gamma, static_cast<double>(k) / 10.0);
TDouble1Vec sample(1, x);
double fx;
filter.jointLogMarginalLikelihood(sample, fx);
double px = static_cast<double>(std::lower_bound(likelihoods.begin(),
likelihoods.end(), fx) -
likelihoods.begin()) /
static_cast<double>(likelihoods.size());
double lb, ub;
filter.probabilityOfLessLikelySamples(maths_t::E_TwoSided, sample, lb, ub);
double ssd = std::sqrt(px * (1.0 - px) /
static_cast<double>(samples.size()));
LOG_TRACE(<< "expected P(x) = " << px << ", actual P(x) = "
<< (lb + ub) / 2.0 << " sample sd = " << ssd);
BOOST_REQUIRE_CLOSE_ABSOLUTE(px, (lb + ub) / 2.0, 3.0 * ssd);
meanError.add(std::fabs(px - (lb + ub) / 2.0));
}
for (std::size_t k = 0; k < std::size(vs); ++k) {
double mode = filter.marginalLikelihoodMode(
maths_t::countVarianceScaleWeight(vs[k]));
double ss[] = {0.9 * mode, 1.1 * mode};
LOG_DEBUG(<< "vs = " << vs[k] << ", mode = " << mode);
double lb, ub;
maths_t::ETail tail;
{
filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, {ss[0]},
{maths_t::countVarianceScaleWeight(vs[k])}, lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_LeftTail, tail);
if (mode > 0.0) {
filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, TDouble1Vec(ss, ss + 2),
maths_t::TDoubleWeightsAry1Vec(
2, maths_t::countVarianceScaleWeight(vs[k])),
lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_MixedOrNeitherTail, tail);
filter.probabilityOfLessLikelySamples(
maths_t::E_OneSidedBelow, TDouble1Vec(ss, ss + 2),
maths_t::TDoubleWeightsAry1Vec(
2, maths_t::countVarianceScaleWeight(vs[k])),
lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_LeftTail, tail);
filter.probabilityOfLessLikelySamples(
maths_t::E_OneSidedAbove, TDouble1Vec(ss, ss + 2),
maths_t::TDoubleWeightsAry1Vec(
2, maths_t::countVarianceScaleWeight(vs[k])),
lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_RightTail, tail);
}
}
if (mode > 0.0) {
filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, {ss[1]},
{maths_t::countVarianceScaleWeight(vs[k])}, lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_RightTail, tail);
filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, TDouble1Vec(ss, ss + 2),
maths_t::TDoubleWeightsAry1Vec(
2, maths_t::countVarianceScaleWeight(vs[k])),
lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_MixedOrNeitherTail, tail);
filter.probabilityOfLessLikelySamples(
maths_t::E_OneSidedBelow, TDouble1Vec(ss, ss + 2),
maths_t::TDoubleWeightsAry1Vec(
2, maths_t::countVarianceScaleWeight(vs[k])),
lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_LeftTail, tail);
filter.probabilityOfLessLikelySamples(
maths_t::E_OneSidedAbove, TDouble1Vec(ss, ss + 2),
maths_t::TDoubleWeightsAry1Vec(
2, maths_t::countVarianceScaleWeight(vs[k])),
lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_RightTail, tail);
}
}
}
}
LOG_DEBUG(<< "mean error = " << maths::common::CBasicStatistics::mean(meanError));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanError) < 0.01);
}
BOOST_AUTO_TEST_CASE(testAnomalyScore) {
// This test pushes 500 samples through the filter and adds in
// anomalous signals in the bins at 30, 120, 300 and 420 with
// magnitude 4, 5, 10 and 15 standard deviations, respectively,
// and checks the anomaly score has:
// 1) high probability of detecting the anomalies, and
// 2) a very low rate of false positives.
using TUIntVec = std::vector<unsigned int>;
const double decayRates[] = {0.0, 0.001, 0.01};
const double shapes[] = {0.4, 10.0, 200.0};
const double scales[] = {0.1, 5.0, 50.0};
const double threshold = 0.02;
const unsigned int anomalyTimes[] = {30u, 120u, 300u, 420u};
const double anomalies[] = {4.0, 5.0, 10.0, 15.0, 0.0};
test::CRandomNumbers rng;
unsigned int test = 0;
//std::ofstream file;
//file.open("results.m");
double totalFalsePositiveRate = 0.0;
std::size_t totalPositives[] = {0u, 0u, 0u};
for (size_t i = 0; i < std::size(shapes); ++i) {
for (size_t j = 0; j < std::size(scales); ++j) {
LOG_DEBUG(<< "shape = " << shapes[i] << ", scale = " << scales[j]);
boost::math::gamma_distribution<> gamma(shapes[i], scales[j]);
TDoubleVec samples;
rng.generateGammaSamples(shapes[i], scales[j], 500, samples);
for (size_t k = 0; k < std::size(decayRates); ++k) {
CGammaRateConjugate filter(
makePrior(maths_t::E_ContinuousData, 0.0, decayRates[k]));
++test;
//std::ostringstream x;
//std::ostringstream scores;
//x << "x" << test << " = [";
//scores << "score" << test << " = [";
TUIntVec candidateAnomalies;
for (unsigned int time = 0; time < samples.size(); ++time) {
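// If "time" is not one of the anomaly times std::find returns the end
// iterator, which indexes the trailing 0.0 in "anomalies", i.e. no shift
// is applied to the sample.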
double sample =
samples[time] +
(anomalies[std::find(std::begin(anomalyTimes), std::end(anomalyTimes), time) -
std::begin(anomalyTimes)] *
boost::math::standard_deviation(gamma));
TDouble1Vec sampleVec(1, sample);
filter.addSamples(sampleVec);
double score;
filter.anomalyScore(maths_t::E_TwoSided, sampleVec, score);
if (score > threshold) {
candidateAnomalies.push_back(time);
}
filter.propagateForwardsByTime(1.0);
//x << time << " ";
//scores << score << " ";
}
//x << "];\n";
//scores << "];\n";
//file << x.str() << scores.str() << "plot(x" << test << ", score"
// << test << ");\n"
// << "input(\"Hit any key for next test\");\n\n";
TUIntVec falsePositives;
std::set_difference(candidateAnomalies.begin(),
candidateAnomalies.end(),
std::begin(anomalyTimes), std::end(anomalyTimes),
std::back_inserter(falsePositives));
double falsePositiveRate = static_cast<double>(falsePositives.size()) /
static_cast<double>(samples.size());
totalFalsePositiveRate += falsePositiveRate;
TUIntVec positives;
std::set_intersection(candidateAnomalies.begin(),
candidateAnomalies.end(),
std::begin(anomalyTimes), std::end(anomalyTimes),
std::back_inserter(positives));
LOG_DEBUG(<< "falsePositiveRate = " << falsePositiveRate
<< ", positives = " << positives.size());
// False alarm rate should be less than 0.6%.
BOOST_TEST_REQUIRE(falsePositiveRate <= 0.006);
// Should detect at least the two big anomalies.
BOOST_TEST_REQUIRE(positives.size() >= 2);
totalPositives[k] += positives.size();
}
}
}
totalFalsePositiveRate /= static_cast<double>(test);
LOG_DEBUG(<< "totalFalsePositiveRate = " << totalFalsePositiveRate);
for (size_t i = 0; i < std::size(totalPositives); ++i) {
LOG_DEBUG(<< "positives = " << totalPositives[i]);
BOOST_TEST_REQUIRE(totalPositives[i] >= 24);
}
// Total false alarm rate should be less than 0.11%.
BOOST_TEST_REQUIRE(totalFalsePositiveRate < 0.0011);
}
BOOST_AUTO_TEST_CASE(testOffset) {
// The idea of this test is to check that the offset correctly cancels
// out a translation applied to a gamma distributed data set.
const maths_t::EDataType dataTypes[] = {maths_t::E_IntegerData, maths_t::E_ContinuousData};
const double offsets[] = {-0.5, 0.5};
const double decayRates[] = {0.0, 0.001, 0.01};
const double shape = 5.0;
const double scale = 1.0;
const double eps = 1e-8;
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 100, samples);
for (size_t i = 0; i < std::size(dataTypes); ++i) {
for (size_t j = 0; j < std::size(offsets); ++j) {
for (size_t k = 0; k < std::size(decayRates); ++k) {
CGammaRateConjugate filter1(
makePrior(dataTypes[i], offsets[j], decayRates[k]));
CGammaRateConjugate filter2(makePrior(dataTypes[i], 0.0, decayRates[k]));
for (std::size_t l = 0; l < samples.size(); ++l) {
double offsetSample = samples[l] - offsets[j];
TDouble1Vec offsetSampleVec(1, offsetSample);
filter1.addSamples(offsetSampleVec);
filter1.propagateForwardsByTime(1.0);
double x = samples[l];
TDouble1Vec sample(1, x);
filter2.addSamples(sample);
filter2.propagateForwardsByTime(1.0);
double likelihood1;
filter1.jointLogMarginalLikelihood(offsetSampleVec, likelihood1);
double lowerBound1, upperBound1;
filter1.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, offsetSampleVec, lowerBound1, upperBound1);
BOOST_REQUIRE_EQUAL(lowerBound1, upperBound1);
double probability1 = (lowerBound1 + upperBound1) / 2.0;
double likelihood2;
filter2.jointLogMarginalLikelihood(sample, likelihood2);
double lowerBound2, upperBound2;
filter2.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, sample, lowerBound2, upperBound2);
BOOST_REQUIRE_EQUAL(lowerBound2, upperBound2);
double probability2 = (lowerBound2 + upperBound2) / 2.0;
BOOST_REQUIRE_CLOSE_ABSOLUTE(likelihood1, likelihood2, eps);
BOOST_REQUIRE_CLOSE_ABSOLUTE(probability1, probability2, eps);
}
using TEqual = maths::common::CEqualWithTolerance<double>;
TEqual equal(maths::common::CToleranceTypes::E_AbsoluteTolerance, eps);
BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
}
}
}
}
BOOST_AUTO_TEST_CASE(testIntegerData) {
// If the data are discrete then we approximate the discrete distribution
// by saying it is uniform on the intervals [n,n+1] for each integral n.
// The idea of this test is to check that the inferred model agrees in the
// limit (large n) with the model inferred from such data.
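// To mimic this with the continuous model we feed it floor(x) plus an
// independent U(0, 1) jitter, which is exactly uniform on [n, n+1] given
// floor(x) = n.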
const double shapes[] = {0.2, 1.0, 4.5};
const double scales[] = {0.2, 1.0, 4.5};
const std::size_t nSamples = 25000;
for (size_t i = 0; i < std::size(shapes); ++i) {
for (size_t j = 0; j < std::size(scales); ++j) {
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(shapes[i], scales[j], nSamples, samples);
TDoubleVec uniform;
rng.generateUniformSamples(0.0, 1.0, nSamples, uniform);
CGammaRateConjugate filter1(makePrior(maths_t::E_IntegerData, 0.1));
CGammaRateConjugate filter2(makePrior(maths_t::E_ContinuousData, 0.1));
for (std::size_t k = 0; k < nSamples; ++k) {
double x = std::floor(samples[k]);
TDouble1Vec sample(1, x);
filter1.addSamples(sample);
sample[0] += uniform[k];
filter2.addSamples(sample);
}
using TEqual = maths::common::CEqualWithTolerance<double>;
TEqual equal(maths::common::CToleranceTypes::E_RelativeTolerance, 0.02);
BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
}
}
TMeanAccumulator meanError;
for (size_t i = 0; i < std::size(shapes); ++i) {
for (size_t j = 0; j < std::size(scales); ++j) {
test::CRandomNumbers rng;
TDoubleVec seedSamples;
rng.generateGammaSamples(shapes[i], scales[j], 100, seedSamples);
CGammaRateConjugate filter1(makePrior(maths_t::E_IntegerData, 0.1));
filter1.addSamples(seedSamples);
CGammaRateConjugate filter2 = filter1;
filter2.dataType(maths_t::E_ContinuousData);
TDoubleVec samples;
rng.generateGammaSamples(shapes[i], scales[j], nSamples, samples);
TDoubleVec uniform;
rng.generateUniformSamples(0.0, 1.0, nSamples, uniform);
TMeanAccumulator meanProbability1;
TMeanAccumulator meanProbability2;
for (std::size_t k = 0; k < nSamples; ++k) {
double x = std::floor(samples[k]);
TDouble1Vec sample(1, x);
double l1, u1;
BOOST_TEST_REQUIRE(filter1.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, sample, l1, u1));
BOOST_REQUIRE_EQUAL(l1, u1);
double p1 = (l1 + u1) / 2.0;
meanProbability1.add(p1);
sample[0] += uniform[k];
double l2, u2;
BOOST_TEST_REQUIRE(filter2.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, sample, l2, u2));
BOOST_REQUIRE_EQUAL(l2, u2);
double p2 = (l2 + u2) / 2.0;
meanProbability2.add(p2);
}
double p1 = maths::common::CBasicStatistics::mean(meanProbability1);
double p2 = maths::common::CBasicStatistics::mean(meanProbability2);
LOG_DEBUG(<< "shape = " << shapes[i] << ", rate = " << scales[j]
<< ", p1 = " << p1 << ", p2 = " << p2);
BOOST_REQUIRE_CLOSE_ABSOLUTE(p1, p2, 0.15 * p1);
meanError.add(std::fabs(p1 - p2));
}
}
LOG_DEBUG(<< "mean error = " << maths::common::CBasicStatistics::mean(meanError));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanError) < 0.016);
}
BOOST_AUTO_TEST_CASE(testLowVariationData) {
{
CGammaRateConjugate filter(makePrior(maths_t::E_IntegerData));
for (std::size_t i = 0; i < 100; ++i) {
filter.addSamples(TDouble1Vec(1, 430.0));
}
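// With identical integer-valued samples the only variation comes from the
// implicit U(0, 1) jitter on [n, n+1], whose variance is 1/12, so the
// precision 1 / sigma^2 should be roughly 12.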
TDoubleDoublePr interval = filter.marginalLikelihoodConfidenceInterval(68.0);
double sigma = (interval.second - interval.first) / 2.0;
LOG_DEBUG(<< "68% confidence interval " << interval
<< ", approximate variance = " << sigma * sigma);
BOOST_REQUIRE_CLOSE_ABSOLUTE(12.0, 1.0 / (sigma * sigma), 0.5);
}
{
CGammaRateConjugate filter(makePrior(maths_t::E_ContinuousData));
for (std::size_t i = 0; i < 100; ++i) {
filter.addSamples(TDouble1Vec(1, 430.0));
}
TDoubleDoublePr interval = filter.marginalLikelihoodConfidenceInterval(68.0);
double sigma = (interval.second - interval.first) / 2.0;
LOG_DEBUG(<< "68% confidence interval " << interval
<< ", approximate s.t.d. = " << sigma);
BOOST_REQUIRE_CLOSE_ABSOLUTE(1e-4, sigma / 430.0, 5e-6);
}
}
BOOST_AUTO_TEST_CASE(testPersist) {
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(1.0, 3.0, 500, samples);
maths::common::CGammaRateConjugate origFilter(makePrior(maths_t::E_ContinuousData, 0.1));
for (std::size_t i = 0; i < samples.size(); ++i) {
origFilter.addSamples({samples[i]}, maths_t::CUnitWeights::SINGLE_UNIT);
}
double decayRate = origFilter.decayRate();
std::ostringstream origJson;
core::CJsonStatePersistInserter::persist(
origJson, std::bind_front(&maths::common::CGammaRateConjugate::acceptPersistInserter,
&origFilter));
LOG_DEBUG(<< "Gamma rate conjugate JSON representation:\n"
<< origJson.str());
// Restore the JSON into a new filter
std::istringstream origJsonStrm{"{\"topLevel\" : " + origJson.str() + "}"};
core::CJsonStateRestoreTraverser traverser(origJsonStrm);
maths::common::SDistributionRestoreParams params(
maths_t::E_ContinuousData, decayRate + 0.1, maths::common::MINIMUM_CLUSTER_SPLIT_FRACTION,
maths::common::MINIMUM_CLUSTER_SPLIT_COUNT, maths::common::MINIMUM_CATEGORY_COUNT);
maths::common::CGammaRateConjugate restoredFilter(params, traverser);
std::uint64_t checksum = origFilter.checksum();
LOG_DEBUG(<< "orig checksum = " << checksum
<< " restored checksum = " << restoredFilter.checksum());
BOOST_REQUIRE_EQUAL(checksum, restoredFilter.checksum());
// The JSON representation of the new filter should be the same as the original
std::ostringstream newJson;
core::CJsonStatePersistInserter::persist(
newJson, std::bind_front(&maths::common::CGammaRateConjugate::acceptPersistInserter,
&restoredFilter));
BOOST_REQUIRE_EQUAL(origJson.str(), newJson.str());
}
BOOST_AUTO_TEST_CASE(testVarianceScale) {
// The strategy for this test is to check we correctly account
// for variance scaling by scaling the variance of a collection
// of samples and then checking that the percentiles for those
// samples "probability of less likely sample" are correct. In
// particular, we expect that the calculation correctly predicts
// the fraction of samples which have lower probability for all
// percentiles (if the variance scaling was off we'd either get
// too few or too many).
//
// We use the same idea for testing the variance scaling for
// the marginal likelihood. The sample mean of the scaled log
// likelihood tends to the expected log likelihood for a scaled
// gamma distribution (uniform law of large numbers), which is
// just the differential entropy of a scaled gamma R.V.
//
// Finally, we test update with scaled samples produces the
// correct posterior.
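// As elsewhere, a variance scale s maps gamma(shape, scale) to the
// moment-matched gamma(shape / s, s * scale). The percentile errors of an
// unscaled run are measured first and folded into the error thresholds for
// the scaled runs.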
TWeightFunc weightsFuncs[]{
static_cast<TWeightFunc>(maths_t::seasonalVarianceScaleWeight),
static_cast<TWeightFunc>(maths_t::countVarianceScaleWeight)};
for (std::size_t s = 0; s < std::size(weightsFuncs); ++s) {
const double shape = 3.0;
const double scale = 3.0;
const double varianceScales[] = {0.20, 0.50, 0.75, 1.50, 2.00, 5.00};
test::CRandomNumbers rng;
LOG_DEBUG(<< "");
LOG_DEBUG(<< "****** probabilityOfLessLikelySamples ******");
{
const double percentiles[] = {10.0, 20.0, 30.0, 40.0, 50.0,
60.0, 70.0, 80.0, 90.0};
const std::size_t nSamples = 1000;
const std::size_t nScaledSamples = 10000;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, nSamples, samples);
CGammaRateConjugate filter(makePrior());
filter.addSamples(samples);
TDoubleVec expectedPercentileErrors;
double expectedTotalError = 0.0;
{
TDoubleVec unscaledSamples;
rng.generateGammaSamples(shape, scale, nScaledSamples, unscaledSamples);
TDoubleVec probabilities;
probabilities.reserve(nScaledSamples);
for (std::size_t i = 0; i < unscaledSamples.size(); ++i) {
TDouble1Vec sample(1, unscaledSamples[i]);
double lowerBound, upperBound;
BOOST_TEST_REQUIRE(filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, sample, lowerBound, upperBound));
BOOST_REQUIRE_EQUAL(lowerBound, upperBound);
double probability = (lowerBound + upperBound) / 2.0;
probabilities.push_back(probability);
}
std::sort(probabilities.begin(), probabilities.end());
for (size_t i = 0; i < std::size(percentiles); ++i) {
std::size_t index = static_cast<std::size_t>(
static_cast<double>(nScaledSamples) * percentiles[i] / 100.0);
double error = std::fabs(probabilities[index] - percentiles[i] / 100.0);
expectedPercentileErrors.push_back(error);
expectedTotalError += error;
}
}
for (size_t i = 0; i < std::size(varianceScales); ++i) {
LOG_DEBUG(<< "**** variance scale = " << varianceScales[i] << " ****");
double scaledShape = shape / varianceScales[i];
double ss = varianceScales[i] * scale;
boost::math::gamma_distribution<> gamma(scaledShape, ss);
LOG_DEBUG(<< "mean = " << boost::math::mean(gamma)
<< ", variance = " << boost::math::variance(gamma));
TDoubleVec scaledSamples;
rng.generateGammaSamples(scaledShape, ss, nScaledSamples, scaledSamples);
TDoubleVec probabilities;
probabilities.reserve(nScaledSamples);
for (std::size_t j = 0; j < scaledSamples.size(); ++j) {
double lowerBound, upperBound;
maths_t::ETail tail;
BOOST_TEST_REQUIRE(filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, {scaledSamples[j]},
{weightsFuncs[s](varianceScales[i])}, lowerBound, upperBound, tail));
BOOST_REQUIRE_EQUAL(lowerBound, upperBound);
double probability = (lowerBound + upperBound) / 2.0;
probabilities.push_back(probability);
}
std::sort(probabilities.begin(), probabilities.end());
double totalError = 0.0;
for (size_t j = 0; j < std::size(percentiles); ++j) {
std::size_t index = static_cast<std::size_t>(
static_cast<double>(nScaledSamples) * percentiles[j] / 100.0);
double error = std::fabs(probabilities[index] - percentiles[j] / 100.0);
totalError += error;
double errorThreshold = 0.017 + expectedPercentileErrors[j];
LOG_TRACE(<< "percentile = " << percentiles[j] << ", probability = "
<< probabilities[index] << ", error = " << error
<< ", error threshold = " << errorThreshold);
BOOST_TEST_REQUIRE(error < errorThreshold);
}
double totalErrorThreshold = 0.1 + expectedTotalError;
LOG_DEBUG(<< "total error = " << totalError
<< ", totalError threshold = " << totalErrorThreshold);
BOOST_TEST_REQUIRE(totalError < totalErrorThreshold);
}
}
LOG_DEBUG(<< "");
LOG_DEBUG(<< "****** jointLogMarginalLikelihood ******");
for (size_t i = 0; i < std::size(varianceScales); ++i) {
LOG_DEBUG(<< "**** variance scale = " << varianceScales[i] << " ****");
double scaledShape = shape / varianceScales[i];
double scaledScale = varianceScales[i] * scale;
boost::math::gamma_distribution<> gamma(scaledShape, scaledScale);
LOG_DEBUG(<< "mean = " << boost::math::mean(gamma)
<< ", variance = " << boost::math::variance(gamma));
double expectedDifferentialEntropy =
maths::common::CTools::differentialEntropy(gamma);
CGammaRateConjugate filter(makePrior());
double differentialEntropy = 0.0;
TDoubleVec seedSamples;
rng.generateGammaSamples(shape, scale, 150, seedSamples);
filter.addSamples(seedSamples);
TDoubleVec scaledSamples;
rng.generateGammaSamples(scaledShape, scaledScale, 50000, scaledSamples);
for (std::size_t j = 0; j < scaledSamples.size(); ++j) {
double logLikelihood = 0.0;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter.jointLogMarginalLikelihood(
{scaledSamples[j]},
{weightsFuncs[s](varianceScales[i])}, logLikelihood));
differentialEntropy -= logLikelihood;
}
differentialEntropy /= static_cast<double>(scaledSamples.size());
LOG_DEBUG(<< "differentialEntropy = " << differentialEntropy
<< ", expectedDifferentialEntropy = " << expectedDifferentialEntropy);
BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedDifferentialEntropy,
differentialEntropy, 0.05);
}
}
const maths_t::EDataType dataTypes[] = {maths_t::E_IntegerData, maths_t::E_ContinuousData};
const double maximumMeanError[] = {0.08, 0.11};
const double maximumVarianceError[] = {1.0, 0.2};
const double maximumMeanMeanError[] = {0.01, 0.01};
const double maximumMeanVarianceError[] = {0.08, 0.05};
for (std::size_t s = 0; s < std::size(weightsFuncs); ++s) {
for (std::size_t t = 0; t < std::size(dataTypes); ++t) {
const double shapes[] = {1.0, 10.0, 100.0,
1000.0, 100000.0, 1000000.0};
const double rates[] = {1.0, 10.0, 100.0,
1000.0, 100000.0, 1000000.0};
const double varianceScales[] = {0.1, 0.5, 1.0, 2.0, 10.0, 100.0};
TDoubleVec samples;
maths_t::TDoubleWeightsAry1Vec weights;
test::CRandomNumbers rng;
TMeanAccumulator meanMeanError;
TMeanAccumulator meanVarianceError;
for (std::size_t i = 0; i < std::size(shapes); ++i) {
for (std::size_t j = 0; j < std::size(rates); ++j) {
double shape = shapes[i];
double rate = rates[j];
// We purposely don't estimate true variance in this case.
if (shape < rate * rate * maths::common::MINIMUM_COEFFICIENT_OF_VARIATION) {
continue;
}
LOG_TRACE(<< "****** shape = " << shape << ", rate = " << rate << " ******");
double mean = shape / rate;
double variance = mean / rate;
for (std::size_t k = 0; k < std::size(varianceScales); ++k) {
double scale = varianceScales[k];
LOG_TRACE(<< "*** scale = " << scale << " ***");
double scaledShape = shape / scale;
double scaledRate = rate / scale;
LOG_TRACE(<< "scaled shape = " << scaledShape
<< ", scaled rate = " << scaledRate);
TMeanAccumulator meanError;
TMeanAccumulator varianceError;
for (unsigned int test = 0; test < 5; ++test) {
CGammaRateConjugate filter(makePrior(dataTypes[t]));
rng.generateGammaSamples(shape, 1.0 / rate, 200, samples);
weights.clear();
weights.resize(samples.size(), maths_t::CUnitWeights::UNIT);
filter.addSamples(samples, weights);
rng.generateGammaSamples(scaledShape, 1.0 / scaledRate,
200, samples);
weights.clear();
weights.resize(samples.size(), weightsFuncs[s](scale));
filter.addSamples(samples, weights);
double estimatedMean = filter.likelihoodShape() /
filter.likelihoodRate();
double estimatedVariance = estimatedMean /
filter.likelihoodRate();
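// For integer data the model implicitly adds a U(0, 1) jitter to each
// sample, which shifts the mean by 0.5 and adds 1/12 to the variance,
// hence the corrections dm and dv below.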
double dm = (dataTypes[t] == maths_t::E_IntegerData ? 0.5 : 0.0);
double dv = (dataTypes[t] == maths_t::E_IntegerData ? 1.0 / 12.0 : 0.0);
double trialMeanError = std::fabs(estimatedMean - (mean + dm)) /
std::max(1.0, mean + dm);
double trialVarianceError =
std::fabs(estimatedVariance - (variance + dv)) /
std::max(1.0, variance + dv);
LOG_TRACE(<< "trial mean error = " << trialMeanError);
LOG_TRACE(<< "trial variance error = " << trialVarianceError);
meanError.add(trialMeanError);
varianceError.add(trialVarianceError);
}
LOG_TRACE(<< "mean error = "
<< maths::common::CBasicStatistics::mean(meanError));
LOG_TRACE(<< "variance error = "
<< maths::common::CBasicStatistics::mean(varianceError));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(
meanError) < maximumMeanError[t]);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(varianceError) <
maximumVarianceError[t]);
meanMeanError += meanError;
meanVarianceError += varianceError;
}
}
}
LOG_DEBUG(<< "mean mean error = "
<< maths::common::CBasicStatistics::mean(meanMeanError));
LOG_DEBUG(<< "mean variance error = "
<< maths::common::CBasicStatistics::mean(meanVarianceError));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanMeanError) <
maximumMeanMeanError[t]);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanVarianceError) <
maximumMeanVarianceError[t]);
}
}
}
BOOST_AUTO_TEST_CASE(testNegativeSample) {
// Test that we recover roughly the same distribution after adjusting
// the offset. The idea of this test is to run two priors side by side,
// one with a large enough offset that it never needs to adjust the
// offset and the other which will adjust and check that we get broadly
// similar distributions at the end.
const double shape = 4.0;
const double scale = 1.0;
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 100, samples);
CGammaRateConjugate filter1(CGammaRateConjugate::nonInformativePrior(
maths_t::E_ContinuousData, 0.0, 0.0, 0.2));
CGammaRateConjugate filter2(CGammaRateConjugate::nonInformativePrior(
maths_t::E_ContinuousData, 1.2586, 0.0, 0.2));
filter1.addSamples(samples);
filter2.addSamples(samples);
TDouble1Vec negative(1, -0.29);
filter1.addSamples(negative);
filter2.addSamples(negative);
BOOST_REQUIRE_EQUAL(filter1.numberSamples(), filter2.numberSamples());
using TEqual = maths::common::CEqualWithTolerance<double>;
TEqual equal(maths::common::CToleranceTypes::E_RelativeTolerance, 0.1);
BOOST_TEST_REQUIRE(filter1.equalTolerance(filter2, equal));
}
BOOST_AUTO_TEST_CASE(testBadValues) {
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(4.0, 1.0, 100, samples);
CGammaRateConjugate filter =
CGammaRateConjugate::nonInformativePrior(maths_t::E_ContinuousData);
filter.addSamples(samples);
std::uint64_t checksum{filter.checksum()};
TDouble1Vec badSample{std::numeric_limits<double>::quiet_NaN()};
auto badWeights = maths_t::CUnitWeights::SINGLE_UNIT;
maths_t::setSeasonalVarianceScale(std::numeric_limits<double>::quiet_NaN(),
badWeights[0]);
filter.addSamples(badSample, maths_t::CUnitWeights::SINGLE_UNIT);
filter.addSamples({1.0}, badWeights);
// We've just ignored the bad input.
BOOST_REQUIRE_EQUAL(checksum, filter.checksum());
double lowerBound;
double upperBound;
maths_t::ETail tail;
BOOST_TEST_REQUIRE(!filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, badSample, maths_t::CUnitWeights::SINGLE_UNIT,
lowerBound, upperBound, tail));
BOOST_REQUIRE_EQUAL(0.0, lowerBound);
BOOST_REQUIRE_EQUAL(1.0, upperBound);
BOOST_REQUIRE_EQUAL(maths_t::E_UndeterminedTail, tail);
BOOST_TEST_REQUIRE(!filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, {1.0}, badWeights, lowerBound, upperBound, tail));
BOOST_REQUIRE_EQUAL(0.0, lowerBound);
BOOST_REQUIRE_EQUAL(1.0, upperBound);
BOOST_REQUIRE_EQUAL(maths_t::E_UndeterminedTail, tail);
}
BOOST_AUTO_TEST_SUITE_END()