/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <core/CContainerPrinter.h>
#include <core/CJsonStatePersistInserter.h>
#include <core/CJsonStateRestoreTraverser.h>
#include <core/CLogger.h>
#include <maths/common/CBasicStatistics.h>
#include <maths/common/CBasicStatisticsPersist.h>
#include <maths/common/CGammaRateConjugate.h>
#include <maths/common/CLogNormalMeanPrecConjugate.h>
#include <maths/common/CMixtureDistribution.h>
#include <maths/common/CMultimodalPrior.h>
#include <maths/common/CNormalMeanPrecConjugate.h>
#include <maths/common/COneOfNPrior.h>
#include <maths/common/CPriorDetail.h>
#include <maths/common/CRestoreParams.h>
#include <maths/common/CTools.h>
#include <maths/common/CToolsDetail.h>
#include <maths/common/CXMeansOnline1d.h>
#include <test/BoostTestCloseAbsolute.h>
#include <test/CRandomNumbers.h>
#include "TestUtils.h"
#include <boost/math/distributions/gamma.hpp>
#include <boost/math/distributions/lognormal.hpp>
#include <boost/math/distributions/normal.hpp>
#include <boost/test/unit_test.hpp>
#include <algorithm>
#include <fstream>
#include <iterator>
#include <limits>
#include <memory>
#include <vector>
BOOST_AUTO_TEST_SUITE(CMultimodalPriorTest)
using namespace ml;
using namespace handy_typedefs;
namespace {
using TDoubleVec = std::vector<double>;
using TDoubleDoublePr = std::pair<double, double>;
using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
using TPriorPtr = std::unique_ptr<maths::common::CPrior>;
using CGammaRateConjugate = CPriorTestInterfaceMixin<maths::common::CGammaRateConjugate>;
using CLogNormalMeanPrecConjugate =
CPriorTestInterfaceMixin<maths::common::CLogNormalMeanPrecConjugate>;
using CNormalMeanPrecConjugate = CPriorTestInterfaceMixin<maths::common::CNormalMeanPrecConjugate>;
using CMultimodalPrior = CPriorTestInterfaceMixin<maths::common::CMultimodalPrior>;
using COneOfNPrior = CPriorTestInterfaceMixin<maths::common::COneOfNPrior>;
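//! Signature shared by the maths_t weight builders used below
//! (countWeight and outlierWeight), so tests can iterate over
//! different weight styles.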
using TWeightFunc = maths_t::TDoubleWeightsAry (*)(double);
//! Make the default mode prior.
COneOfNPrior makeModePrior(const double& decayRate = 0.0) {
CGammaRateConjugate gamma(maths::common::CGammaRateConjugate::nonInformativePrior(
maths_t::E_ContinuousData, 0.01, decayRate, 0.0));
CLogNormalMeanPrecConjugate logNormal(maths::common::CLogNormalMeanPrecConjugate::nonInformativePrior(
maths_t::E_ContinuousData, 0.01, decayRate, 0.0));
CNormalMeanPrecConjugate normal(maths::common::CNormalMeanPrecConjugate::nonInformativePrior(
maths_t::E_ContinuousData, decayRate));
COneOfNPrior::TPriorPtrVec priors;
priors.push_back(COneOfNPrior::TPriorPtr(gamma.clone()));
priors.push_back(COneOfNPrior::TPriorPtr(logNormal.clone()));
priors.push_back(COneOfNPrior::TPriorPtr(normal.clone()));
return COneOfNPrior(
maths::common::COneOfNPrior(priors, maths_t::E_ContinuousData, decayRate));
}
//! Make a vanilla multimodal prior.
CMultimodalPrior makePrior(const maths::common::CPrior* modePrior, const double& decayRate) {
maths::common::CXMeansOnline1d clusterer(
maths_t::E_ContinuousData, maths::common::CAvailableModeDistributions::ALL,
maths_t::E_ClustersFractionWeight, decayRate);
if (modePrior) {
return maths::common::CMultimodalPrior(maths_t::E_ContinuousData,
clusterer, *modePrior, decayRate);
}
return maths::common::CMultimodalPrior(maths_t::E_ContinuousData, clusterer,
makeModePrior(decayRate), decayRate);
}
CMultimodalPrior makePrior(const maths::common::CPrior* modePrior) {
return makePrior(modePrior, 0.0);
}
CMultimodalPrior makePrior(double decayRate) {
return makePrior(nullptr, decayRate);
}
CMultimodalPrior makePrior() {
return makePrior(nullptr, 0.0);
}
test::CRandomNumbers RNG;
void sample(const boost::math::normal_distribution<>& normal,
std::size_t numberSamples,
TDoubleVec& result) {
RNG.generateNormalSamples(boost::math::mean(normal),
boost::math::variance(normal), numberSamples, result);
}
void sample(const boost::math::lognormal_distribution<>& lognormal,
std::size_t numberSamples,
TDoubleVec& result) {
RNG.generateLogNormalSamples(lognormal.location(),
lognormal.scale() * lognormal.scale(),
numberSamples, result);
}
void sample(const boost::math::gamma_distribution<>& gamma,
std::size_t numberSamples,
TDoubleVec& result) {
RNG.generateGammaSamples(gamma.shape(), gamma.scale(), numberSamples, result);
}
template<typename T>
void probabilityOfLessLikelySample(const maths::common::CMixtureDistribution<T>& mixture,
const double& x,
double& probability,
double& deviation) {
using TModeVec = typename maths::common::CMixtureDistribution<T>::TModeVec;
static const double NUMBER_SAMPLES = 10000.0;
probability = 0.0;
double fx = pdf(mixture, x);
const TDoubleVec& weights = mixture.weights();
const TModeVec& modes = mixture.modes();
for (std::size_t i = 0; i < modes.size(); ++i) {
TDoubleVec samples;
sample(modes[i], static_cast<std::size_t>(NUMBER_SAMPLES * weights[i]), samples);
for (std::size_t j = 0; j < samples.size(); ++j) {
if (pdf(mixture, samples[j]) < fx) {
probability += 1.0 / NUMBER_SAMPLES;
}
}
}
// For a discussion of the deviation see the paper:
// "Anomaly Detection in Application Performance Monitoring Data"
deviation = std::sqrt(probability * (1.0 - probability) / NUMBER_SAMPLES);
}
}
BOOST_AUTO_TEST_CASE(testMultipleUpdate) {
// Test that we get the same result updating once with a vector of 100
// samples of an R.V. versus updating individually 100 times.
const maths_t::EDataType dataTypes[] = {maths_t::E_IntegerData, maths_t::E_ContinuousData};
const double shape = 2.0;
const double scale = 3.0;
const double decayRate = 0.0;
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateNormalSamples(shape, scale, 100, samples);
for (size_t i = 0; i < std::size(dataTypes); ++i) {
maths::common::CXMeansOnline1d clusterer(
dataTypes[i], maths::common::CAvailableModeDistributions::ALL,
maths_t::E_ClustersFractionWeight);
CMultimodalPrior filter1(maths::common::CMultimodalPrior(
dataTypes[i], clusterer,
maths::common::CNormalMeanPrecConjugate::nonInformativePrior(dataTypes[i], decayRate)));
CMultimodalPrior filter2(filter1);
for (std::size_t j = 0; j < samples.size(); ++j) {
filter1.addSamples(TDouble1Vec(1, samples[j]));
}
filter2.addSamples(samples);
LOG_DEBUG(<< "checksum 1 " << filter1.checksum());
LOG_DEBUG(<< "checksum 2 " << filter2.checksum());
BOOST_REQUIRE_EQUAL(filter1.checksum(), filter2.checksum());
}
}
BOOST_AUTO_TEST_CASE(testPropagation) {
// Test that propagation doesn't affect the marginal likelihood
// mean and the marginal likelihood confidence intervals increase
// (due to influence of the prior uncertainty) after propagation.
double eps = 0.01;
test::CRandomNumbers rng;
TDoubleVec samples1;
rng.generateNormalSamples(3.0, 1.0, 200, samples1);
TDoubleVec samples2;
rng.generateNormalSamples(10.0, 4.0, 200, samples2);
TDoubleVec samples;
samples.insert(samples.end(), samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
rng.random_shuffle(samples.begin(), samples.end());
const double decayRate = 0.1;
CMultimodalPrior filter(makePrior(decayRate));
for (std::size_t i = 0; i < samples.size(); ++i) {
filter.addSamples(TDouble1Vec(1, static_cast<double>(samples[i])));
BOOST_TEST_REQUIRE(filter.checkInvariants());
}
double mean = filter.marginalLikelihoodMean();
TDoubleDoublePr percentiles[] = {
filter.marginalLikelihoodConfidenceInterval(60.0),
filter.marginalLikelihoodConfidenceInterval(70.0),
filter.marginalLikelihoodConfidenceInterval(80.0),
filter.marginalLikelihoodConfidenceInterval(90.0)};
filter.propagateForwardsByTime(40.0);
BOOST_TEST_REQUIRE(filter.checkInvariants());
double propagatedMean = filter.marginalLikelihoodMean();
TDoubleDoublePr propagatedPercentiles[] = {
filter.marginalLikelihoodConfidenceInterval(60.0),
filter.marginalLikelihoodConfidenceInterval(70.0),
filter.marginalLikelihoodConfidenceInterval(80.0),
filter.marginalLikelihoodConfidenceInterval(90.0)};
LOG_DEBUG(<< "mean = " << mean << ", propagatedMean = " << propagatedMean);
LOG_DEBUG(<< "percentiles = " << percentiles);
LOG_DEBUG(<< "propagatedPercentiles = " << propagatedPercentiles);
BOOST_REQUIRE_CLOSE_ABSOLUTE(mean, propagatedMean, eps * mean);
for (std::size_t i = 0; i < std::size(percentiles); ++i) {
BOOST_TEST_REQUIRE(propagatedPercentiles[i].first < percentiles[i].first);
BOOST_TEST_REQUIRE(propagatedPercentiles[i].second > percentiles[i].second);
}
}
BOOST_AUTO_TEST_CASE(testSingleMode) {
// We test the log likelihood of the data for the estimated
// distributions versus the generating distributions. Note
// that the generating distribution doesn't necessarily have
// a larger likelihood because we are using a finite sample.
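// The mean of log(f(x)) - log(likelihood(x)) over samples drawn from
// f is a Monte Carlo estimate of the KL divergence from the fitted
// model to the generating distribution, so values near (or slightly
// below) zero indicate a good fit.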
using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator;
test::CRandomNumbers rng;
LOG_DEBUG(<< "Gaussian");
{
COneOfNPrior modePrior(makeModePrior());
CMultimodalPrior filter1(makePrior(&modePrior));
COneOfNPrior filter2 = modePrior;
const double mean = 10.0;
const double variance = 2.0;
TDoubleVec samples;
rng.generateNormalSamples(mean, variance, 1000, samples);
for (std::size_t i = 0; i < samples.size(); ++i) {
TDouble1Vec sample(1, samples[i]);
filter1.addSamples(sample);
filter2.addSamples(sample);
BOOST_TEST_REQUIRE(filter1.checkInvariants());
}
TMeanAccumulator L1G;
TMeanAccumulator L12;
TMeanAccumulator differentialEntropy;
boost::math::normal_distribution<> f(mean, std::sqrt(variance));
for (std::size_t i = 0; i < samples.size(); ++i) {
double fx = boost::math::pdf(f, samples[i]);
TDouble1Vec sample(1, samples[i]);
double l1;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter1.jointLogMarginalLikelihood(sample, l1));
L1G.add(std::log(fx) - l1);
double l2;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter2.jointLogMarginalLikelihood(sample, l2));
L12.add(l2 - l1);
differentialEntropy.add(-std::log(fx));
}
LOG_DEBUG(<< "L1G = " << maths::common::CBasicStatistics::mean(L1G)
<< ", L12 = " << maths::common::CBasicStatistics::mean(L12)
<< ", differential entropy " << differentialEntropy);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(L1G) /
maths::common::CBasicStatistics::mean(differentialEntropy) <
0.0);
}
LOG_DEBUG(<< "Log-Normal");
{
COneOfNPrior modePrior(makeModePrior());
CMultimodalPrior filter1(makePrior(&modePrior));
COneOfNPrior filter2 = modePrior;
const double location = 1.5;
const double squareScale = 0.9;
TDoubleVec samples;
rng.generateLogNormalSamples(location, squareScale, 1000, samples);
for (std::size_t i = 0; i < samples.size(); ++i) {
TDouble1Vec sample(1, samples[i]);
filter1.addSamples(sample);
filter2.addSamples(sample);
BOOST_TEST_REQUIRE(filter1.checkInvariants());
}
TMeanAccumulator L1G;
TMeanAccumulator L12;
TMeanAccumulator differentialEntropy;
boost::math::lognormal_distribution<> f(location, std::sqrt(squareScale));
for (std::size_t i = 0; i < samples.size(); ++i) {
double fx = boost::math::pdf(f, samples[i]);
TDouble1Vec sample(1, samples[i]);
double l1;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter1.jointLogMarginalLikelihood(sample, l1));
L1G.add(std::log(fx) - l1);
double l2;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter2.jointLogMarginalLikelihood(sample, l2));
L12.add(l2 - l1);
differentialEntropy.add(-std::log(fx));
}
LOG_DEBUG(<< "L1G = " << maths::common::CBasicStatistics::mean(L1G)
<< ", L12 = " << maths::common::CBasicStatistics::mean(L12)
<< ", differential entropy " << differentialEntropy);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(L1G) /
maths::common::CBasicStatistics::mean(differentialEntropy) <
0.0);
}
LOG_DEBUG(<< "Gamma");
{
COneOfNPrior modePrior(makeModePrior());
CMultimodalPrior filter1(makePrior(&modePrior));
COneOfNPrior filter2 = modePrior;
const double shape = 1.0;
const double scale = 0.5;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 1000, samples);
for (std::size_t i = 0; i < samples.size(); ++i) {
TDouble1Vec sample(1, samples[i]);
filter1.addSamples(sample);
filter2.addSamples(sample);
BOOST_TEST_REQUIRE(filter1.checkInvariants());
}
TMeanAccumulator L1G;
TMeanAccumulator L12;
TMeanAccumulator differentialEntropy;
boost::math::gamma_distribution<> f(shape, scale);
for (std::size_t i = 0; i < samples.size(); ++i) {
double fx = boost::math::pdf(f, samples[i]);
TDouble1Vec sample(1, samples[i]);
double l1;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter1.jointLogMarginalLikelihood(sample, l1));
L1G.add(std::log(fx) - l1);
double l2;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter2.jointLogMarginalLikelihood(sample, l2));
L12.add(l2 - l1);
differentialEntropy.add(-std::log(fx));
}
LOG_DEBUG(<< "L1G = " << maths::common::CBasicStatistics::mean(L1G)
<< ", L12 = " << maths::common::CBasicStatistics::mean(L12)
<< ", differential entropy " << differentialEntropy);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(L1G) /
maths::common::CBasicStatistics::mean(differentialEntropy) <
0.1);
}
}
BOOST_AUTO_TEST_CASE(testMultipleModes) {
// We check that for data generated from multiple modes
// we get something close to the generating distribution.
// In particular, we test the log likelihood of the data
// for the estimated distribution versus the generating
// distribution and versus an unclustered distribution.
// Note that the generating distribution doesn't necessarily
// have a larger likelihood because we are using a finite
// sample.
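// Below, loss12 is the mean difference in log likelihood between the
// unclustered prior and the multimodal prior; a negative value means
// clustering improves the fit. loss1G compares the multimodal prior
// against the true generating mixture in the same way.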
using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator;
test::CRandomNumbers rng;
{
LOG_DEBUG(<< "Mixture Normals");
const std::size_t n1 = 400;
const double mean1 = 10.0;
const double variance1 = 2.0;
const std::size_t n2 = 600;
const double mean2 = 20.0;
const double variance2 = 5.0;
TDoubleVec samples1;
rng.generateNormalSamples(mean1, variance1, n1, samples1);
TDoubleVec samples2;
rng.generateNormalSamples(mean2, variance2, n2, samples2);
TDoubleVec samples;
samples.insert(samples.end(), samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
LOG_DEBUG(<< "# samples = " << samples.size());
double w1 = n1 / static_cast<double>(n1 + n2);
double w2 = n2 / static_cast<double>(n1 + n2);
boost::math::normal_distribution<> mode1Distribution(mean1, std::sqrt(variance1));
boost::math::normal_distribution<> mode2Distribution(mean2, std::sqrt(variance2));
double loss = 0.0;
TMeanAccumulator differentialEntropy_;
for (std::size_t j = 0; j < samples.size(); ++j) {
double fx = w1 * boost::math::pdf(mode1Distribution, samples[j]) +
w2 * boost::math::pdf(mode2Distribution, samples[j]);
differentialEntropy_.add(-std::log(fx));
}
double differentialEntropy = maths::common::CBasicStatistics::mean(differentialEntropy_);
for (std::size_t i = 0; i < 10; ++i) {
rng.random_shuffle(samples.begin(), samples.end());
COneOfNPrior modePrior(makeModePrior());
CMultimodalPrior filter1(makePrior(&modePrior));
COneOfNPrior filter2 = modePrior;
for (std::size_t j = 0; j < samples.size(); ++j) {
TDouble1Vec sample(1, samples[j]);
filter1.addSamples(sample);
filter2.addSamples(sample);
BOOST_TEST_REQUIRE(filter1.checkInvariants());
}
BOOST_REQUIRE_EQUAL(2, filter1.numberModes());
TMeanAccumulator loss1G;
TMeanAccumulator loss12;
for (std::size_t j = 0; j < samples.size(); ++j) {
double fx = w1 * boost::math::pdf(mode1Distribution, samples[j]) +
w2 * boost::math::pdf(mode2Distribution, samples[j]);
TDouble1Vec sample(1, samples[j]);
double l1;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter1.jointLogMarginalLikelihood(sample, l1));
loss1G.add(std::log(fx) - l1);
double l2;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter2.jointLogMarginalLikelihood(sample, l2));
loss12.add(l2 - l1);
}
LOG_DEBUG(<< "loss1G = " << maths::common::CBasicStatistics::mean(loss1G)
<< ", loss12 = " << maths::common::CBasicStatistics::mean(loss12)
<< ", differential entropy " << differentialEntropy);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(loss12) < 0.0);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(loss1G) / differentialEntropy <
0.0);
loss += maths::common::CBasicStatistics::mean(loss1G);
}
loss /= 10.0;
LOG_DEBUG(<< "loss = " << loss << ", differential entropy = " << differentialEntropy);
BOOST_TEST_REQUIRE(loss / differentialEntropy < 0.0);
}
{
LOG_DEBUG(<< "Mixture Log-Normals");
const std::size_t n1 = 600;
const double location1 = 2.0;
const double squareScale1 = 0.04;
const std::size_t n2 = 300;
const double location2 = 3.0;
const double squareScale2 = 0.08;
const std::size_t n3 = 100;
const double location3 = 4.0;
const double squareScale3 = 0.01;
TDoubleVec samples1;
rng.generateLogNormalSamples(location1, squareScale1, n1, samples1);
TDoubleVec samples2;
rng.generateLogNormalSamples(location2, squareScale2, n2, samples2);
TDoubleVec samples3;
rng.generateLogNormalSamples(location3, squareScale3, n3, samples3);
TDoubleVec samples;
samples.insert(samples.end(), samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
samples.insert(samples.end(), samples3.begin(), samples3.end());
LOG_DEBUG(<< "# samples = " << samples.size());
double w1 = n1 / static_cast<double>(n1 + n2 + n3);
double w2 = n2 / static_cast<double>(n1 + n2 + n3);
double w3 = n3 / static_cast<double>(n1 + n2 + n3);
boost::math::lognormal_distribution<> mode1Distribution(
location1, std::sqrt(squareScale1));
boost::math::lognormal_distribution<> mode2Distribution(
location2, std::sqrt(squareScale2));
boost::math::lognormal_distribution<> mode3Distribution(
location3, std::sqrt(squareScale3));
double loss = 0.0;
TMeanAccumulator differentialEntropy_;
for (std::size_t j = 0; j < samples.size(); ++j) {
double fx = w1 * boost::math::pdf(mode1Distribution, samples[j]) +
w2 * boost::math::pdf(mode2Distribution, samples[j]) +
w3 * boost::math::pdf(mode3Distribution, samples[j]);
differentialEntropy_.add(-std::log(fx));
}
double differentialEntropy = maths::common::CBasicStatistics::mean(differentialEntropy_);
for (std::size_t i = 0; i < 10; ++i) {
rng.random_shuffle(samples.begin(), samples.end());
COneOfNPrior modePrior(makeModePrior());
CMultimodalPrior filter1(makePrior(&modePrior));
COneOfNPrior filter2 = modePrior;
for (std::size_t j = 0; j < samples.size(); ++j) {
TDouble1Vec sample(1, samples[j]);
filter1.addSamples(sample);
filter2.addSamples(sample);
BOOST_TEST_REQUIRE(filter1.checkInvariants());
}
BOOST_REQUIRE_EQUAL(3, filter1.numberModes());
TMeanAccumulator loss1G;
TMeanAccumulator loss12;
for (std::size_t j = 0; j < samples.size(); ++j) {
double fx = w1 * boost::math::pdf(mode1Distribution, samples[j]) +
w2 * boost::math::pdf(mode2Distribution, samples[j]) +
w3 * boost::math::pdf(mode3Distribution, samples[j]);
TDouble1Vec sample(1, samples[j]);
double l1;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter1.jointLogMarginalLikelihood(sample, l1));
loss1G.add(std::log(fx) - l1);
double l2;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter2.jointLogMarginalLikelihood(sample, l2));
loss12.add(l2 - l1);
}
LOG_DEBUG(<< "loss1G = " << maths::common::CBasicStatistics::mean(loss1G)
<< ", loss12 = " << maths::common::CBasicStatistics::mean(loss12)
<< ", differential entropy " << differentialEntropy);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(loss12) < 0.0);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(loss1G) / differentialEntropy <
0.001);
loss += maths::common::CBasicStatistics::mean(loss1G);
}
loss /= 10.0;
LOG_DEBUG(<< "loss = " << loss << ", differential entropy = " << differentialEntropy);
BOOST_TEST_REQUIRE(loss / differentialEntropy < 0.0);
}
{
LOG_DEBUG(<< "Mixed Modes");
const std::size_t n1 = 400;
const double mean1 = 10.0;
const double variance1 = 1.0;
const std::size_t n2 = 200;
const double location2 = 3.0;
const double squareScale2 = 0.08;
const std::size_t n3 = 400;
const double shape3 = 120.0;
const double scale3 = 0.3;
TDoubleVec samples1;
rng.generateNormalSamples(mean1, variance1, n1, samples1);
TDoubleVec samples2;
rng.generateLogNormalSamples(location2, squareScale2, n2, samples2);
TDoubleVec samples3;
rng.generateGammaSamples(shape3, scale3, n3, samples3);
TDoubleVec samples;
samples.insert(samples.end(), samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
samples.insert(samples.end(), samples3.begin(), samples3.end());
LOG_DEBUG(<< "# samples = " << samples.size());
double w1 = n1 / static_cast<double>(n1 + n2 + n3);
double w2 = n2 / static_cast<double>(n1 + n2 + n3);
double w3 = n3 / static_cast<double>(n1 + n2 + n3);
boost::math::normal_distribution<> mode1Distribution(mean1, std::sqrt(variance1));
boost::math::lognormal_distribution<> mode2Distribution(
location2, std::sqrt(squareScale2));
boost::math::gamma_distribution<> mode3Distribution(shape3, scale3);
double loss = 0.0;
TMeanAccumulator differentialEntropy_;
for (std::size_t j = 0; j < samples.size(); ++j) {
double fx = w1 * boost::math::pdf(mode1Distribution, samples[j]) +
w2 * boost::math::pdf(mode2Distribution, samples[j]) +
w3 * boost::math::pdf(mode3Distribution, samples[j]);
differentialEntropy_.add(-std::log(fx));
}
double differentialEntropy = maths::common::CBasicStatistics::mean(differentialEntropy_);
for (std::size_t i = 0; i < 10; ++i) {
rng.random_shuffle(samples.begin(), samples.end());
COneOfNPrior modePrior(makeModePrior());
CMultimodalPrior filter1(makePrior(&modePrior));
COneOfNPrior filter2 = modePrior;
for (std::size_t j = 0; j < samples.size(); ++j) {
TDouble1Vec sample(1, samples[j]);
filter1.addSamples(sample);
filter2.addSamples(sample);
BOOST_TEST_REQUIRE(filter1.checkInvariants());
}
BOOST_REQUIRE_EQUAL(3, filter1.numberModes());
TMeanAccumulator loss1G;
TMeanAccumulator loss12;
for (std::size_t j = 0; j < samples.size(); ++j) {
double fx = w1 * boost::math::pdf(mode1Distribution, samples[j]) +
w2 * boost::math::pdf(mode2Distribution, samples[j]) +
w3 * boost::math::pdf(mode3Distribution, samples[j]);
TDouble1Vec sample(1, samples[j]);
double l1;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter1.jointLogMarginalLikelihood(sample, l1));
loss1G.add(std::log(fx) - l1);
double l2;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter2.jointLogMarginalLikelihood(sample, l2));
loss12.add(l2 - l1);
}
LOG_DEBUG(<< "loss1G = " << maths::common::CBasicStatistics::mean(loss1G)
<< ", loss12 = " << maths::common::CBasicStatistics::mean(loss12)
<< ", differential entropy " << differentialEntropy);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(loss12) < 0.0);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(loss1G) / differentialEntropy <
0.01);
loss += maths::common::CBasicStatistics::mean(loss1G);
}
loss /= 10.0;
LOG_DEBUG(<< "loss = " << loss << ", differential entropy = " << differentialEntropy);
BOOST_TEST_REQUIRE(loss / differentialEntropy < 0.003);
}
}
BOOST_AUTO_TEST_CASE(testMarginalLikelihood) {
using TNormalVec = std::vector<boost::math::normal_distribution<>>;
// Check that the c.d.f. <= 1 at extreme values.
{
CMultimodalPrior filter(makePrior());
const double shape = 1.0;
const double scale = 1.0;
const double location = 2.0;
const double squareScale = 0.5;
test::CRandomNumbers rng;
TDoubleVec samples;
rng.generateGammaSamples(shape, scale, 100, samples);
filter.addSamples(samples);
rng.generateLogNormalSamples(location, squareScale, 100, samples);
filter.addSamples(samples);
TWeightFunc weightsFuncs[]{static_cast<TWeightFunc>(maths_t::countWeight),
static_cast<TWeightFunc>(maths_t::outlierWeight)};
double weights[]{0.1, 1.0, 10.0};
for (std::size_t i = 0; i < std::size(weightsFuncs); ++i) {
for (std::size_t j = 0; j < std::size(weights); ++j) {
double lb, ub;
filter.minusLogJointCdf({20000.0}, {weightsFuncs[i](weights[j])}, lb, ub);
LOG_DEBUG(<< "-log(c.d.f) = " << (lb + ub) / 2.0);
BOOST_TEST_REQUIRE(lb >= 0.0);
BOOST_TEST_REQUIRE(ub >= 0.0);
}
}
}
// Check that the marginal likelihood and c.d.f. agree for some
// test data, that the c.d.f. <= 1, and that the expected value
// of the log likelihood tends to the differential entropy.
const double decayRates[] = {0.0, 0.001, 0.01};
unsigned int numberSamples[] = {2u, 20u, 500u};
const double tolerance = 0.01;
test::CRandomNumbers rng;
const double w1 = 0.5;
const double mean1 = 10.0;
const double variance1 = 1.0;
const double w2 = 0.3;
const double mean2 = 15.0;
const double variance2 = 2.0;
const double w3 = 0.2;
const double mean3 = 25.0;
const double variance3 = 3.0;
TDoubleVec samples1;
rng.generateNormalSamples(mean1, variance1,
static_cast<std::size_t>(w1 * 500.0), samples1);
TDoubleVec samples2;
rng.generateNormalSamples(mean2, variance2,
static_cast<std::size_t>(w2 * 500.0), samples2);
TDoubleVec samples3;
rng.generateNormalSamples(mean3, variance3,
static_cast<std::size_t>(w3 * 500.0), samples3);
TDoubleVec samples;
samples.insert(samples.end(), samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
samples.insert(samples.end(), samples3.begin(), samples3.end());
rng.random_shuffle(samples.begin(), samples.end());
for (size_t i = 0; i < std::size(numberSamples); ++i) {
for (size_t j = 0; j < std::size(decayRates); ++j) {
CMultimodalPrior filter(makePrior(decayRates[j]));
for (std::size_t k = 0; k < samples.size(); ++k) {
filter.addSamples(TDouble1Vec(1, samples[k]));
filter.propagateForwardsByTime(1.0);
BOOST_TEST_REQUIRE(filter.checkInvariants());
}
LOG_DEBUG(<< "# modes = " << filter.numberModes());
// We'll check that the p.d.f. is close to the derivative of the
// c.d.f. at a range of points on the p.d.f.
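// A central difference (F(x + eps) - F(x - eps)) / (2 * eps) is
// accurate to O(eps^2), so with eps = 1e-4 the discretization error
// is well below the test tolerance.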
const double eps = 1e-4;
for (size_t k = 5; k < 31; ++k) {
TDouble1Vec sample(1, static_cast<double>(k));
LOG_DEBUG(<< "number = " << numberSamples[i] << ", sample = " << sample[0]);
double logLikelihood = 0.0;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter.jointLogMarginalLikelihood(sample, logLikelihood));
double pdf = std::exp(logLikelihood);
double lowerBound = 0.0, upperBound = 0.0;
sample[0] -= eps;
BOOST_TEST_REQUIRE(filter.minusLogJointCdf(sample, lowerBound, upperBound));
BOOST_REQUIRE_CLOSE_ABSOLUTE(lowerBound, upperBound, 1e-3);
double minusLogCdf = (lowerBound + upperBound) / 2.0;
double cdfAtMinusEps = std::exp(-minusLogCdf);
BOOST_TEST_REQUIRE(minusLogCdf >= 0.0);
sample[0] += 2.0 * eps;
BOOST_TEST_REQUIRE(filter.minusLogJointCdf(sample, lowerBound, upperBound));
BOOST_REQUIRE_CLOSE_ABSOLUTE(lowerBound, upperBound, 1e-3);
minusLogCdf = (lowerBound + upperBound) / 2.0;
double cdfAtPlusEps = std::exp(-minusLogCdf);
BOOST_TEST_REQUIRE(minusLogCdf >= 0.0);
double dcdfdx = (cdfAtPlusEps - cdfAtMinusEps) / 2.0 / eps;
LOG_DEBUG(<< "pdf(x) = " << pdf << ", d(cdf)/dx = " << dcdfdx);
BOOST_REQUIRE_CLOSE_ABSOLUTE(pdf, dcdfdx, tolerance);
}
}
}
{
// Test that the sample expectation of the log likelihood tends
// to the expected log likelihood, which is just the differential
// entropy.
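// Recall H(f) = -E[log f(X)], so averaging -log(likelihood) over a
// large sample drawn from f should converge to the differential
// entropy of the generating mixture computed below.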
CMultimodalPrior filter(makePrior());
filter.addSamples(samples);
LOG_DEBUG(<< "# modes = " << filter.numberModes());
TDoubleVec manySamples1;
rng.generateNormalSamples(mean1, variance1,
static_cast<std::size_t>(w1 * 100000.0), manySamples1);
TDoubleVec manySamples2;
rng.generateNormalSamples(mean2, variance2,
static_cast<std::size_t>(w2 * 100000.0), manySamples2);
TDoubleVec manySamples3;
rng.generateNormalSamples(mean3, variance3,
static_cast<std::size_t>(w3 * 100000.0), manySamples3);
TDoubleVec manySamples;
manySamples.insert(manySamples.end(), manySamples1.begin(), manySamples1.end());
manySamples.insert(manySamples.end(), manySamples2.begin(), manySamples2.end());
manySamples.insert(manySamples.end(), manySamples3.begin(), manySamples3.end());
rng.random_shuffle(manySamples.begin(), manySamples.end());
TDoubleVec weights;
weights.push_back(w1);
weights.push_back(w2);
weights.push_back(w3);
TNormalVec modes;
modes.push_back(boost::math::normal_distribution<>(mean1, variance1));
modes.push_back(boost::math::normal_distribution<>(mean2, variance2));
modes.push_back(boost::math::normal_distribution<>(mean3, variance3));
maths::common::CMixtureDistribution<boost::math::normal_distribution<>> f(weights, modes);
double expectedDifferentialEntropy = maths::common::CTools::differentialEntropy(f);
double differentialEntropy = 0.0;
for (std::size_t i = 0; i < manySamples.size(); ++i) {
if (i % 1000 == 0) {
LOG_DEBUG(<< "Processed " << i << " samples");
}
TDouble1Vec sample(1, manySamples[i]);
filter.addSamples(sample);
double logLikelihood = 0.0;
BOOST_REQUIRE_EQUAL(maths_t::E_FpNoErrors,
filter.jointLogMarginalLikelihood(sample, logLikelihood));
differentialEntropy -= logLikelihood;
}
differentialEntropy /= static_cast<double>(manySamples.size());
LOG_DEBUG(<< "differentialEntropy = " << differentialEntropy
<< ", expectedDifferentialEntropy = " << expectedDifferentialEntropy);
BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedDifferentialEntropy, differentialEntropy,
0.05 * expectedDifferentialEntropy);
}
}
BOOST_AUTO_TEST_CASE(testMarginalLikelihoodMode) {
// Test that the marginal likelihood mode is at a local
// maximum of the likelihood function and that, with high
// probability, we don't find a location with higher likelihood.
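// We verify this numerically: the central difference gradient of
// the likelihood at the mode should be near zero, and uniformly
// sampled trial points between the two mode centres should rarely
// beat the likelihood at the reported mode.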
test::CRandomNumbers rng;
double w1 = 0.1;
double mean1 = 1.0;
double variance1 = 1.0;
double w2 = 0.9;
double mean2 = 8.0;
double variance2 = 1.5;
TDoubleVec samples1;
rng.generateNormalSamples(mean1, variance1,
static_cast<std::size_t>(w1 * 500.0), samples1);
TDoubleVec samples2;
rng.generateNormalSamples(mean2, variance2,
static_cast<std::size_t>(w2 * 500.0), samples2);
TDoubleVec samples;
samples.insert(samples.end(), samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
rng.random_shuffle(samples.begin(), samples.end());
const double varianceScales[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
0.7, 0.8, 0.9, 1.0, 1.2, 1.5,
2.0, 2.5, 3.0, 4.0, 5.0};
CMultimodalPrior filter(makePrior());
filter.addSamples(samples);
maths_t::TDoubleWeightsAry weight(maths_t::CUnitWeights::UNIT);
std::size_t totalCount = 0;
for (std::size_t i = 0; i < std::size(varianceScales); ++i) {
double vs = varianceScales[i];
maths_t::setCountVarianceScale(vs, weight);
LOG_DEBUG(<< "*** vs = " << vs << " ***");
double mode = filter.marginalLikelihoodMode(weight);
LOG_DEBUG(<< "marginalLikelihoodMode = " << mode);
// Should be near 8.
BOOST_REQUIRE_CLOSE_ABSOLUTE(8.0, filter.marginalLikelihoodMode(weight), 2.0);
double eps = 0.01;
double modeMinusEps = mode - eps;
double modePlusEps = mode + eps;
double fMode, fModeMinusEps, fModePlusEps;
filter.jointLogMarginalLikelihood({mode}, {weight}, fMode);
filter.jointLogMarginalLikelihood({modeMinusEps}, {weight}, fModeMinusEps);
filter.jointLogMarginalLikelihood({modePlusEps}, {weight}, fModePlusEps);
fMode = std::exp(fMode);
fModeMinusEps = std::exp(fModeMinusEps);
fModePlusEps = std::exp(fModePlusEps);
double gradient = (fModePlusEps - fModeMinusEps) / 2.0 / eps;
LOG_DEBUG(<< "f(mode) = " << fMode << ", f(mode-eps) = " << fModeMinusEps
<< ", f(mode + eps) = " << fModePlusEps);
LOG_DEBUG(<< "gradient = " << gradient);
BOOST_TEST_REQUIRE(std::fabs(gradient) < 0.05);
BOOST_TEST_REQUIRE(fMode > 0.999 * fModeMinusEps);
BOOST_TEST_REQUIRE(fMode > 0.999 * fModePlusEps);
TDoubleVec trials;
rng.generateUniformSamples(mean1, mean2, 500, trials);
std::size_t count = 0;
TDoubleVec fTrials;
for (std::size_t j = 0; j < trials.size(); ++j) {
double fTrial;
filter.jointLogMarginalLikelihood({trials[j]}, {weight}, fTrial);
fTrial = std::exp(fTrial);
if (fTrial > fMode) {
LOG_TRACE(<< "f(" << trials[j] << ") = " << fTrial << " > " << fMode);
++count;
}
fTrials.push_back(fTrial);
}
LOG_DEBUG(<< "count = " << count);
BOOST_TEST_REQUIRE(count < 6);
totalCount += count;
}
LOG_DEBUG(<< "totalCount = " << totalCount);
BOOST_TEST_REQUIRE(totalCount < 11);
}
BOOST_AUTO_TEST_CASE(testMarginalLikelihoodConfidenceInterval) {
// Test that marginal likelihood confidence intervals are
// what we'd expect for various variance scales.
using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator;
test::CRandomNumbers rng;
LOG_DEBUG(<< "Synthetic");
{
double w1 = 0.2;
double location1 = 0.1;
double squareScale1 = 0.2;
double w2 = 0.8;
double mean2 = 8.0;
double variance2 = 2.0;
TDoubleVec samples1;
rng.generateLogNormalSamples(location1, squareScale1,
static_cast<std::size_t>(w1 * 2000.0), samples1);
TDoubleVec samples2;
rng.generateNormalSamples(mean2, variance2,
static_cast<std::size_t>(w2 * 2000.0), samples2);
TDoubleVec samples;
samples.insert(samples.end(), samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
rng.random_shuffle(samples.begin(), samples.end());
const double varianceScales[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
0.7, 0.8, 0.9, 1.0, 1.2, 1.5,
2.0, 2.5, 3.0, 4.0, 5.0};
const double percentages[] = {5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0,
70.0, 80.0, 95.0, 99.0, 99.9, 99.99};
CMultimodalPrior filter(makePrior());
filter.addSamples(samples);
for (std::size_t i = 0; i < std::size(varianceScales); ++i) {
LOG_DEBUG(<< "*** vs = " << varianceScales[i] << " ***");
TMeanAccumulator error;
for (std::size_t j = 0; j < std::size(percentages); ++j) {
LOG_DEBUG(<< "** percentage = " << percentages[j] << " **");
double q1, q2;
filter.marginalLikelihoodQuantileForTest(50.0 - percentages[j] / 2.0,
1e-3, q1);
filter.marginalLikelihoodQuantileForTest(50.0 + percentages[j] / 2.0,
1e-3, q2);
TDoubleDoublePr interval =
filter.marginalLikelihoodConfidenceInterval(percentages[j]);
LOG_DEBUG(<< "[q1, q2] = [" << q1 << ", " << q2 << "]"
<< ", interval = " << interval);
BOOST_REQUIRE_CLOSE_ABSOLUTE(q1, interval.first, 0.1);
BOOST_REQUIRE_CLOSE_ABSOLUTE(q2, interval.second, 0.05);
error.add(std::fabs(interval.first - q1));
error.add(std::fabs(interval.second - q2));
}
LOG_DEBUG(<< "error = " << maths::common::CBasicStatistics::mean(error));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(error) < 5e-3);
}
std::sort(samples.begin(), samples.end());
TMeanAccumulator error;
for (std::size_t i = 0; i < std::size(percentages); ++i) {
LOG_DEBUG(<< "** percentage = " << percentages[i] << " **");
std::size_t i1 = static_cast<std::size_t>(
static_cast<double>(samples.size()) * (50.0 - percentages[i] / 2.0) / 100.0 + 0.5);
std::size_t i2 = static_cast<std::size_t>(
static_cast<double>(samples.size()) * (50.0 + percentages[i] / 2.0) / 100.0 + 0.5);
double q1 = samples[i1];
double q2 = samples[std::min(i2, samples.size() - 1)];
TDoubleDoublePr interval =
filter.marginalLikelihoodConfidenceInterval(percentages[i]);
LOG_DEBUG(<< "[q1, q2] = [" << q1 << ", " << q2 << "]"
<< ", interval = " << interval);
BOOST_REQUIRE_CLOSE_ABSOLUTE(q1, interval.first, std::max(0.1 * q1, 0.15));
BOOST_REQUIRE_CLOSE_ABSOLUTE(q2, interval.second, 0.1 * q2);
error.add(std::fabs(interval.first - q1) / q1);
error.add(std::fabs(interval.second - q2) / q2);
}
LOG_DEBUG(<< "error = " << maths::common::CBasicStatistics::mean(error));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(error) < 0.05);
}
LOG_DEBUG(<< "Problem Case (Issue 439)");
{
std::ifstream file{"testfiles/poorly_conditioned_multimodal.txt"};
BOOST_TEST_REQUIRE(file.is_open());
std::ostringstream state;
state << file.rdbuf();
std::istringstream stateStrm(state.str());
core::CJsonStateRestoreTraverser traverser(stateStrm);
maths::common::SDistributionRestoreParams params(
maths_t::E_ContinuousData, 0.0, maths::common::MINIMUM_CLUSTER_SPLIT_FRACTION,
maths::common::MINIMUM_CLUSTER_SPLIT_COUNT,
maths::common::MINIMUM_CATEGORY_COUNT);
TPriorPtr prior;
maths::common::CPriorStateSerialiser restorer;
BOOST_TEST_REQUIRE(restorer(params, prior, traverser));
TDoubleDoublePr median = prior->marginalLikelihoodConfidenceInterval(
0, maths_t::CUnitWeights::UNIT);
TDoubleDoublePr i90 = prior->marginalLikelihoodConfidenceInterval(
90, maths_t::CUnitWeights::UNIT);
LOG_DEBUG(<< "median = " << maths::common::CBasicStatistics::mean(median));
LOG_DEBUG(<< "confidence interval = " << i90);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(median) > i90.first);
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(median) < i90.second);
double lowerBound[2];
double upperBound[2];
prior->minusLogJointCdf({i90.first}, maths_t::CUnitWeights::SINGLE_UNIT,
lowerBound[0], upperBound[0]);
prior->minusLogJointCdf({i90.second}, maths_t::CUnitWeights::SINGLE_UNIT,
lowerBound[1], upperBound[1]);
BOOST_REQUIRE_CLOSE_ABSOLUTE(
0.05, std::exp(-(lowerBound[0] + upperBound[0]) / 2.0), 1e-3);
BOOST_REQUIRE_CLOSE_ABSOLUTE(
0.95, std::exp(-(lowerBound[1] + upperBound[1]) / 2.0), 1e-3);
}
LOG_DEBUG(<< "Non-unit count weight");
{
// The confidence interval should be independent of the sample
// count weight.
TDoubleVec modes[2];
rng.generateNormalSamples(10.0, 2.0, 50, modes[0]);
rng.generateNormalSamples(20.0, 2.0, 50, modes[1]);
TDoubleVec samples(modes[0].begin(), modes[0].end());
samples.insert(samples.end(), modes[1].begin(), modes[1].end());
CMultimodalPrior filter(makePrior());
filter.addSamples(samples);
BOOST_REQUIRE_EQUAL(2, filter.numberModes());
TDoubleDoublePr interval{filter.marginalLikelihoodConfidenceInterval(
90.0, maths_t::countWeight(1.0))};
TDoubleDoublePr weightedInterval{filter.marginalLikelihoodConfidenceInterval(
90.0, maths_t::countWeight(0.3))};
LOG_DEBUG(<< "interval = " << interval);
LOG_DEBUG(<< "weightedInterval = " << weightedInterval);
BOOST_REQUIRE_EQUAL(core::CContainerPrinter::print(interval),
core::CContainerPrinter::print(weightedInterval));
}
}
BOOST_AUTO_TEST_CASE(testSampleMarginalLikelihood) {
// We're going to test three properties of the sampling:
// 1) That the sample mean is equal to the marginal likelihood
// mean.
// 2) That the sample percentiles match the distribution
// percentiles.
// 3) That the sample mean, variance and skew are all close to
// the corresponding quantities in the training data.
//
// I want to cross check these with the implementations of the
// jointLogMarginalLikelihood and minusLogJointCdf so use these
// to compute the mean and percentiles.
using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator;
using TMeanVarAccumulator =
maths::common::CBasicStatistics::SSampleMeanVar<double>::TAccumulator;
using TMeanVarSkewAccumulator =
maths::common::CBasicStatistics::SSampleMeanVarSkew<double>::TAccumulator;
const double eps = 1e-3;
test::CRandomNumbers rng;
TDoubleVec samples1;
rng.generateNormalSamples(50.0, 1.0, 150, samples1);
TDoubleVec samples2;
rng.generateNormalSamples(57.0, 1.0, 100, samples2);
TDoubleVec samples;
samples.insert(samples.end(), samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
rng.random_shuffle(samples.begin(), samples.end());
CMultimodalPrior filter(makePrior());
TDouble1Vec sampled;
TMeanVarSkewAccumulator sampleMoments;
// The filter stays non-informative for the first few updates; check
// that sampling from it still produces sensible moments.
for (std::size_t i = 0; i < 3; ++i) {
LOG_DEBUG(<< "sample = " << samples[i]);
sampleMoments.add(samples[i]);
filter.addSamples(TDouble1Vec(1, samples[i]));
BOOST_REQUIRE_EQUAL(std::string("\nmultimodal non-informative"), filter.print());
filter.sampleMarginalLikelihood(10, sampled);
TMeanVarSkewAccumulator sampledMoments;
sampledMoments.add(sampled);
LOG_DEBUG(<< "Sample moments = " << sampleMoments
<< ", sampled moments = " << sampledMoments);
LOG_DEBUG(<< "sampleMean = " << maths::common::CBasicStatistics::mean(sampleMoments)
<< ", sampledMean = "
<< maths::common::CBasicStatistics::mean(sampledMoments));
LOG_DEBUG(<< "sampleVariance = "
<< maths::common::CBasicStatistics::variance(sampleMoments) << ", sampledVariance = "
<< maths::common::CBasicStatistics::variance(sampledMoments));
BOOST_REQUIRE_CLOSE_ABSOLUTE(
maths::common::CBasicStatistics::mean(sampleMoments),
maths::common::CBasicStatistics::mean(sampledMoments),
0.05 * maths::common::CBasicStatistics::mean(sampleMoments));
BOOST_REQUIRE_CLOSE_ABSOLUTE(
maths::common::CBasicStatistics::variance(sampleMoments),
maths::common::CBasicStatistics::variance(sampledMoments),
std::max(1e-6, 0.55 * maths::common::CBasicStatistics::variance(sampleMoments)));
}
TMeanAccumulator meanMeanError;
TMeanAccumulator meanVarError;
std::size_t numberSampled = 20;
for (std::size_t i = 3; i < samples.size(); ++i) {
LOG_DEBUG(<< "sample = " << samples[i]);
sampleMoments.add(samples[i]);
filter.addSamples(TDouble1Vec(1, samples[i]));
sampled.clear();
filter.sampleMarginalLikelihood(numberSampled, sampled);
BOOST_REQUIRE_EQUAL(numberSampled, sampled.size());
{
TMeanVarAccumulator sampledMoments;
sampledMoments = std::for_each(sampled.begin(), sampled.end(), sampledMoments);
LOG_DEBUG(<< "expectedMean = " << filter.marginalLikelihoodMean() << ", sampledMean = "
<< maths::common::CBasicStatistics::mean(sampledMoments));
LOG_DEBUG(<< "expectedVariance = " << filter.marginalLikelihoodVariance() << ", sampledVariance = "
<< maths::common::CBasicStatistics::variance(sampledMoments));
BOOST_REQUIRE_CLOSE_ABSOLUTE(filter.marginalLikelihoodMean(),
maths::common::CBasicStatistics::mean(sampledMoments),
0.005 * filter.marginalLikelihoodMean());
BOOST_REQUIRE_CLOSE_ABSOLUTE(filter.marginalLikelihoodVariance(),
maths::common::CBasicStatistics::variance(sampledMoments),
0.2 * filter.marginalLikelihoodVariance());
meanMeanError.add(std::fabs(filter.marginalLikelihoodMean() -
maths::common::CBasicStatistics::mean(sampledMoments)) /
filter.marginalLikelihoodMean());
meanVarError.add(std::fabs(filter.marginalLikelihoodVariance() -
maths::common::CBasicStatistics::variance(sampledMoments)) /
filter.marginalLikelihoodVariance());
}
std::sort(sampled.begin(), sampled.end());
for (std::size_t j = 1; j < sampled.size(); ++j) {
double q = 100.0 * static_cast<double>(j) /
static_cast<double>(sampled.size());
double expectedQuantile;
BOOST_TEST_REQUIRE(filter.marginalLikelihoodQuantileForTest(q, eps, expectedQuantile));
LOG_TRACE(<< "quantile = " << q << ", x_quantile = " << expectedQuantile << ", quantile range = ["
<< sampled[j - 1] << "," << sampled[j] << "]");
BOOST_TEST_REQUIRE(expectedQuantile >= 0.98 * sampled[j - 1]);
BOOST_TEST_REQUIRE(expectedQuantile <= 1.02 * sampled[j]);
}
}
LOG_DEBUG(<< "mean mean error = " << maths::common::CBasicStatistics::mean(meanMeanError));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanMeanError) < 0.0015);
LOG_DEBUG(<< "mean variance error = "
<< maths::common::CBasicStatistics::mean(meanVarError));
BOOST_TEST_REQUIRE(maths::common::CBasicStatistics::mean(meanVarError) < 0.04);
sampled.clear();
filter.sampleMarginalLikelihood(numberSampled, sampled);
TMeanVarSkewAccumulator sampledMoments;
for (std::size_t i = 0; i < sampled.size(); ++i) {
sampledMoments.add(sampled[i]);
}
LOG_DEBUG(<< "Sample moments = " << sampledMoments << ", sampled moments = " << sampleMoments);
BOOST_REQUIRE_CLOSE_ABSOLUTE(
maths::common::CBasicStatistics::mean(sampleMoments),
maths::common::CBasicStatistics::mean(sampledMoments),
1e-4 * maths::common::CBasicStatistics::mean(sampleMoments));
BOOST_REQUIRE_CLOSE_ABSOLUTE(
maths::common::CBasicStatistics::variance(sampleMoments),
maths::common::CBasicStatistics::variance(sampledMoments),
0.05 * maths::common::CBasicStatistics::variance(sampleMoments));
BOOST_REQUIRE_CLOSE_ABSOLUTE(
maths::common::CBasicStatistics::skewness(sampleMoments),
maths::common::CBasicStatistics::skewness(sampledMoments),
0.1 * maths::common::CBasicStatistics::skewness(sampleMoments));
}
BOOST_AUTO_TEST_CASE(testCdf) {
// Test error cases.
//
// Test some invariants:
// "cdf" + "cdf complement" = 1
// cdf x for x < 0 = 0
// cdf complement x for x < 0 = 1
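// Note that minusLogJointCdf returns -log(F(x)) via its bounds, so
// the first invariant is checked in the loop below as
// exp(-f) + exp(-fComplement) == 1 to working precision.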
const double locations[] = {1.0, 3.0};
const double squareScales[] = {0.5, 0.3};
const std::size_t n[] = {100u, 100u};
test::CRandomNumbers rng;
CGammaRateConjugate gamma(maths::common::CGammaRateConjugate::nonInformativePrior(
maths_t::E_ContinuousData));
CLogNormalMeanPrecConjugate logNormal(
maths::common::CLogNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData));
COneOfNPrior::TPriorPtrVec priors;
priors.push_back(COneOfNPrior::TPriorPtr(gamma.clone()));
priors.push_back(COneOfNPrior::TPriorPtr(logNormal.clone()));
COneOfNPrior modePrior(maths::common::COneOfNPrior(priors, maths_t::E_ContinuousData));
CMultimodalPrior filter(makePrior(&modePrior));
for (std::size_t i = 0; i < std::size(n); ++i) {
TDoubleVec samples;
rng.generateLogNormalSamples(locations[i], squareScales[i], n[i], samples);
filter.addSamples(samples);
}
double lowerBound;
double upperBound;
BOOST_TEST_REQUIRE(!filter.minusLogJointCdf(TDouble1Vec(), lowerBound, upperBound));
BOOST_TEST_REQUIRE(!filter.minusLogJointCdfComplement(TDouble1Vec(), lowerBound, upperBound));
BOOST_TEST_REQUIRE(filter.minusLogJointCdf(TDouble1Vec(1, -1.0), lowerBound, upperBound));
double f = (lowerBound + upperBound) / 2.0;
BOOST_TEST_REQUIRE(filter.minusLogJointCdfComplement(TDouble1Vec(1, -1.0),
lowerBound, upperBound));
double fComplement = (lowerBound + upperBound) / 2.0;
LOG_DEBUG(<< "log(F(x)) = " << -f << ", log(1 - F(x)) = " << fComplement);
BOOST_REQUIRE_CLOSE_ABSOLUTE(std::log(std::numeric_limits<double>::min()), -f, 1e-8);
BOOST_REQUIRE_CLOSE_ABSOLUTE(1.0, std::exp(-fComplement), 1e-8);
for (std::size_t j = 1; j < 1000; ++j) {
double x = static_cast<double>(j) / 2.0;
BOOST_TEST_REQUIRE(filter.minusLogJointCdf(TDouble1Vec(1, x), lowerBound, upperBound));
f = (lowerBound + upperBound) / 2.0;
BOOST_TEST_REQUIRE(filter.minusLogJointCdfComplement(
TDouble1Vec(1, x), lowerBound, upperBound));
fComplement = (lowerBound + upperBound) / 2.0;
LOG_TRACE(<< "log(F(x)) = " << (f == 0.0 ? f : -f) << ", log(1 - F(x)) = "
<< (fComplement == 0.0 ? fComplement : -fComplement));
BOOST_REQUIRE_CLOSE_ABSOLUTE(1.0, std::exp(-f) + std::exp(-fComplement), 1e-8);
}
}
BOOST_AUTO_TEST_CASE(testProbabilityOfLessLikelySamples) {
using TNormalVec = std::vector<boost::math::normal_distribution<>>;
using TLogNormalVec = std::vector<boost::math::lognormal_distribution<>>;
using TGammaVec = std::vector<boost::math::gamma_distribution<>>;
test::CRandomNumbers rng;
{
double weight1 = 0.5, weight2 = 0.5;
double mean1 = 50.0, mean2 = 57.0;
double variance1 = 1.0, variance2 = 1.0;
TDoubleVec samples1;
rng.generateNormalSamples(mean1, variance1,
static_cast<std::size_t>(10000.0 * weight1), samples1);
TDoubleVec samples2;
rng.generateNormalSamples(mean2, variance2,
static_cast<std::size_t>(10000.0 * weight2), samples2);
TDoubleVec samples;
samples.insert(samples.end(), samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
rng.random_shuffle(samples.begin(), samples.end());
TDoubleVec weights;
weights.push_back(weight1);
weights.push_back(weight2);
TNormalVec modes;
modes.push_back(boost::math::normal_distribution<>(mean1, variance1));
modes.push_back(boost::math::normal_distribution<>(mean2, variance2));
maths::common::CMixtureDistribution<boost::math::normal_distribution<>> mixture(
weights, modes);
CMultimodalPrior filter(makePrior());
filter.addSamples(samples);
LOG_DEBUG(<< "# modes = " << filter.numberModes());
double x[] = {46.0, 49.0, 54.0, 55.0, 68.0};
double error = 0.0;
for (std::size_t i = 0; i < std::size(x); ++i) {
double expectedProbability;
double deviation;
probabilityOfLessLikelySample(mixture, x[i], expectedProbability, deviation);
double lowerBound;
double upperBound;
filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, TDouble1Vec(1, x[i]), lowerBound, upperBound);
LOG_DEBUG(<< "lowerBound = " << lowerBound << ", upperBound = " << upperBound
<< ", expectedProbability = " << expectedProbability
<< ", deviation = " << deviation);
double probability = (lowerBound + upperBound) / 2.0;
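// Accumulate only the error outside a two standard deviation band
// around the Monte Carlo estimate; within the band the discrepancy
// is indistinguishable from sampling noise. (The same pattern is
// repeated for the log-normal and gamma mixtures below.)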
error += probability < expectedProbability - 2.0 * deviation
? (expectedProbability - 2.0 * deviation) - probability
: (probability > expectedProbability + 2.0 * deviation
? probability - (expectedProbability + 2.0 * deviation)
: 0.0);
BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedProbability, probability,
std::max(3.0 * deviation, 3e-5));
}
error /= static_cast<double>(std::size(x));
LOG_DEBUG(<< "error = " << error);
BOOST_TEST_REQUIRE(error < 0.001);
double lb, ub;
maths_t::ETail tail;
filter.probabilityOfLessLikelySamples(maths_t::E_TwoSided, {49.0},
maths_t::CUnitWeights::SINGLE_UNIT,
lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_LeftTail, tail);
filter.probabilityOfLessLikelySamples(maths_t::E_TwoSided, {54.0},
maths_t::CUnitWeights::SINGLE_UNIT,
lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_MixedOrNeitherTail, tail);
filter.probabilityOfLessLikelySamples(maths_t::E_TwoSided, {59.0},
maths_t::CUnitWeights::SINGLE_UNIT,
lb, ub, tail);
BOOST_REQUIRE_EQUAL(maths_t::E_RightTail, tail);
}
{
double weights[] = {0.6, 0.2, 0.2};
double locations[] = {1.0, 2.5, 4.0};
double squareScales[] = {0.1, 0.05, 0.3};
TDoubleVec samples;
samples.reserve(20000u);
for (std::size_t i = 0; i < std::size(weights); ++i) {
TDoubleVec modeSamples;
rng.generateLogNormalSamples(
locations[i], squareScales[i],
static_cast<std::size_t>(20000.0 * weights[i]), modeSamples);
samples.insert(samples.end(), modeSamples.begin(), modeSamples.end());
}
rng.random_shuffle(samples.begin(), samples.end());
TDoubleVec mixtureWeights(std::begin(weights), std::end(weights));
TLogNormalVec modes;
modes.push_back(boost::math::lognormal_distribution<>(
locations[0], std::sqrt(squareScales[0])));
modes.push_back(boost::math::lognormal_distribution<>(
locations[1], std::sqrt(squareScales[1])));
modes.push_back(boost::math::lognormal_distribution<>(
locations[2], std::sqrt(squareScales[2])));
maths::common::CMixtureDistribution<boost::math::lognormal_distribution<>> mixture(
mixtureWeights, modes);
CMultimodalPrior filter(makePrior());
filter.addSamples(samples);
LOG_DEBUG(<< "# modes = " << filter.numberModes());
double x[] = {2.0, 3.0, 9.0, 15.0, 18.0, 22.0, 40.0, 60.0, 80.0, 110.0};
double error = 0.0;
for (std::size_t i = 0; i < std::size(x); ++i) {
double expectedProbability;
double deviation;
probabilityOfLessLikelySample(mixture, x[i], expectedProbability, deviation);
double lowerBound;
double upperBound;
filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, TDouble1Vec(1, x[i]), lowerBound, upperBound);
LOG_DEBUG(<< "lowerBound = " << lowerBound << ", upperBound = " << upperBound
<< ", expectedProbability = " << expectedProbability
<< ", deviation = " << deviation);
double probability = (lowerBound + upperBound) / 2.0;
error += probability < expectedProbability - 2.0 * deviation
? (expectedProbability - 2.0 * deviation) - probability
: (probability > expectedProbability + 2.0 * deviation
? probability - (expectedProbability + 2.0 * deviation)
: 0.0);
BOOST_REQUIRE_CLOSE_ABSOLUTE(
expectedProbability, probability,
std::min(0.2 * expectedProbability + std::max(3.0 * deviation, 1e-10), 0.06));
}
error /= static_cast<double>(std::size(x));
LOG_DEBUG(<< "error = " << error);
BOOST_TEST_REQUIRE(error < 0.009);
}
{
double weights[] = {0.6, 0.4};
double shapes[] = {2.0, 300.0};
double scales[] = {0.5, 1.5};
TDoubleVec samples;
samples.reserve(20000u);
for (std::size_t i = 0; i < std::size(weights); ++i) {
TDoubleVec modeSamples;
rng.generateGammaSamples(shapes[i], scales[i],
static_cast<std::size_t>(20000.0 * weights[i]),
modeSamples);
samples.insert(samples.end(), modeSamples.begin(), modeSamples.end());
}
rng.random_shuffle(samples.begin(), samples.end());
TDoubleVec mixtureWeights(std::begin(weights), std::end(weights));
TGammaVec modes;
modes.push_back(boost::math::gamma_distribution<>(shapes[0], scales[0]));
modes.push_back(boost::math::gamma_distribution<>(shapes[1], scales[1]));
maths::common::CMixtureDistribution<boost::math::gamma_distribution<>> mixture(
mixtureWeights, modes);
CMultimodalPrior filter(makePrior());
filter.addSamples(samples);
LOG_DEBUG(<< "# modes = " << filter.numberModes());
double x[] = {0.5, 1.5, 3.0, 35.0, 100.0,
320.0, 340.0, 360.0, 380.0, 410.0};
double error = 0.0;
for (std::size_t i = 0; i < std::size(x); ++i) {
double expectedProbability;
double deviation;
probabilityOfLessLikelySample(mixture, x[i], expectedProbability, deviation);
double lowerBound;
double upperBound;
filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, TDouble1Vec(1, x[i]), lowerBound, upperBound);
LOG_DEBUG(<< "lowerBound = " << lowerBound << ", upperBound = " << upperBound
<< ", expectedProbability = " << expectedProbability
<< ", deviation = " << deviation);
double probability = (lowerBound + upperBound) / 2.0;
error += probability < expectedProbability - 2.0 * deviation
? (expectedProbability - 2.0 * deviation) - probability
: (probability > expectedProbability + 2.0 * deviation
? probability - (expectedProbability + 2.0 * deviation)
: 0.0);
BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedProbability, probability,
0.18 * expectedProbability +
std::max(2.5 * deviation, 1e-3));
}
error /= static_cast<double>(std::size(x));
LOG_DEBUG(<< "error = " << error);
BOOST_TEST_REQUIRE(error < 0.02);
}
}
BOOST_AUTO_TEST_CASE(testLargeValues) {
// Check that the confidence interval calculation stays
// well conditioned for very large values.
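// With raw values of order 1e10 to 1e11, intermediate quantities
// such as mode variances are of order 1e20 or more, so this
// exercises the numerics of the confidence interval calculation far
// from the scale of typical unit test data.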
TDoubleVec values{
7.324121e+10, 7.251927e+10, 7.152208e+10, 7.089604e+10,
7.018831e+10, 6.94266e+10, 6.890659e+10, 6.837292e+10,
6.794372e+10, 6.793463e+10, 6.785385e+10, 6.773589e+10,
6.787609e+10, 6.760049e+10, 6.709596e+10, 6.701824e+10,
6.672568e+10, 6.617609e+10, 6.620431e+10, 6.627069e+10,
6.617393e+10, 6.633176e+10, 6.600326e+10, 6.530363e+10,
6.494482e+10, 6.433443e+10, 6.362233e+10, 6.317814e+10,
6.296127e+10, 6.272491e+10, 6.243567e+10, 6.19567e+10,
6.13123e+10, 6.150823e+10, 6.160438e+10, 6.106396e+10,
6.128276e+10, 6.13318e+10, 6.161243e+10, 6.182719e+10,
6.177156e+10, 6.174539e+10, 6.216147e+10, 6.272091e+10,
6.286637e+10, 6.310137e+10, 6.315882e+10, 6.312109e+10,
6.312296e+10, 6.312432e+10, 6.328676e+10, 6.37708e+10,
6.421867e+10, 6.490675e+10, 6.547625e+10, 6.593425e+10,
6.67186e+10, 6.755033e+10, 6.754501e+10, 6.730381e+10,
6.76163e+10, 6.761007e+10, 6.745505e+10, 6.777796e+10,
6.783472e+10, 6.779558e+10, 6.787643e+10, 6.800003e+10,
6.840413e+10, 6.856255e+10, 6.939239e+10, 6.907512e+10,
6.914988e+10, 6.901868e+10, 6.884531e+10, 6.934499e+10,
6.955862e+10, 6.938019e+10, 6.942022e+10, 6.950912e+10,
6.979618e+10, 7.064871e+10, 7.152501e+10, 7.178129e+10,
7.2239e+10, 7.257321e+10, 7.28913e+10, 7.365193e+10,
7.432521e+10, 7.475098e+10, 7.553025e+10, 7.654561e+10,
7.698032e+10, 7.768267e+10, 7.826669e+10, 7.866854e+10,
7.924608e+10, 7.998602e+10, 8.038091e+10, 8.094976e+10,
8.145126e+10, 8.132123e+10, 8.142747e+10, 8.148276e+10,
8.118588e+10, 8.122279e+10, 8.078815e+10, 8.008936e+10,
7.991103e+10, 7.981722e+10, 7.932372e+10, 7.900164e+10,
7.881053e+10, 7.837734e+10, 7.847101e+10, 7.816575e+10,
7.789224e+10, 7.803634e+10, 7.827226e+10, 7.812112e+10,
7.814848e+10, 7.812407e+10, 7.779805e+10, 7.783394e+10,
7.768365e+10, 7.74484e+10, 7.740301e+10, 7.725512e+10,
7.666682e+10, 7.635862e+10, 7.592468e+10, 7.539656e+10,
7.529974e+10, 7.501661e+10, 7.442706e+10, 7.406878e+10,
7.347894e+10, 7.268775e+10, 7.23729e+10, 7.171337e+10,
7.146626e+10, 7.130693e+10, 7.066356e+10, 6.977915e+10,
6.915126e+10, 6.830462e+10, 6.73021e+10, 6.67686e+10,
6.600806e+10, 6.504958e+10, 6.427045e+10, 6.35093e+10,
6.277891e+10, 6.258429e+10, 6.184866e+10, 6.114754e+10,
6.093035e+10, 6.063859e+10, 5.999596e+10, 5.952608e+10,
5.927059e+10, 5.831014e+10, 5.763428e+10, 5.77239e+10,
5.82414e+10, 5.911797e+10, 5.987076e+10, 5.976584e+10,
6.017487e+10, 6.023042e+10, 6.029144e+10, 6.068466e+10,
6.139924e+10, 6.208432e+10, 6.259237e+10, 6.300856e+10,
6.342197e+10, 6.423638e+10, 6.494938e+10, 6.478293e+10,
6.444705e+10, 6.432593e+10, 6.437474e+10, 6.447832e+10,
6.450247e+10, 6.398122e+10, 6.399681e+10, 6.406744e+10,
6.404553e+10, 6.417746e+10, 6.39819e+10, 6.389218e+10,
6.453242e+10, 6.491168e+10, 6.493824e+10, 6.524365e+10,
6.537463e+10, 6.543864e+10, 6.583769e+10, 6.596521e+10,
6.641129e+10, 6.718787e+10, 6.741177e+10, 6.776819e+10,
6.786579e+10, 6.783788e+10, 6.790788e+10, 6.77233e+10,
6.738099e+10, 6.718351e+10, 6.739131e+10, 6.752051e+10,
6.747344e+10, 6.757187e+10, 6.739908e+10, 6.702725e+10,
6.70474e+10, 6.708783e+10, 6.72989e+10, 6.75298e+10,
6.727323e+10, 6.677787e+10, 6.686342e+10, 6.687026e+10,
6.714555e+10, 6.750766e+10, 6.807156e+10, 6.847816e+10,
6.915895e+10, 6.958225e+10, 6.970934e+10, 6.972807e+10,
6.973312e+10, 6.970858e+10, 6.962325e+10, 6.968693e+10,
6.965446e+10, 6.983768e+10, 6.974386e+10, 6.992195e+10,
7.010707e+10, 7.004337e+10, 7.006336e+10, 7.06312e+10,
7.078169e+10, 7.080609e+10, 7.107845e+10, 7.084754e+10,
7.032667e+10, 7.052029e+10, 7.031464e+10, 7.006906e+10,
7.018558e+10, 7.022278e+10, 7.012379e+10, 7.043974e+10,
7.016036e+10, 6.975801e+10, 6.95197e+10, 6.92444e+10,
6.85828e+10, 6.808828e+10, 6.74055e+10, 6.663602e+10,
6.588224e+10, 6.52747e+10, 6.412303e+10, 6.315978e+10,
6.268569e+10, 6.219346e+10, 6.177174e+10, 6.101807e+10,
6.018369e+10, 5.97554e+10, 5.924427e+10, 5.867325e+10,
5.814079e+10, 5.745633e+10, 5.641881e+10, 5.608709e+10,
5.529503e+10, 5.450575e+10, 5.383054e+10, 5.297568e+10,
5.210389e+10, 5.139513e+10, 5.03026e+10, 4.922761e+10,
4.839502e+10, 4.739353e+10, 4.605013e+10, 4.486422e+10,
4.369101e+10, 4.241115e+10, 4.128026e+10, 4.025775e+10,
3.915851e+10, 3.819004e+10, 3.700971e+10, 3.581475e+10,
3.498126e+10, 3.384422e+10, 3.224959e+10, 3.108637e+10,
2.997983e+10, 2.86439e+10, 2.774108e+10, 2.682793e+10,
2.590098e+10, 2.500665e+10, 2.368987e+10, 2.24582e+10,
2.158596e+10, 2.062636e+10, 1.942922e+10, 1.873734e+10,
1.823214e+10, 1.726518e+10, 1.665115e+10, 1.582729e+10,
1.477715e+10, 1.406265e+10, 1.285904e+10, 1.145722e+10,
1.038312e+10, 9.181713e+09, 8.141138e+09, 7.45358e+09,
6.59996e+09, 5.72857e+09, 5.136189e+09, 4.51829e+09,
3.649536e+09, 2.990132e+09, 2.29392e+09, 1.390141e+09,
5.611192e+08, -1.62469e+08, -1.041465e+09, -1.804217e+09,
-2.923116e+09, -4.205691e+09, -5.09832e+09, -6.12155e+09,
-7.10503e+09, -7.957297e+09, -9.107372e+09, -1.039097e+10,
-1.133152e+10, -1.221205e+10, -1.318018e+10, -1.402195e+10,
-1.512e+10, -1.634369e+10, -1.710999e+10, -1.786548e+10,
-1.866482e+10, -1.938912e+10, -2.039964e+10, -2.160603e+10,
-2.259855e+10, -2.353314e+10, -2.449689e+10, -2.52005e+10,
-2.627104e+10, -2.730019e+10, -2.815777e+10, -2.920027e+10,
-3.03507e+10, -3.126021e+10, -3.212383e+10, -3.329089e+10,
-3.402306e+10, -3.475361e+10, -3.572698e+10, -3.644467e+10,
-3.721484e+10, -3.800023e+10, -3.865459e+10, -3.918282e+10,
-3.983764e+10, -4.051065e+10, -4.119051e+10, -4.202436e+10,
-4.24868e+10, -4.340278e+10, -4.418258e+10, -4.490206e+10,
-4.587365e+10, -4.697342e+10, -4.778222e+10, -4.882614e+10,
-4.984197e+10, -5.051089e+10, -5.143766e+10, -5.252824e+10,
-5.353136e+10, -5.436329e+10, -5.533555e+10, -5.623246e+10,
-5.689744e+10, -5.798439e+10, -5.882786e+10, -5.96284e+10,
-6.061507e+10, -6.145417e+10, -6.235327e+10, -6.335978e+10,
-6.405788e+10, -6.496648e+10, -6.600807e+10, -6.686964e+10,
-6.782611e+10, -6.890904e+10, -6.941638e+10, -7.012465e+10,
-7.113145e+10, -7.186233e+10, -7.2293e+10, -7.313894e+10,
-7.394114e+10, -7.475566e+10, -7.572029e+10, -7.660066e+10,
-7.738602e+10, -7.846013e+10, -7.921084e+10, -7.986093e+10,
-8.07113e+10, -8.159104e+10, -8.243174e+10, -8.305353e+10,
-8.346367e+10, -8.402575e+10, -8.482895e+10, -8.536747e+10,
-8.581526e+10, -8.640365e+10, -8.683093e+10, -8.724777e+10,
-8.746026e+10, -8.760338e+10, -8.809235e+10, -8.870936e+10,
-8.905536e+10, -8.953669e+10, -9.031665e+10, -9.090067e+10,
-9.135409e+10, -9.185499e+10, -9.225697e+10, -9.253896e+10,
-9.314785e+10, -9.354807e+10, -9.391591e+10, -9.436751e+10,
-9.471133e+10, -9.517393e+10, -9.587184e+10, -9.619209e+10,
-9.607482e+10, -9.593427e+10, -9.604743e+10, -9.619758e+10,
-9.62449e+10, -9.61466e+10, -9.636941e+10, -9.692289e+10,
-9.735416e+10, -9.774056e+10, -9.828883e+10, -9.859253e+10,
-9.888183e+10, -9.95351e+10, -1.001142e+11};
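// Build a multimodal prior whose mode prior selects between gamma,
// log-normal and normal candidate models for each cluster.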
maths::common::CGammaRateConjugate gammaPrior =
maths::common::CGammaRateConjugate::nonInformativePrior(
maths_t::E_ContinuousData, 0.2, 0.001);
maths::common::CNormalMeanPrecConjugate normalPrior =
maths::common::CNormalMeanPrecConjugate::nonInformativePrior(
maths_t::E_ContinuousData, 0.001);
maths::common::CLogNormalMeanPrecConjugate logNormalPrior =
maths::common::CLogNormalMeanPrecConjugate::nonInformativePrior(
maths_t::E_ContinuousData, 0.2, 0.001);
maths::common::COneOfNPrior::TPriorPtrVec modePriors;
modePriors.reserve(3u);
modePriors.push_back(TPriorPtr(gammaPrior.clone()));
modePriors.push_back(TPriorPtr(logNormalPrior.clone()));
modePriors.push_back(TPriorPtr(normalPrior.clone()));
maths::common::COneOfNPrior modePrior(modePriors, maths_t::E_ContinuousData, 0.001);
maths::common::CXMeansOnline1d clusterer(
maths_t::E_ContinuousData, maths::common::CAvailableModeDistributions::ALL,
maths_t::E_ClustersFractionWeight, 0.001, 0.05, 12, 0.8 / 3.0);
maths::common::CMultimodalPrior multimodalPrior(maths_t::E_ContinuousData,
clusterer, modePrior, 0.001);
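// Feed the values in one at a time and check that, once the prior is
// informative, the width of the 95% marginal likelihood confidence
// interval stays bounded (below 3e11).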
for (auto value : values) {
multimodalPrior.addSamples({value}, {maths_t::countWeight(1.0 / 3.0)});
if (!multimodalPrior.isNonInformative()) {
TDoubleDoublePr interval = multimodalPrior.marginalLikelihoodConfidenceInterval(
95.0, maths_t::CUnitWeights::UNIT);
if (interval.second - interval.first >= 3e11) {
LOG_DEBUG(<< "interval = " << interval.second - interval.first);
LOG_DEBUG(<< multimodalPrior.print());
}
BOOST_TEST_REQUIRE(interval.second - interval.first < 3e11);
}
}
}
BOOST_AUTO_TEST_CASE(testSeasonalVarianceScale) {
// We test that:
// 1) The marginal likelihood is normalized.
// 2) E[(X - m)^2] w.r.t. the log-likelihood is scaled.
// 3) E[(X - m)^2] is close to marginalLikelihoodVariance.
// 4) dF/dx = exp(log-likelihood) with different scales.
// 5) The probability of less likely samples transforms as
// expected.
// 6) Updating with scaled samples behaves as expected.
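// Note that scaling the variance by vs about the marginal likelihood
// mean m maps X to m + sqrt(vs) * (X - m). A point x evaluated with
// scale vs is therefore equivalent to the point m + (x - m) / sqrt(vs)
// evaluated with unit scale, which is what check 5) relies on.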
const double mean1 = 6.0;
const double variance1 = 4.0;
const double mean2 = 20.0;
const double variance2 = 20.0;
const double mean3 = 50.0;
const double variance3 = 20.0;
test::CRandomNumbers rng;
TDoubleVec samples1;
rng.generateNormalSamples(mean1, variance1, 100, samples1);
TDoubleVec samples2;
rng.generateNormalSamples(mean2, variance2, 100, samples2);
TDoubleVec samples3;
rng.generateNormalSamples(mean3, variance3, 100, samples3);
double varianceScales[] = {0.2, 0.5, 1.0, 2.0, 5.0};
maths_t::TDoubleWeightsAry weight(maths_t::CUnitWeights::UNIT);
double m;
double v;
{
CMultimodalPrior filter(makePrior());
filter.addSamples(samples1);
filter.addSamples(samples2);
filter.addSamples(samples3);
m = filter.marginalLikelihoodMean();
v = filter.marginalLikelihoodVariance();
LOG_DEBUG(<< "v = " << v);
double points[] = {0.5, 4.0, 12.0, 20.0, 40.0, 50.0, 60.0};
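// Baseline for 3): numerically integrate E[(X - m)^2] against the
// unscaled marginal likelihood and check that it agrees with
// marginalLikelihoodVariance.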
double unscaledExpectationVariance;
filter.expectation(CVarianceKernel(filter.marginalLikelihoodMean()), 50,
unscaledExpectationVariance);
LOG_DEBUG(<< "unscaledExpectationVariance = " << unscaledExpectationVariance);
BOOST_REQUIRE_CLOSE_ABSOLUTE(v, unscaledExpectationVariance,
1e-2 * unscaledExpectationVariance);
for (std::size_t i = 0; i < std::size(varianceScales); ++i) {
double vs = varianceScales[i];
maths_t::setSeasonalVarianceScale(vs, weight);
LOG_DEBUG(<< "*** variance scale = " << vs << " ***");
double Z;
filter.expectation(C1dUnitKernel(), 50, Z, weight);
LOG_DEBUG(<< "Z = " << Z);
BOOST_REQUIRE_CLOSE_ABSOLUTE(1.0, Z, 1e-3);
LOG_DEBUG(<< "sv = " << filter.marginalLikelihoodVariance(weight));
double expectationVariance;
filter.expectation(CVarianceKernel(filter.marginalLikelihoodMean()),
50, expectationVariance, weight);
LOG_DEBUG(<< "expectationVariance = " << expectationVariance);
BOOST_REQUIRE_CLOSE_ABSOLUTE(vs * unscaledExpectationVariance, expectationVariance,
1e-3 * vs * unscaledExpectationVariance);
BOOST_REQUIRE_CLOSE_ABSOLUTE(
filter.marginalLikelihoodVariance(weight), expectationVariance,
1e-3 * filter.marginalLikelihoodVariance(weight));
TDouble1Vec sample(1, 0.0);
for (std::size_t j = 0; j < std::size(points); ++j) {
TDouble1Vec x(1, points[j]);
double fx;
filter.jointLogMarginalLikelihood(x, {weight}, fx);
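// 4) Approximate dF/dx by a central finite difference of the c.d.f.
// and compare it with exp(log(f(x))).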
TDouble1Vec xMinusEps(1, points[j] - 1e-3);
TDouble1Vec xPlusEps(1, points[j] + 1e-3);
double lb, ub;
filter.minusLogJointCdf(xPlusEps, {weight}, lb, ub);
double FxPlusEps = std::exp(-(lb + ub) / 2.0);
filter.minusLogJointCdf(xMinusEps, {weight}, lb, ub);
double FxMinusEps = std::exp(-(lb + ub) / 2.0);
LOG_DEBUG(<< "x = " << points[j] << ", log(f(x)) = " << fx << ", log(dF/dx)) = "
<< std::log((FxPlusEps - FxMinusEps) / 2e-3));
BOOST_REQUIRE_CLOSE_ABSOLUTE(fx, std::log((FxPlusEps - FxMinusEps) / 2e-3),
0.05 * std::fabs(fx));
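// 5) The probability of less likely samples at x with scale vs
// should match the probability at m + (x - m) / sqrt(vs) with unit
// scale.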
sample[0] = m + (points[j] - m) / std::sqrt(vs);
maths_t::setSeasonalVarianceScale(1.0, weight);
double expectedLowerBound;
double expectedUpperBound;
maths_t::ETail expectedTail;
filter.probabilityOfLessLikelySamples(maths_t::E_TwoSided, sample,
{weight}, expectedLowerBound,
expectedUpperBound, expectedTail);
sample[0] = points[j];
maths_t::setSeasonalVarianceScale(vs, weight);
double lowerBound;
double upperBound;
maths_t::ETail tail;
filter.probabilityOfLessLikelySamples(
maths_t::E_TwoSided, sample, {weight}, lowerBound, upperBound, tail);
LOG_DEBUG(<< "expectedLowerBound = " << expectedLowerBound);
LOG_DEBUG(<< "lowerBound = " << lowerBound);
LOG_DEBUG(<< "expectedUpperBound = " << expectedUpperBound);
LOG_DEBUG(<< "upperBound = " << upperBound);
LOG_DEBUG(<< "expectedTail = " << expectedTail);
LOG_DEBUG(<< "tail = " << tail);
if ((expectedLowerBound + expectedUpperBound) < 0.02) {
BOOST_REQUIRE_CLOSE_ABSOLUTE(
std::log(expectedLowerBound), std::log(lowerBound),
0.1 * std::fabs(std::log(expectedLowerBound)));
BOOST_REQUIRE_CLOSE_ABSOLUTE(
std::log(expectedUpperBound), std::log(upperBound),
0.1 * std::fabs(std::log(expectedUpperBound)));
} else {
BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedLowerBound, lowerBound,
0.05 * expectedLowerBound);
BOOST_REQUIRE_CLOSE_ABSOLUTE(expectedUpperBound, upperBound,
0.05 * expectedUpperBound);
}
BOOST_REQUIRE_EQUAL(expectedTail, tail);
}
}
}
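// 6) Adding unscaled samples tagged with a seasonal variance scale
// vs should recover the same mean and an unscaled variance of
// approximately v / vs.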
for (std::size_t i = 0; i < std::size(varianceScales); ++i) {
double vs = varianceScales[i];
TDouble1Vec samples(samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
samples.insert(samples.end(), samples3.begin(), samples3.end());
rng.random_shuffle(samples.begin(), samples.end());
CMultimodalPrior filter(makePrior());
maths_t::setSeasonalVarianceScale(vs, weight);
for (std::size_t j = 0; j < samples.size(); ++j) {
filter.addSamples({samples[j]}, {weight});
}
double sm = filter.marginalLikelihoodMean();
double sv = filter.marginalLikelihoodVariance();
LOG_DEBUG(<< "m = " << m << ", v = " << v);
LOG_DEBUG(<< "sm = " << sm << ", sv = " << sv);
BOOST_REQUIRE_CLOSE_ABSOLUTE(m, sm, 0.12 * m);
BOOST_REQUIRE_CLOSE_ABSOLUTE(v / vs, sv, 0.07 * v / vs);
}
}
BOOST_AUTO_TEST_CASE(testPersist) {
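// Check that persisting and restoring the filter preserves its state:
// the restored filter should reproduce the original checksum and
// re-persist to identical JSON.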
test::CRandomNumbers rng;
TDoubleVec samples1;
rng.generateNormalSamples(5.0, 1.0, 100, samples1);
TDoubleVec samples2;
rng.generateLogNormalSamples(3.0, 0.1, 200, samples2);
TDoubleVec samples;
samples.insert(samples.end(), samples1.begin(), samples1.end());
samples.insert(samples.end(), samples2.begin(), samples2.end());
rng.random_shuffle(samples.begin(), samples.end());
maths::common::CXMeansOnline1d clusterer(
maths_t::E_ContinuousData, maths::common::CAvailableModeDistributions::ALL,
maths_t::E_ClustersFractionWeight);
maths::common::CGammaRateConjugate gamma = maths::common::CGammaRateConjugate::nonInformativePrior(
maths_t::E_ContinuousData, 0.01);
maths::common::CLogNormalMeanPrecConjugate logNormal =
maths::common::CLogNormalMeanPrecConjugate::nonInformativePrior(
maths_t::E_ContinuousData, 0.01);
maths::common::CNormalMeanPrecConjugate normal =
maths::common::CNormalMeanPrecConjugate::nonInformativePrior(maths_t::E_ContinuousData);
COneOfNPrior::TPriorPtrVec priors;
priors.push_back(COneOfNPrior::TPriorPtr(gamma.clone()));
priors.push_back(COneOfNPrior::TPriorPtr(logNormal.clone()));
priors.push_back(COneOfNPrior::TPriorPtr(normal.clone()));
COneOfNPrior modePrior(maths::common::COneOfNPrior(priors, maths_t::E_ContinuousData));
maths::common::CMultimodalPrior origFilter(maths_t::E_ContinuousData, clusterer, modePrior);
for (std::size_t i = 0; i < samples.size(); ++i) {
origFilter.addSamples({samples[i]}, maths_t::CUnitWeights::SINGLE_UNIT);
}
double decayRate = origFilter.decayRate();
std::uint64_t checksum = origFilter.checksum();
std::ostringstream origJson;
core::CJsonStatePersistInserter::persist(
origJson, std::bind_front(&maths::common::CMultimodalPrior::acceptPersistInserter,
&origFilter));
LOG_DEBUG(<< "Multimodal JSON representation:\n" << origJson.str());
// Restore the JSON into a new filter
std::istringstream origJsonStrm{"{\"topLevel\" : " + origJson.str() + "}"};
core::CJsonStateRestoreTraverser traverser(origJsonStrm);
maths::common::SDistributionRestoreParams params(
maths_t::E_ContinuousData, decayRate + 0.1, maths::common::MINIMUM_CLUSTER_SPLIT_FRACTION,
maths::common::MINIMUM_CLUSTER_SPLIT_COUNT, maths::common::MINIMUM_CATEGORY_COUNT);
maths::common::CMultimodalPrior restoredFilter(params, traverser);
LOG_DEBUG(<< "orig checksum = " << checksum
<< " restored checksum = " << restoredFilter.checksum());
BOOST_REQUIRE_EQUAL(checksum, restoredFilter.checksum());
// The JSON representation of the new filter should be the same as the original
std::ostringstream newJson;
core::CJsonStatePersistInserter::persist(
newJson, std::bind_front(&maths::common::CMultimodalPrior::acceptPersistInserter,
&restoredFilter));
BOOST_REQUIRE_EQUAL(origJson.str(), newJson.str());
}
BOOST_AUTO_TEST_SUITE_END()