in lib/maths/common/CLogNormalMeanPrecConjugate.cc [723:933]
void CLogNormalMeanPrecConjugate::addSamples(const TDouble1Vec& samples,
const TDoubleWeightsAry1Vec& weights) {
if (samples.empty()) {
return;
}
if (samples.size() != weights.size()) {
LOG_ERROR(<< "Mismatch in samples '" << samples << "' and weights '"
<< weights << "'");
return;
}
this->adjustOffset(samples, weights);
this->CPrior::addSamples(samples, weights);
// We assume the data are described by X = exp(Y) - u, where Y is normally
// distributed and u is a constant offset.
//
// If y = {y(i)} denotes the sample vector, then x = {y(i) + u} are
// log-normally distributed with mean m' and inverse scale s', and the
// likelihood function is:
// likelihood(x | m', s') ~
// Product_i{ 1 / x(i) * exp(-s' * (log(x(i)) - m')^2 / 2) }.
//
// The conjugate joint prior for m' and s' is gamma-normal and the update
// of the posterior with n independent samples comes from:
// likelihood(x | m', s') * prior(m', s' | m, p, a, b) (1)
//
// where,
// prior(m', s' | m, p, a, b) ~
// (s' * p)^(1/2) * exp(-s' * p * (m' - m)^2 / 2)
// * s'^(a - 1) * exp(-b * s')
//
// i.e. the conditional distribution of the mean is Gaussian with mean m
// and precision s' * p and the marginal distribution of s' is gamma with
// shape a and rate b. Equation (1) implies that the parameters of the prior
// distribution update as follows:
// m -> (p * m + n * mean(log(x))) / (p + n)
// p -> p + n
// a -> a + n/2
// b -> b + 1/2 * (n * var(log(x)) + p * n * (mean(log(x)) - m)^2 / (p + n))
//
// where,
// mean(.) is the sample mean function.
// var(.) is the sample variance function.
//
// Note that the weight of the sample x(i) is interpreted as its count,
// i.e. n(i), so for example updating with {(x, 2)} is equivalent to
// updating with {x, x}.
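//
// For example (an illustrative calculation only, not values used anywhere
// in this class): starting from m = 0, p = 1, a = 1, b = 1 and updating
// with a single sample with log(x) = 2 and count n = 1 gives
// m -> (1 * 0 + 1 * 2) / (1 + 1) = 1
// p -> 1 + 1 = 2
// a -> 1 + 1/2 = 1.5
// b -> 1 + 1/2 * (1 * 0 + 1 * 1 * (2 - 0)^2 / (1 + 1)) = 2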
//
// If the data are discrete then we approximate the discrete distribution
// by saying it is uniform on the interval [k, k+1] for each integer k.
// This is like saying that the data are samples from:
// X' = X + Z
//
// where,
// Z is uniform in the interval [0,1].
//
// We care about the limiting behaviour of the filter, i.e. as the number
// of samples n -> inf. In this case, the uniform law of large numbers
// gives that:
// 1/n * Sum_i( f(x(i) + z(i)) ) -> 1/n * Sum_i( E[ f(x(i) + Z) ] )
//
// We use this to evaluate:
// mean(log(x(i) + z(i)))
// -> 1/n * Sum_i( (x(i) + 1) * log(x(i) + 1) - x(i) * log(x(i)) - 1 )
//
// To evaluate var(log(x(i) + z(i))) we use numerical integration for
// simplicity because naive calculation of the exact integral suffers from
// cancellation errors.
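//
// For reference, the per-sample term above is just the elementary integral
// E[ log(x + Z) ] = Integral_{0}^{1} log(x + z) dz
// = [ (x + z) * log(x + z) - (x + z) ] from z = 0 to z = 1
// = (x + 1) * log(x + 1) - x * log(x) - 1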
double numberSamples = 0.0;
double scaledNumberSamples = 0.0;
double logSamplesMean = 0.0;
double logSamplesSquareDeviation = 0.0;
double r = m_GammaRate / m_GammaShape;
double s = std::exp(-r);
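// Note that r = m_GammaRate / m_GammaShape is roughly the prior estimate of
// the variance of log(x) (the reciprocal of the expected precision) and
// s = exp(-r). These are used below to account for seasonal and count
// variance scales: scaling the variance of a log-normal by a factor g,
// holding its mean fixed, changes the log-space variance from r to
// t = log(1 + g * (exp(r) - 1)) = r + log(s + g * (1 - s)) and shifts the
// log-space mean by (r - t) / 2. The per-sample t and shift in the loops
// below follow from this, and scale = t / r down-weights the sample count.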
if (this->isInteger()) {
// Used first to stash the per-sample variance scales and then filled
// in with the samples rescaled to have approximately unit variance
// scale.
TDouble1Vec scaledSamples;
scaledSamples.resize(samples.size(), 1.0);
TMeanAccumulator logSamplesMean_;
for (std::size_t i = 0; i < samples.size(); ++i) {
double x = samples[i] + m_Offset;
if (x <= 0.0 || !CMathsFuncs::isFinite(x) ||
!CMathsFuncs::isFinite(weights[i])) {
LOG_ERROR(<< "Discarding sample = " << x << ", weights = " << weights[i]);
continue;
}
double n = maths_t::countForUpdate(weights[i]);
double varianceScale = maths_t::seasonalVarianceScale(weights[i]) *
maths_t::countVarianceScale(weights[i]);
numberSamples += n;
double t = varianceScale == 1.0
? r
: r + std::log(s + varianceScale * (1.0 - s));
double shift = (r - t) / 2.0;
double scale = r == t ? 1.0 : t / r;
scaledSamples[i] = scale;
double logxInvPlus1 = std::log(1.0 / x + 1.0);
double logxPlus1 = std::log(x + 1.0);
logSamplesMean_.add(x * logxInvPlus1 + logxPlus1 - 1.0 - shift, n / scale);
}
scaledNumberSamples = CBasicStatistics::count(logSamplesMean_);
logSamplesMean = CBasicStatistics::mean(logSamplesMean_);
double mean = (m_GaussianPrecision * m_GaussianMean + scaledNumberSamples * logSamplesMean) /
(m_GaussianPrecision + scaledNumberSamples);
for (std::size_t i = 0; i < scaledSamples.size(); ++i) {
double scale = scaledSamples[i];
scaledSamples[i] =
scale == 1.0 ? samples[i] + m_Offset
: std::exp(mean + (std::log(samples[i] + m_Offset) - mean) /
std::sqrt(scale));
}
detail::CLogSampleSquareDeviation deviationFunction(scaledSamples, weights,
logSamplesMean);
CIntegration::gaussLegendre<CIntegration::OrderFive>(
deviationFunction, 0.0, 1.0, logSamplesSquareDeviation);
} else {
TMeanVarAccumulator logSamplesMoments;
for (std::size_t i = 0; i < samples.size(); ++i) {
double x = samples[i] + m_Offset;
if (x <= 0.0 || !CMathsFuncs::isFinite(x) ||
!CMathsFuncs::isFinite(weights[i])) {
LOG_ERROR(<< "Discarding sample = " << x << ", weights = " << weights[i]);
continue;
}
double n = maths_t::countForUpdate(weights[i]);
double varianceScale = maths_t::seasonalVarianceScale(weights[i]) *
maths_t::countVarianceScale(weights[i]);
numberSamples += n;
double t = varianceScale == 1.0
? r
: r + std::log(s + varianceScale * (1.0 - s));
double scale = r == t ? 1.0 : t / r;
double shift = (r - t) / 2.0;
logSamplesMoments.add(std::log(x) - shift, n / scale);
}
scaledNumberSamples = CBasicStatistics::count(logSamplesMoments);
logSamplesMean = CBasicStatistics::mean(logSamplesMoments);
logSamplesSquareDeviation = (scaledNumberSamples - 1.0) *
CBasicStatistics::variance(logSamplesMoments);
}
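// Update the posterior parameters as per the equations in the comment at
// the top of this function: m_GammaShape and m_GammaRate are the gamma
// shape a and rate b, and m_GaussianMean and m_GaussianPrecision are the
// Gaussian mean m and precision p. Note that logSamplesSquareDeviation
// already holds the (weighted) sum of squared deviations of log(x) about
// its mean, i.e. it plays the role of n * var(log(x)) in the rate update.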
m_GammaShape += 0.5 * numberSamples;
m_GammaRate += 0.5 * (logSamplesSquareDeviation +
m_GaussianPrecision * scaledNumberSamples *
CTools::pow2(logSamplesMean - m_GaussianMean) /
(m_GaussianPrecision + scaledNumberSamples));
m_GaussianMean = (m_GaussianPrecision * m_GaussianMean + scaledNumberSamples * logSamplesMean) /
(m_GaussianPrecision + scaledNumberSamples);
m_GaussianPrecision += scaledNumberSamples;
// If the coefficient of variation of the data is too small we run
// into numerical problems. We truncate the variation by modeling
// the impact of an actual variation (standard deviation divided by
// mean) in the data of size MINIMUM_COEFFICIENT_OF_VARIATION on the
// prior parameters.
if (m_GaussianPrecision > 1.5) {
// The idea is to model the impact of a coefficient of variation
// equal to MINIMUM_COEFFICIENT_OF_VARIATION on the parameters of
// the prior it affects: it enters into the gamma rate, via the
// variance of the log of the samples, and into the Gaussian mean,
// via the mean of the log of the samples. Note that, writing
// d(i) = x'(i) - m,
// mean(log(x'(i))) = 1/n * Sum_i{ log(m + (x'(i) - m)) }
// = 1/n * Sum_i{ log(m) + log(1 + d(i) / m) }
// ~ 1/n * Sum_i{ log(m) + d(i) / m - (d(i) / m)^2 / 2 }
//
// where x(i) are our true samples, which are all very nearly m
// (since their variation is tiny by assumption), and x'(i) are
// our samples assuming the minimum coefficient of variation.
// Finally, we note that:
// E[d(i)] = 0
// E[(d(i) / m)^2] = "coefficient of variation"^2
//
// From which we derive the results below.
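//
// Specifically, mean(log(x'(i))) ~ log(m) - "coefficient of variation"^2 / 2,
// so when we floor the rate at minimumRate below, the implied extra
// variation of the log samples is (minimumRate - m_GammaRate) /
// (m_GaussianPrecision - 1.0) and we shift the Gaussian mean down by half
// of that amount to match.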
double minimumRate = (2.0 * m_GammaShape - 1.0) *
CTools::pow2(MINIMUM_COEFFICIENT_OF_VARIATION);
if (m_GammaRate < minimumRate) {
double extraVariation = (minimumRate - m_GammaRate) /
(m_GaussianPrecision - 1.0);
m_GammaRate = minimumRate;
m_GaussianMean -= 0.5 * extraVariation;
}
}
LOG_TRACE(<< "logSamplesMean = " << logSamplesMean << ", logSamplesSquareDeviation = "
<< logSamplesSquareDeviation << ", numberSamples = " << numberSamples
<< ", scaledNumberSamples = " << scaledNumberSamples);
LOG_TRACE(<< "m_GammaShape = " << m_GammaShape << ", m_GammaRate = " << m_GammaRate
<< ", m_GaussianMean = " << m_GaussianMean << ", m_GaussianPrecision = "
<< m_GaussianPrecision << ", m_Offset = " << m_Offset);
if (this->isBad()) {
LOG_ERROR(<< "Update failed (" << this->debug() << ")");
LOG_ERROR(<< "samples = " << samples);
LOG_ERROR(<< "weights = " << weights);
this->setToNonInformative(this->offsetMargin(), this->decayRate());
}
}