void CLogNormalMeanPrecConjugate::addSamples()

in lib/maths/common/CLogNormalMeanPrecConjugate.cc [723:933]


void CLogNormalMeanPrecConjugate::addSamples(const TDouble1Vec& samples,
                                             const TDoubleWeightsAry1Vec& weights) {
    if (samples.empty()) {
        return;
    }
    if (samples.size() != weights.size()) {
        LOG_ERROR(<< "Mismatch in samples '" << samples << "' and weights '"
                  << weights << "'");
        return;
    }

    this->adjustOffset(samples, weights);
    this->CPrior::addSamples(samples, weights);

    // We assume the data are described by X = exp(Y) - u where Y is normally
    // distributed and u is a constant offset.
    //
    // If y = {y(i)} denotes the sample vector, then x = {y(i) + u} are
    // log-normally distributed, where m' and s' denote the mean and
    // precision of log(x), and the likelihood function is:
    //   likelihood(x | m', s') ~
    //       Product_i{ s'^(1/2) / x(i) * exp(-s' * (log(x(i)) - m')^2 / 2) }.
    //
    // The conjugate joint prior for m' and s' is gamma-normal and the update
    // of the posterior with n independent samples comes from:
    //   likelihood(x | m', s') * prior(m', s' | m, p, a, b)            (1)
    //
    // where,
    //   prior(m', s' | m, p, a, b) ~
    //       (s' * p)^(1/2) * exp(-s' * p * (m' - m)^2 / 2)
    //       * s'^(a - 1) * exp(-b * s')
    //
    // i.e. the conditional distribution of the mean is Gaussian with mean m
    // and precision s' * p and the marginal distribution of s' is gamma with
    // shape a and rate b. Equation (1) implies that the parameters of the prior
    // distribution update as follows:
    //   m -> (p * m + n * mean(log(x))) / (p + n)
    //   p -> p + n
    //   a -> a + n/2
    //   b -> b + 1/2 * (n * var(log(x)) + p * n * (mean(log(x)) - m)^2 / (p + n))
    //
    // where,
    //   mean(.) is the sample mean function.
    //   var(.) is the sample variance function.
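    //
    // These updates follow from (1) by completing the square in m' and
    // collecting powers of s'.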
    //
    // Note that the weight of the sample x(i) is interpreted as its count,
    // i.e. n(i), so for example updating with {(x, 2)} is equivalent to
    // updating with {x, x}.
    //
    // If the data are discrete then we approximate the discrete distribution
    // by saying it is uniform on the intervals [n,n+1] for each integral n.
    // This is like saying that the data are samples from:
    //   X' = X + Z
    //
    // where,
    //   Z is uniform in the interval [0,1].
    //
    // We care about the limiting behaviour of the filter, i.e. as the number
    // of samples n -> inf. In this case, the uniform law of large numbers
    // gives that:
    //   1/n * Sum_i( f(x(i) + z(i)) ) -> 1/n * Sum_i( E[ f(x(i) + Z) ] )
    //
    // We use this to evaluate:
    //   mean(log(x(i) + z(i)))
    //       -> 1/n * Sum_i( (x(i) + 1) * log(x(i) + 1) - x(i) * log(x(i)) - 1 )
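    //
    // since, for Z uniform in [0,1],
    //   E[ log(x + Z) ] = Integral_{[0,1]}{ log(x + z) }dz
    //                   = (x + 1) * log(x + 1) - x * log(x) - 1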
    //
    // To evaluate var(log(x(i) + z(i))) we use numerical integration for
    // simplicity because naive calculation of the exact integral suffers from
    // cancellation errors.

    double numberSamples = 0.0;
    double scaledNumberSamples = 0.0;
    double logSamplesMean = 0.0;
    double logSamplesSquareDeviation = 0.0;

    // Note that r = 1 / E[s'] is an estimate of the variance of the log
    // samples implied by the current gamma prior.
    double r = m_GammaRate / m_GammaShape;
    double s = std::exp(-r);
    if (this->isInteger()) {
        // Initially holds the variance scale of each sample; once the mean
        // estimate is available it is overwritten with the samples rescaled
        // to have approximately unit variance scale.
        TDouble1Vec scaledSamples;
        scaledSamples.resize(samples.size(), 1.0);

        TMeanAccumulator logSamplesMean_;
        for (std::size_t i = 0; i < samples.size(); ++i) {
            double x = samples[i] + m_Offset;
            if (x <= 0.0 || !CMathsFuncs::isFinite(x) ||
                !CMathsFuncs::isFinite(weights[i])) {
                LOG_ERROR(<< "Discarding sample = " << x << ", weights = " << weights[i]);
                continue;
            }

            double n = maths_t::countForUpdate(weights[i]);
            double varianceScale = maths_t::seasonalVarianceScale(weights[i]) *
                                   maths_t::countVarianceScale(weights[i]);

            numberSamples += n;
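            // Moment matching a log-normal whose variance is scaled by a
            // factor v, while preserving its mean, maps the expected
            // log-variance r to t = log(1 + v * (exp(r) - 1)), i.e.
            // r + log(s + v * (1 - s)), and shifts the log-mean by
            // (r - t) / 2. The count is divided by scale = t / r since a
            // sample with larger log-variance carries correspondingly less
            // information about the unit scale precision.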
            double t = varianceScale == 1.0
                           ? r
                           : r + std::log(s + varianceScale * (1.0 - s));
            double shift = (r - t) / 2.0;
            double scale = r == t ? 1.0 : t / r;
            scaledSamples[i] = scale;
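            // This is (x + 1) * log(x + 1) - x * log(x) - 1 - shift written
            // as x * log(1 + 1/x) + log(x + 1) - 1 - shift, which avoids
            // cancellation between two large terms when x is large.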
            double logxInvPlus1 = std::log(1.0 / x + 1.0);
            double logxPlus1 = std::log(x + 1.0);
            logSamplesMean_.add(x * logxInvPlus1 + logxPlus1 - 1.0 - shift, n / scale);
        }
        scaledNumberSamples = CBasicStatistics::count(logSamplesMean_);
        logSamplesMean = CBasicStatistics::mean(logSamplesMean_);

        double mean = (m_GaussianPrecision * m_GaussianMean + scaledNumberSamples * logSamplesMean) /
                      (m_GaussianPrecision + scaledNumberSamples);
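        // Rescale each sample by shrinking its log deviation from the
        // updated mean estimate by a factor of scale^(1/2) so that the
        // rescaled samples have approximately unit variance scale.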
        for (std::size_t i = 0; i < scaledSamples.size(); ++i) {
            double scale = scaledSamples[i];
            scaledSamples[i] =
                scale == 1.0 ? samples[i] + m_Offset
                             : std::exp(mean + (std::log(samples[i] + m_Offset) - mean) /
                                                   std::sqrt(scale));
        }

        // Average the weighted square deviations of log(x(i) + z) from the
        // mean over z in [0,1] by Gauss-Legendre quadrature, as discussed
        // above.
        detail::CLogSampleSquareDeviation deviationFunction(scaledSamples, weights,
                                                            logSamplesMean);
        CIntegration::gaussLegendre<CIntegration::OrderFive>(
            deviationFunction, 0.0, 1.0, logSamplesSquareDeviation);
    } else {
        TMeanVarAccumulator logSamplesMoments;
        for (std::size_t i = 0; i < samples.size(); ++i) {
            double x = samples[i] + m_Offset;
            if (x <= 0.0 || !CMathsFuncs::isFinite(x) ||
                !CMathsFuncs::isFinite(weights[i])) {
                LOG_ERROR(<< "Discarding sample = " << x << ", weights = " << weights[i]);
                continue;
            }
            double n = maths_t::countForUpdate(weights[i]);
            double varianceScale = maths_t::seasonalVarianceScale(weights[i]) *
                                   maths_t::countVarianceScale(weights[i]);

            numberSamples += n;
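            // As in the integer case: the variance scale maps the expected
            // log-variance r to t, we shift the log-mean by (r - t) / 2 and
            // we downweight the count by the scale t / r.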
            double t = varianceScale == 1.0
                           ? r
                           : r + std::log(s + varianceScale * (1.0 - s));
            double scale = r == t ? 1.0 : t / r;
            double shift = (r - t) / 2.0;
            logSamplesMoments.add(std::log(x) - shift, n / scale);
        }
        scaledNumberSamples = CBasicStatistics::count(logSamplesMoments);
        logSamplesMean = CBasicStatistics::mean(logSamplesMoments);
        logSamplesSquareDeviation = (scaledNumberSamples - 1.0) *
                                    CBasicStatistics::variance(logSamplesMoments);
    }

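    // Apply the conjugate updates described above, i.e.:
    //   a -> a + n/2
    //   b -> b + 1/2 * (n * var(log(x)) + p * n * (mean(log(x)) - m)^2 / (p + n))
    //   m -> (p * m + n * mean(log(x))) / (p + n)
    //   p -> p + n
    //
    // Note that the raw sample count updates the shape whereas the scale
    // adjusted count updates the other parameters, and that the rate update
    // must use the old values of m and p, so it is applied first.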
    m_GammaShape += 0.5 * numberSamples;
    m_GammaRate += 0.5 * (logSamplesSquareDeviation +
                          m_GaussianPrecision * scaledNumberSamples *
                              CTools::pow2(logSamplesMean - m_GaussianMean) /
                              (m_GaussianPrecision + scaledNumberSamples));

    m_GaussianMean = (m_GaussianPrecision * m_GaussianMean + scaledNumberSamples * logSamplesMean) /
                     (m_GaussianPrecision + scaledNumberSamples);
    m_GaussianPrecision += scaledNumberSamples;

    // If the coefficient of variation of the data is too small we run
    // into numerical problems. We truncate the variation by modeling the
    // impact of an actual variation (standard deviation divided by mean)
    // in the data of size MINIMUM_COEFFICIENT_OF_VARIATION on the prior
    // parameters.

    if (m_GaussianPrecision > 1.5) {
        // The idea is to model the impact of a coefficient of variation
        // equal to MINIMUM_COEFFICIENT_OF_VARIATION on the parameters
        // of the prior this will affect. In particular, this enters in
        // to the gamma rate and shape by its impact on the mean of the
        // log of the samples. Note that:
        //   mean(log(x'(i))) = 1/n * Sum_i{ log(m + (x'(i) - m)) }
        //                    = 1/n * Sum_i{ log(m) + log(1 + d(i) / m) }
        //                    ~ 1/n * Sum_i{ log(m) + d(i) / m - (d(i) / m)^2 / 2 }
        //
        // where d(i) = x'(i) - m, x(i) are our true samples, which are all
        // very nearly m (since their variation is tiny by assumption), and
        // x'(i) are our samples assuming the minimum coefficient of
        // variation. Finally, we note that:
        //   E[d(i)] = 0
        //   E[(d(i) / m)^2] = "coefficient of variation"^2
        //
        // From which we derive the results below.
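        //
        // In particular, forcing extra variation into the model reduces
        // mean(log(x')) by half the extra squared coefficient of variation,
        // which is why the Gaussian mean is adjusted down when the gamma
        // rate is clamped below.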

        double minimumRate = (2.0 * m_GammaShape - 1.0) *
                             CTools::pow2(MINIMUM_COEFFICIENT_OF_VARIATION);

        if (m_GammaRate < minimumRate) {
            double extraVariation = (minimumRate - m_GammaRate) /
                                    (m_GaussianPrecision - 1.0);
            m_GammaRate = minimumRate;
            m_GaussianMean -= 0.5 * extraVariation;
        }
    }

    LOG_TRACE(<< "logSamplesMean = " << logSamplesMean << ", logSamplesSquareDeviation = "
              << logSamplesSquareDeviation << ", numberSamples = " << numberSamples
              << ", scaledNumberSamples = " << scaledNumberSamples);
    LOG_TRACE(<< "m_GammaShape = " << m_GammaShape << ", m_GammaRate = " << m_GammaRate
              << ", m_GaussianMean = " << m_GaussianMean << ", m_GaussianPrecision = "
              << m_GaussianPrecision << ", m_Offset = " << m_Offset);

    if (this->isBad()) {
        LOG_ERROR(<< "Update failed (" << this->debug() << ")");
        LOG_ERROR(<< "samples = " << samples);
        LOG_ERROR(<< "weights = " << weights);
        this->setToNonInformative(this->offsetMargin(), this->decayRate());
    }
}
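
A standalone sanity check of the closed form used for the discrete data
smoothing above. This is an illustrative sketch, not part of the source
file; it uses only the C++ standard library and compares
(x + 1) * log(x + 1) - x * log(x) - 1 against a midpoint rule
approximation of E[log(x + Z)] for Z uniform in [0,1]:

#include <cmath>
#include <iostream>

int main() {
    double x = 10.0;

    // Closed form for the integral of log(x + z) over z in [0,1].
    double closedForm = (x + 1.0) * std::log(x + 1.0) - x * std::log(x) - 1.0;

    // Midpoint rule approximation of the same integral.
    double numeric = 0.0;
    int n = 1000;
    for (int i = 0; i < n; ++i) {
        numeric += std::log(x + (i + 0.5) / n) / n;
    }

    // Both print approximately 2.3510.
    std::cout << closedForm << " vs " << numeric << '\n';
    return 0;
}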