bool CEventRatePopulationModel::computeProbability()

in lib/model/CEventRatePopulationModel.cc [592:784]


bool CEventRatePopulationModel::computeProbability(std::size_t pid,
                                                   core_t::TTime startTime,
                                                   core_t::TTime endTime,
                                                   CPartitioningFields& partitioningFields,
                                                   std::size_t numberAttributeProbabilities,
                                                   SAnnotatedProbability& result) const {
    const CDataGatherer& gatherer = this->dataGatherer();
    core_t::TTime bucketLength = gatherer.bucketLength();

    if (endTime != startTime + bucketLength) {
        LOG_ERROR(<< "Can only compute probability for single bucket");
        return false;
    }
    if (pid >= gatherer.numberPeople()) {
        LOG_TRACE(<< "No person for pid = " << pid);
        return false;
    }

    LOG_TRACE(<< "computeProbability(" << gatherer.personName(pid) << ")");

    using TOptionalStr = std::optional<std::string>;
    using TOptionalStr1Vec = core::CSmallVector<TOptionalStr, 1>;
    using TSizeProbabilityAndInfluenceUMap =
        boost::unordered_map<std::size_t, CProbabilityAndInfluenceCalculator>;
    using TDoubleFeaturePr = std::pair<double, model_t::EFeature>;
    using TDoubleFeaturePrMinAccumulator =
        maths::common::CBasicStatistics::SMin<TDoubleFeaturePr>::TAccumulator;
    using TSizeDoubleFeaturePrMinAccumulatorUMap =
        boost::unordered_map<std::size_t, TDoubleFeaturePrMinAccumulator>;

    static const TOptionalStr1Vec NO_CORRELATED_ATTRIBUTES;
    static const TSizeDoublePr1Vec NO_CORRELATES;

    partitioningFields.add(gatherer.attributeFieldName(), EMPTY_STRING);

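    // A template calculator which aggregates the per-feature probabilities for
    // a single attribute; pConditional holds one copy per attribute the person
    // was observed with in this bucket.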
    CProbabilityAndInfluenceCalculator pConditionalTemplate(this->params().s_InfluenceCutoff);
    pConditionalTemplate.addAggregator(maths::common::CJointProbabilityOfLessLikelySamples());
    pConditionalTemplate.addAggregator(maths::common::CProbabilityOfExtremeSample());
    if (this->params().s_CacheProbabilities) {
        pConditionalTemplate.addCache(m_Probabilities);
    }
    TSizeProbabilityAndInfluenceUMap pConditional;

    TSizeDoubleFeaturePrMinAccumulatorUMap minimumProbabilityFeatures;

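    // A copy of the new attribute probability prior which is updated below with
    // the attributes the person generated in this bucket and is later used to
    // weight each attribute's contribution to the person's overall probability.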
    maths::common::CMultinomialConjugate personAttributeProbabilityPrior(m_NewAttributeProbabilityPrior);

    CAnnotatedProbabilityBuilder resultBuilder(
        result, std::max(numberAttributeProbabilities, std::size_t(1)),
        function_t::function(gatherer.features()));
    resultBuilder.attributeProbabilityPrior(&m_AttributeProbabilityPrior);
    resultBuilder.personAttributeProbabilityPrior(&personAttributeProbabilityPrior);

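    // First pass: for each feature, compute the probability of each attribute's
    // feature value conditioned on the attribute and gather the corresponding
    // influencer contributions.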
    for (std::size_t i = 0; i < gatherer.numberFeatures(); ++i) {
        model_t::EFeature feature = gatherer.feature(i);
        LOG_TRACE(<< "feature = " << model_t::print(feature));

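        // The attribute total count feature only updates the person's attribute
        // multinomial prior; all other categorical features are skipped.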
        if (feature == model_t::E_PopulationAttributeTotalCountByPerson) {
            const TSizeSizePrFeatureDataPrVec& data = this->featureData(feature, startTime);
            TSizeSizePr range = personRange(data, pid);
            for (std::size_t j = range.first; j < range.second; ++j) {
                TDouble1Vec category{
                    static_cast<double>(CDataGatherer::extractAttributeId(data[j]))};
                maths_t::TDoubleWeightsAry1Vec weights{maths_t::countWeight(
                    static_cast<double>(CDataGatherer::extractData(data[j]).s_Count))};
                personAttributeProbabilityPrior.addSamples(category, weights);
            }
            continue;
        }
        if (model_t::isCategorical(feature)) {
            continue;
        }

        const TSizeSizePrFeatureDataPrVec& featureData = this->featureData(feature, startTime);
        TSizeSizePr range = personRange(featureData, pid);

        for (std::size_t j = range.first; j < range.second; ++j) {
            std::size_t cid = CDataGatherer::extractAttributeId(featureData[j]);

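            // Detection rules may suppress the quantile update and/or discard
            // the result for this (feature, person, attribute) combination.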
            if (this->shouldSkipUpdate(feature, pid, cid,
                                       model_t::sampleTime(feature, startTime, bucketLength))) {
                result.s_ShouldUpdateQuantiles = false;
            }

            if (this->shouldIgnoreResult(feature, result.s_ResultType, pid, cid,
                                         model_t::sampleTime(feature, startTime, bucketLength))) {
                continue;
            }

            partitioningFields.back().second = TStrCRef(gatherer.attributeName(cid));

            if (this->correlates(feature, pid, cid, startTime)) {
                // TODO
            } else {
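                // Univariate case: populate the calculator parameters for this
                // (feature, person, attribute) and skip it if they cannot be
                // filled in.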
                CProbabilityAndInfluenceCalculator::SParams params(partitioningFields);
                if (this->fill(feature, pid, cid, startTime, result.isInterim(),
                               params) == false) {
                    continue;
                }
                model_t::CResultType type;
                TSize1Vec mostAnomalousCorrelate;
                if (pConditional.emplace(cid, pConditionalTemplate)
                        .first->second.addProbability(
                            feature, cid, *params.s_Model, params.s_ElapsedTime,
                            params.s_ComputeProbabilityParams, params.s_Time,
                            params.s_Value, params.s_Probability, params.s_Tail,
                            type, mostAnomalousCorrelate)) {
                    LOG_TRACE(<< "P(" << params.describe()
                              << ", attribute = " << gatherer.attributeName(cid)
                              << ", person = " << gatherer.personName(pid)
                              << ") = " << params.s_Probability);
                    CProbabilityAndInfluenceCalculator& calculator =
                        pConditional.emplace(cid, pConditionalTemplate).first->second;
                    const auto& influenceValues =
                        CDataGatherer::extractData(featureData[j]).s_InfluenceValues;
                    for (std::size_t k = 0; k < influenceValues.size(); ++k) {
                        if (const CInfluenceCalculator* influenceCalculator =
                                this->influenceCalculator(feature, k)) {
                            calculator.plugin(*influenceCalculator);
                            calculator.addInfluences(*(gatherer.beginInfluencers() + k),
                                                     influenceValues[k], params);
                        }
                    }
                    minimumProbabilityFeatures[cid].add({params.s_Probability, feature});
                } else {
                    LOG_ERROR(<< "Unable to compute P(" << params.describe()
                              << ", attribute = " << gatherer.attributeName(cid)
                              << ", person = " << gatherer.personName(pid) << ")");
                }
            }
        }
    }

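    // Second pass: combine the per-attribute probabilities into the person's
    // overall probability, weighting each attribute by how likely the person
    // is to generate it, and record the most anomalous feature per attribute.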
    CProbabilityAndInfluenceCalculator pJoint(this->params().s_InfluenceCutoff);
    pJoint.addAggregator(maths::common::CJointProbabilityOfLessLikelySamples());

    for (const auto& pConditional_ : pConditional) {
        std::size_t cid = pConditional_.first;
        CProbabilityAndInfluenceCalculator pPersonAndAttribute(this->params().s_InfluenceCutoff);
        pPersonAndAttribute.addAggregator(
            maths::common::CJointProbabilityOfLessLikelySamples());
        pPersonAndAttribute.add(pConditional_.second);
        double pAttribute = 1.0;
        if (m_AttributeProbabilities.lookup(cid, pAttribute)) {
            pPersonAndAttribute.addProbability(pAttribute);
        }
        LOG_TRACE(<< "P(" << gatherer.attributeName(cid) << ") = " << pAttribute);

        // The idea is we imagine drawing n samples from the person's total
        // attribute set, where n is the size of the person's attribute set,
        // and we weight each sample according to the probability it occurs
        // assuming the attributes are distributed according to the supplied
        // multinomial distribution.
        double w = 1.0;
        double pAttributeGivenPerson;
        if (personAttributeProbabilityPrior.probability(static_cast<double>(cid),
                                                        pAttributeGivenPerson)) {
            w = maths::common::CCategoricalTools::probabilityOfCategory(
                pConditional.size(), pAttributeGivenPerson);
        }
        LOG_TRACE(<< "w = " << w);

        pJoint.add(pPersonAndAttribute, w);

        auto feature = minimumProbabilityFeatures.find(cid);
        if (feature == minimumProbabilityFeatures.end()) {
            LOG_ERROR(<< "No feature for " << gatherer.attributeName(cid));
        } else {
            double p;
            pPersonAndAttribute.calculate(p);
            resultBuilder.addAttributeProbability(
                cid, gatherer.attributeName(cid), p, model_t::CResultType::E_Unconditional,
                (feature->second)[0].second, NO_CORRELATED_ATTRIBUTES, NO_CORRELATES);
        }
    }

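    // Nothing was added to the joint calculator, i.e. there was no usable
    // feature data for the person in this bucket.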
    if (pJoint.empty()) {
        LOG_TRACE(<< "No samples in [" << startTime << "," << endTime << ")");
        return false;
    }

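    // Compute the person's overall probability, together with the influencer
    // probabilities, and build the annotated result.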
    double p;
    if (!pJoint.calculate(p, result.s_Influences)) {
        LOG_ERROR(<< "Failed to compute probability of " << this->personName(pid));
        return false;
    }
    LOG_TRACE(<< "probability(" << this->personName(pid) << ") = " << p);
    resultBuilder.probability(p);
    resultBuilder.anomalyScoreExplanation() = result.s_AnomalyScoreExplanation;
    resultBuilder.build();

    return true;
}