in lib/model/CEventRatePopulationModel.cc [592:784]
bool CEventRatePopulationModel::computeProbability(std::size_t pid,
                                                    core_t::TTime startTime,
                                                    core_t::TTime endTime,
                                                    CPartitioningFields& partitioningFields,
                                                    std::size_t numberAttributeProbabilities,
                                                    SAnnotatedProbability& result) const {
    const CDataGatherer& gatherer = this->dataGatherer();
    core_t::TTime bucketLength = gatherer.bucketLength();

    if (endTime != startTime + bucketLength) {
        LOG_ERROR(<< "Can only compute probability for single bucket");
        return false;
    }
    if (pid >= gatherer.numberPeople()) {
        LOG_TRACE(<< "No person for pid = " << pid);
        return false;
    }

    LOG_TRACE(<< "computeProbability(" << gatherer.personName(pid) << ")");
    using TOptionalStr = std::optional<std::string>;
    using TOptionalStr1Vec = core::CSmallVector<TOptionalStr, 1>;
    using TSizeProbabilityAndInfluenceUMap =
        boost::unordered_map<std::size_t, CProbabilityAndInfluenceCalculator>;
    using TDoubleFeaturePr = std::pair<double, model_t::EFeature>;
    using TDoubleFeaturePrMinAccumulator =
        maths::common::CBasicStatistics::SMin<TDoubleFeaturePr>::TAccumulator;
    using TSizeDoubleFeaturePrMinAccumulatorUMap =
        boost::unordered_map<std::size_t, TDoubleFeaturePrMinAccumulator>;

    static const TOptionalStr1Vec NO_CORRELATED_ATTRIBUTES;
    static const TSizeDoublePr1Vec NO_CORRELATES;
    partitioningFields.add(gatherer.attributeFieldName(), EMPTY_STRING);

    CProbabilityAndInfluenceCalculator pConditionalTemplate(this->params().s_InfluenceCutoff);
    pConditionalTemplate.addAggregator(maths::common::CJointProbabilityOfLessLikelySamples());
    pConditionalTemplate.addAggregator(maths::common::CProbabilityOfExtremeSample());
    if (this->params().s_CacheProbabilities) {
        pConditionalTemplate.addCache(m_Probabilities);
    }
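
    // Conditional probability calculators are maintained per attribute, and the
    // smallest probability feature seen for each attribute is tracked so that
    // the per-attribute result can report which feature was most anomalous.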
    TSizeProbabilityAndInfluenceUMap pConditional;
    TSizeDoubleFeaturePrMinAccumulatorUMap minimumProbabilityFeatures;
    maths::common::CMultinomialConjugate personAttributeProbabilityPrior(m_NewAttributeProbabilityPrior);

    CAnnotatedProbabilityBuilder resultBuilder(
        result, std::max(numberAttributeProbabilities, std::size_t(1)),
        function_t::function(gatherer.features()));
    resultBuilder.attributeProbabilityPrior(&m_AttributeProbabilityPrior);
    resultBuilder.personAttributeProbabilityPrior(&personAttributeProbabilityPrior);
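
    // First pass: for each feature, extract this person's per-attribute data for
    // the bucket and accumulate the feature's probability into the corresponding
    // attribute's conditional calculator.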
    for (std::size_t i = 0; i < gatherer.numberFeatures(); ++i) {
        model_t::EFeature feature = gatherer.feature(i);
        LOG_TRACE(<< "feature = " << model_t::print(feature));
        if (feature == model_t::E_PopulationAttributeTotalCountByPerson) {
            const TSizeSizePrFeatureDataPrVec& data = this->featureData(feature, startTime);
            TSizeSizePr range = personRange(data, pid);
            for (std::size_t j = range.first; j < range.second; ++j) {
                TDouble1Vec category{
                    static_cast<double>(CDataGatherer::extractAttributeId(data[j]))};
                maths_t::TDoubleWeightsAry1Vec weights{maths_t::countWeight(
                    static_cast<double>(CDataGatherer::extractData(data[j]).s_Count))};
                personAttributeProbabilityPrior.addSamples(category, weights);
            }
            continue;
        }
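        // Remaining categorical features are skipped: they make no probability
        // contribution of their own.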
        if (model_t::isCategorical(feature)) {
            continue;
        }

        const TSizeSizePrFeatureDataPrVec& featureData = this->featureData(feature, startTime);
        TSizeSizePr range = personRange(featureData, pid);
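
        // Loop over the attributes for which this person has data in the bucket.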
        for (std::size_t j = range.first; j < range.second; ++j) {
            std::size_t cid = CDataGatherer::extractAttributeId(featureData[j]);
            if (this->shouldSkipUpdate(feature, pid, cid,
                                       model_t::sampleTime(feature, startTime, bucketLength))) {
                result.s_ShouldUpdateQuantiles = false;
            }
            if (this->shouldIgnoreResult(feature, result.s_ResultType, pid, cid,
                                         model_t::sampleTime(feature, startTime, bucketLength))) {
                continue;
            }
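
            // The attribute name is the value for the partitioning field added above.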
            partitioningFields.back().second = TStrCRef(gatherer.attributeName(cid));

            if (this->correlates(feature, pid, cid, startTime)) {
                // TODO
            } else {
                CProbabilityAndInfluenceCalculator::SParams params(partitioningFields);
                if (this->fill(feature, pid, cid, startTime, result.isInterim(),
                               params) == false) {
                    continue;
                }
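
                // Compute the probability of this feature value under the
                // attribute's model and attach any influencer contributions.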
                model_t::CResultType type;
                TSize1Vec mostAnomalousCorrelate;
                if (pConditional.emplace(cid, pConditionalTemplate)
                        .first->second.addProbability(
                            feature, cid, *params.s_Model, params.s_ElapsedTime,
                            params.s_ComputeProbabilityParams, params.s_Time,
                            params.s_Value, params.s_Probability, params.s_Tail,
                            type, mostAnomalousCorrelate)) {
                    LOG_TRACE(<< "P(" << params.describe()
                              << ", attribute = " << gatherer.attributeName(cid)
                              << ", person = " << gatherer.personName(pid)
                              << ") = " << params.s_Probability);

                    CProbabilityAndInfluenceCalculator& calculator =
                        pConditional.emplace(cid, pConditionalTemplate).first->second;
                    const auto& influenceValues =
                        CDataGatherer::extractData(featureData[j]).s_InfluenceValues;
                    for (std::size_t k = 0; k < influenceValues.size(); ++k) {
                        if (const CInfluenceCalculator* influenceCalculator =
                                this->influenceCalculator(feature, k)) {
                            calculator.plugin(*influenceCalculator);
                            calculator.addInfluences(*(gatherer.beginInfluencers() + k),
                                                     influenceValues[k], params);
                        }
                    }

                    minimumProbabilityFeatures[cid].add({params.s_Probability, feature});
                } else {
                    LOG_ERROR(<< "Unable to compute P(" << params.describe()
                              << ", attribute = " << gatherer.attributeName(cid)
                              << ", person = " << gatherer.personName(pid) << ")");
                }
            }
        }
    }
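
    // Second pass: combine each attribute's conditional probability with the
    // unconditional attribute probability, then aggregate the per-attribute
    // results into the person's joint probability.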
    CProbabilityAndInfluenceCalculator pJoint(this->params().s_InfluenceCutoff);
    pJoint.addAggregator(maths::common::CJointProbabilityOfLessLikelySamples());

    for (const auto& pConditional_ : pConditional) {
        std::size_t cid = pConditional_.first;

        CProbabilityAndInfluenceCalculator pPersonAndAttribute(this->params().s_InfluenceCutoff);
        pPersonAndAttribute.addAggregator(
            maths::common::CJointProbabilityOfLessLikelySamples());
        pPersonAndAttribute.add(pConditional_.second);
        double pAttribute = 1.0;
        if (m_AttributeProbabilities.lookup(cid, pAttribute)) {
            pPersonAndAttribute.addProbability(pAttribute);
        }
        LOG_TRACE(<< "P(" << gatherer.attributeName(cid) << ") = " << pAttribute);

        // The idea is we imagine drawing n samples from the person's total
        // attribute set, where n is the size of the person's attribute set,
        // and we weight each sample according to the probability it occurs
        // assuming the attributes are distributed according to the supplied
        // multinomial distribution.
        double w = 1.0;
        double pAttributeGivenPerson;
        if (personAttributeProbabilityPrior.probability(static_cast<double>(cid),
                                                        pAttributeGivenPerson)) {
            w = maths::common::CCategoricalTools::probabilityOfCategory(
                pConditional.size(), pAttributeGivenPerson);
        }
        LOG_TRACE(<< "w = " << w);

        pJoint.add(pPersonAndAttribute, w);
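
        // Report the per-attribute probability against the feature which was most
        // anomalous, i.e. had the smallest probability, for that attribute.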
        auto feature = minimumProbabilityFeatures.find(cid);
        if (feature == minimumProbabilityFeatures.end()) {
            LOG_ERROR(<< "No feature for " << gatherer.attributeName(cid));
        } else {
            double p;
            pPersonAndAttribute.calculate(p);
            resultBuilder.addAttributeProbability(
                cid, gatherer.attributeName(cid), p, model_t::CResultType::E_Unconditional,
                (feature->second)[0].second, NO_CORRELATED_ATTRIBUTES, NO_CORRELATES);
        }
    }
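
    // Finally, compute the person's overall probability together with the
    // influencer contributions and build the annotated result.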
    if (pJoint.empty()) {
        LOG_TRACE(<< "No samples in [" << startTime << "," << endTime << ")");
        return false;
    }

    double p;
    if (!pJoint.calculate(p, result.s_Influences)) {
        LOG_ERROR(<< "Failed to compute probability of " << this->personName(pid));
        return false;
    }
    LOG_TRACE(<< "probability(" << this->personName(pid) << ") = " << p);

    resultBuilder.probability(p);
    resultBuilder.anomalyScoreExplanation() = result.s_AnomalyScoreExplanation;
    resultBuilder.build();

    return true;
}