lib/model/CProbabilityAndInfluenceCalculator.cc (995 lines of code) (raw):

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the following additional limitation. Functionality enabled by the
 * files subject to the Elastic License 2.0 may only be used in production when
 * invoked by an Elasticsearch process with a license key installed that permits
 * use of machine learning features. You may not use this file except in
 * compliance with the Elastic License 2.0 and the foregoing additional
 * limitation.
 */

#include <model/CProbabilityAndInfluenceCalculator.h>

#include <core/CContainerPrinter.h>
#include <core/CLogger.h>
#include <core/CStringUtils.h>

#include <maths/common/CBasicStatistics.h>
#include <maths/common/CModel.h>
#include <maths/common/COrderings.h>
#include <maths/common/CTools.h>

#include <model/CAnnotatedProbabilityBuilder.h>
#include <model/CAnomalyDetectorModelConfig.h>
#include <model/CPartitioningFields.h>

#include <algorithm>
#include <cmath>
#include <limits>
#include <optional>
#include <string>
#include <utility>

namespace ml {
namespace model {
namespace {

using TSize1Vec = CProbabilityAndInfluenceCalculator::TSize1Vec;
using TSize2Vec = CProbabilityAndInfluenceCalculator::TSize2Vec;
using TDouble1Vec = CProbabilityAndInfluenceCalculator::TDouble1Vec;
using TDouble2Vec = CProbabilityAndInfluenceCalculator::TDouble2Vec;
using TDouble2Vec1Vec = CProbabilityAndInfluenceCalculator::TDouble2Vec1Vec;
using TDouble1VecDoublePr = CProbabilityAndInfluenceCalculator::TDouble1VecDoublePr;
using TBool2Vec = CProbabilityAndInfluenceCalculator::TBool2Vec;
using TTime2Vec = CProbabilityAndInfluenceCalculator::TTime2Vec;
using TTime2Vec1Vec = CProbabilityAndInfluenceCalculator::TTime2Vec1Vec;
using TStrCRefDouble1VecDoublePrPr = CProbabilityAndInfluenceCalculator::TStrCRefDouble1VecDoublePrPr;
using TStrCRefDouble1VecDoublePrPrVec = CProbabilityAndInfluenceCalculator::TStrCRefDouble1VecDoublePrPrVec;
using TStrCRefDouble1VecDouble1VecPrPr = CProbabilityAndInfluenceCalculator::TStrCRefDouble1VecDouble1VecPrPr;
using TStrCRefDouble1VecDouble1VecPrPrVec = CProbabilityAndInfluenceCalculator::TStrCRefDouble1VecDouble1VecPrPrVec;
using TOptionalStr = std::optional<std::string>;
using TOptionalStrOptionalStrPr = CProbabilityAndInfluenceCalculator::TOptionalStrOptionalStrPr;
using TOptionalStrOptionalStrPrDoublePr = CProbabilityAndInfluenceCalculator::TOptionalStrOptionalStrPrDoublePr;
using TOptionalStrOptionalStrPrDoublePrVec = CProbabilityAndInfluenceCalculator::TOptionalStrOptionalStrPrDoublePrVec;
using TTail2Vec = core::CSmallVector<maths_t::ETail, 2>;
using TProbabilityCalculation2Vec = core::CSmallVector<maths_t::EProbabilityCalculation, 2>;
using TSizeDoublePr = std::pair<std::size_t, double>;
using TSizeDoublePr1Vec = core::CSmallVector<TSizeDoublePr, 1>;

//! \brief Orders two value influences by decreasing influence.
class CDecreasingValueInfluence {
public:
    CDecreasingValueInfluence(maths_t::ETail tail) : m_Tail(tail) {}

    bool operator()(const TStrCRefDouble1VecDoublePrPr& lhs,
                    const TStrCRefDouble1VecDoublePrPr& rhs) const {
        return m_Tail == maths_t::E_LeftTail ? lhs.second.first < rhs.second.first
                                             : lhs.second.first > rhs.second.first;
    }

private:
    maths_t::ETail m_Tail;
};
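
// Note on the CDecreasing*Influence comparators (value, mean and variance): the
// concrete influence calculators sort the candidate influencer values with these
// comparators so the values expected to carry the most influence are visited
// first. For univariate features doComputeInfluences can then stop early once the
// computed influence drops below the cutoff, assigning the remaining values an
// estimated influence.
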
//! \brief Orders two mean influences by decreasing influence.
class CDecreasingMeanInfluence {
public:
    using TMeanAccumulator = maths::common::CBasicStatistics::SSampleMean<double>::TAccumulator;

public:
    CDecreasingMeanInfluence(maths_t::ETail tail, const TDouble2Vec& value, double count)
        : m_Tail(tail),
          m_Mean(maths::common::CBasicStatistics::momentsAccumulator(count, value[0])) {}

    bool operator()(const TStrCRefDouble1VecDoublePrPr& lhs,
                    const TStrCRefDouble1VecDoublePrPr& rhs) const {
        TMeanAccumulator l = m_Mean - maths::common::CBasicStatistics::momentsAccumulator(
                                          lhs.second.second, lhs.second.first[0]);
        TMeanAccumulator r = m_Mean - maths::common::CBasicStatistics::momentsAccumulator(
                                          rhs.second.second, rhs.second.first[0]);
        double ml = maths::common::CBasicStatistics::mean(l);
        double nl = maths::common::CBasicStatistics::count(l);
        double mr = maths::common::CBasicStatistics::mean(r);
        double nr = maths::common::CBasicStatistics::count(r);
        return m_Tail == maths_t::E_LeftTail
                   ? maths::common::COrderings::lexicographicalCompare(mr, nl, ml, nr)
                   : maths::common::COrderings::lexicographicalCompare(ml, nl, mr, nr);
    }

private:
    maths_t::ETail m_Tail;
    TMeanAccumulator m_Mean;
};

//! \brief Orders two variance influences by decreasing influence.
class CDecreasingVarianceInfluence {
public:
    using TMeanVarAccumulator = maths::common::CBasicStatistics::SSampleMeanVar<double>::TAccumulator;

public:
    CDecreasingVarianceInfluence(maths_t::ETail tail, const TDouble2Vec& value, double count)
        : m_Tail(tail),
          m_Variance(maths::common::CBasicStatistics::momentsAccumulator(count, value[1], value[0])) {}

    bool operator()(const TStrCRefDouble1VecDoublePrPr& lhs,
                    const TStrCRefDouble1VecDoublePrPr& rhs) const {
        TMeanVarAccumulator l =
            m_Variance - maths::common::CBasicStatistics::momentsAccumulator(
                             lhs.second.second, lhs.second.first[1], lhs.second.first[0]);
        TMeanVarAccumulator r =
            m_Variance - maths::common::CBasicStatistics::momentsAccumulator(
                             rhs.second.second, rhs.second.first[1], rhs.second.first[0]);
        double vl = maths::common::CBasicStatistics::maximumLikelihoodVariance(l);
        double nl = maths::common::CBasicStatistics::count(l);
        double vr = maths::common::CBasicStatistics::maximumLikelihoodVariance(r);
        double nr = maths::common::CBasicStatistics::count(r);
        return m_Tail == maths_t::E_LeftTail
                   ? maths::common::COrderings::lexicographicalCompare(vr, nl, vl, nr)
                   : maths::common::COrderings::lexicographicalCompare(vl, nl, vr, nr);
    }

private:
    maths_t::ETail m_Tail;
    TMeanVarAccumulator m_Variance;
};

//! A safe ratio function \p numerator / \p denominator dealing
//! with the case that the numerator and/or denominator are zero.
double ratio(double numerator, double denominator, double zeroDividedByZero) {
    if (denominator == 0.0) {
        if (numerator == 0.0) {
            return zeroDividedByZero;
        }
        return numerator < 0.0 ? -std::numeric_limits<double>::max()
                               : std::numeric_limits<double>::max();
    }
    return numerator / denominator;
}

// Functions to compute influence based on different criteria.

//! \brief Computes the value of summed statistics on the set difference.
class CValueDifference {
public:
    //! Features.
    bool operator()(const TDouble2Vec& v,
                    double /*n*/,
                    const TDouble1Vec& vi,
                    double /*ni*/,
                    maths::common::CModelProbabilityParams& /*params*/,
                    TDouble2Vec& difference) const {
        for (std::size_t i = 0; i < v.size(); ++i) {
            difference[i] = v[i] - vi[i];
        }
        return true;
    }

    //! Correlates.
    bool operator()(const TDouble2Vec& v,
                    const TDouble2Vec& /*n*/,
                    const TDouble1Vec& vi,
                    const TDouble1Vec& /*ni*/,
                    maths::common::CModelProbabilityParams& /*params*/,
                    TDouble2Vec& difference) const {
        for (std::size_t d = 0; d < 2; ++d) {
            difference[d] = v[d] - vi[d];
        }
        return true;
    }
};

//! \brief Computes the value of min, max, dc, etc on the set intersection.
class CValueIntersection {
public:
    //! Features.
    bool operator()(const TDouble2Vec& /*v*/,
                    double /*n*/,
                    const TDouble1Vec& vi,
                    double /*ni*/,
                    maths::common::CModelProbabilityParams& /*params*/,
                    TDouble2Vec& intersection) const {
        for (std::size_t i = 0; i < vi.size(); ++i) {
            intersection[i] = vi[i];
        }
        return true;
    }

    //! Correlates.
    bool operator()(const TDouble2Vec& /*v*/,
                    const TDouble2Vec& /*n*/,
                    const TDouble1Vec& vi,
                    const TDouble1Vec& /*ni*/,
                    maths::common::CModelProbabilityParams& /*params*/,
                    TDouble2Vec& intersection) const {
        for (std::size_t d = 0; d < 2; ++d) {
            intersection[d] = vi[d];
        }
        return true;
    }
};

//! \brief Computes the value of the mean statistic on a set difference.
class CMeanDifference {
public:
    //! Features.
    //!
    //! \param[in] v overall mean
    //! \param[in] n overall count
    //! \param[in] vi influencer mean
    //! \param[in] ni influencer count
    //! \param[out] params model parameters to be updated
    //! \param[out] difference computed mean difference
    bool operator()(const TDouble2Vec& v,
                    double n,
                    const TDouble1Vec& vi,
                    double ni,
                    maths::common::CModelProbabilityParams& params,
                    TDouble2Vec& difference) const {
        if (n <= ni) {
            return false;
        }
        std::size_t dimension = v.size();
        for (std::size_t d = 0; d < dimension; ++d) {
            difference[d] = maths::common::CBasicStatistics::mean(
                maths::common::CBasicStatistics::momentsAccumulator(n, v[d]) -
                maths::common::CBasicStatistics::momentsAccumulator(ni, vi[d]));
        }
        TDouble2Vec scale(dimension, n / (n - ni));
        maths_t::multiplyCountVarianceScale(scale, params.weights()[0]);
        return true;
    }

    //! Correlates.
    bool operator()(const TDouble2Vec& v,
                    const TDouble2Vec& n,
                    const TDouble1Vec& vi,
                    const TDouble1Vec& ni,
                    maths::common::CModelProbabilityParams& params,
                    TDouble2Vec& difference) const {
        if (n <= ni) {
            return false;
        }
        for (std::size_t d = 0; d < 2; ++d) {
            difference[d] = maths::common::CBasicStatistics::mean(
                maths::common::CBasicStatistics::momentsAccumulator(n[d], v[d]) -
                maths::common::CBasicStatistics::momentsAccumulator(ni[d], vi[d]));
        }
        TDouble2Vec scale{n[0] / (n[0] - ni[0]), n[1] / (n[1] - ni[1])};
        maths_t::multiplyCountVarianceScale(scale, params.weights()[0]);
        return true;
    }
};
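
// Note: CMeanDifference (above) and CVarianceDifference (below) both multiply the
// count variance scale of the calculation weights by n / (n - ni). Once the
// influencer's ni measurements are removed, the remaining statistic is supported
// by fewer measurements, so its sampling variability is larger; the scaled weight
// accounts for this when the probability of the remaining statistic is computed.
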
//! \brief Computes the value of the variance statistic on a set difference.
class CVarianceDifference {
public:
    //! Features.
    //!
    //! \param[in] v overall variance and mean
    //! \param[in] n overall count
    //! \param[in] vi influencer variance and mean
    //! \param[in] ni influencer count
    //! \param[out] params model parameters to be updated
    //! \param[out] difference computed variance difference
    bool operator()(const TDouble2Vec& v,
                    double n,
                    const TDouble1Vec& vi,
                    double ni,
                    maths::common::CModelProbabilityParams& params,
                    TDouble2Vec& difference) const {
        if (n <= ni) {
            return false;
        }
        std::size_t dimension = v.size() / 2;
        for (std::size_t d = 0; d < dimension; ++d) {
            difference[d] = maths::common::CBasicStatistics::maximumLikelihoodVariance(
                maths::common::CBasicStatistics::momentsAccumulator(n, v[dimension + d], v[d]) -
                maths::common::CBasicStatistics::momentsAccumulator(ni, vi[dimension + d], vi[d]));
        }
        TDouble2Vec scale(dimension, n / (n - ni));
        maths_t::multiplyCountVarianceScale(scale, params.weights()[0]);
        return true;
    }

    //! Correlates.
    bool operator()(const TDouble2Vec& v,
                    const TDouble2Vec& n,
                    const TDouble1Vec& vi,
                    const TDouble1Vec& ni,
                    maths::common::CModelProbabilityParams& params,
                    TDouble2Vec& difference) const {
        if (n <= ni) {
            return false;
        }
        for (std::size_t d = 0; d < 2; ++d) {
            difference[d] = maths::common::CBasicStatistics::maximumLikelihoodVariance(
                maths::common::CBasicStatistics::momentsAccumulator(n[d], v[2 + d], v[d]) -
                maths::common::CBasicStatistics::momentsAccumulator(ni[d], vi[2 + d], vi[d]));
        }
        TDouble2Vec scale{n[0] / (n[0] - ni[0]), n[1] / (n[1] - ni[1])};
        maths_t::multiplyCountVarianceScale(scale, params.weights()[0]);
        return true;
    }
};

//! Sets all influences to one.
//!
//! \param[in] influencerName The name of the influencer field.
//! \param[in] influencerValues The feature values for the intersection
//! of the records in \p value with distinct values of \p influencerName.
//! \param[out] result Filled in with the influences of \p value.
template<typename INFLUENCER_VALUES>
void doComputeIndicatorInfluences(const TOptionalStr& influencerName,
                                  const INFLUENCER_VALUES& influencerValues,
                                  TOptionalStrOptionalStrPrDoublePrVec& result) {
    result.reserve(influencerValues.size());
    for (const auto& influencerValue : influencerValues) {
        result.emplace_back(std::make_pair(influencerName, influencerValue.first), 1.0);
    }
}
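
// The general influence calculation below works as follows: compute the overall
// probability of the feature value, then, for each distinct influencer value,
// recompute the probability of the feature value with that influencer's
// contribution removed (set difference) or isolated (set intersection), as
// selected by the supplied functor. The influence is a function of the ratio of
// the two log-probabilities; values whose influence falls below the cutoff are
// either dropped or added with an estimated influence.
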
//! The influence calculation for features using \p computeInfluencedParamsAndValue
//! to get the statistics and \p computeInfluence to compute the influences
//! from the corresponding probabilities.
//!
//! \param[in] computeInfluencedParamsAndValue The function to compute
//! the influenced feature value for which to compute the probability.
//! \param[in] computeInfluence The function to compute influence.
//! \param[in] model The model to use to compute the probability.
//! \param[in] elapsedTime The time elapsed since the model was created.
//! \param[in] computeProbabilityParams The parameters needed to compute
//! the probability.
//! \param[in] time The time of \p value.
//! \param[in] value The influenced feature value.
//! \param[in] count The measurement count in \p value.
//! \param[in] influencerName The name of the influencer field.
//! \param[in] influencerValues The feature values for the intersection
//! of the records in \p value with distinct values of \p influencerName.
//! \param[in] cutoff The value at which there is no influence.
//! \param[in] includeCutoff If true then add in values for influences
//! less than the cutoff with estimated influence.
//! \param[out] result Filled in with the influences of \p value.
template<typename COMPUTE_INFLUENCED_VALUE, typename COMPUTE_INFLUENCE>
void doComputeInfluences(model_t::EFeature feature,
                         COMPUTE_INFLUENCED_VALUE computeInfluencedParamsAndValue,
                         COMPUTE_INFLUENCE computeInfluence,
                         const maths::common::CModel& model,
                         core_t::TTime elapsedTime,
                         maths::common::CModelProbabilityParams& computeProbabilityParams,
                         const TTime2Vec1Vec& time,
                         const TDouble2Vec& value,
                         double count,
                         const TOptionalStr& influencerName,
                         const TStrCRefDouble1VecDoublePrPrVec& influencerValues,
                         double cutoff,
                         bool includeCutoff,
                         TOptionalStrOptionalStrPrDoublePrVec& result) {
    auto description = [&influencerName](const std::string& v) {
        return std::make_pair(influencerName, v);
    };

    if (influencerValues.size() == 1) {
        result.emplace_back(description(influencerValues[0].first), 1.0);
        return;
    }

    auto probability = [feature, elapsedTime](const maths::common::SModelProbabilityResult& r) {
        double p{r.s_Probability};
        p = maths::common::CTools::truncate(
            p, maths::common::CTools::smallestProbability(), 1.0);
        return model_t::adjustProbability(feature, elapsedTime, p);
    };

    maths_t::TDouble2VecWeightsAry1Vec weights(computeProbabilityParams.weights());
    computeProbabilityParams.weights(weights).useMultibucketFeatures(false).useAnomalyModel(false);

    maths::common::SModelProbabilityResult overallResult;
    model.probability(computeProbabilityParams, time,
                      model_t::stripExtraStatistics(feature, {value}), overallResult);
    double overallProbability{probability(overallResult)};

    if (overallProbability == 1.0) {
        doComputeIndicatorInfluences(influencerName, influencerValues, result);
        return;
    }

    result.reserve(influencerValues.size());

    double logOverallProbability{maths::common::CTools::fastLog(overallProbability)};

    // Declared outside the loop to minimize the number of times they are created.
    std::size_t dimension = model_t::dimension(feature);
    TDouble2Vec1Vec influencedValue{TDouble2Vec(dimension)};
    maths::common::SModelProbabilityResult influenceResult;

    for (auto i = influencerValues.begin(); i != influencerValues.end(); ++i) {
        const auto& influenceValue = i->second.first;
        const auto& influenceCount = i->second.second;

        computeProbabilityParams.weights(weights);
        if (computeInfluencedParamsAndValue(value, count, influenceValue, influenceCount,
                                            computeProbabilityParams,
                                            influencedValue[0]) == false) {
            LOG_ERROR(<< "Failed to compute influencer value (value = " << value
                      << " , count = " << count << " , influencer value = " << i->second.first
                      << " , influencer count = " << i->second.second << ")");
            continue;
        }
        if (model.probability(computeProbabilityParams, time, influencedValue,
                              influenceResult) == false) {
            LOG_ERROR(<< "Failed to compute P(" << influencedValue[0]
                      << " | influencer = " << *i << ")");
            continue;
        }

        double influenceProbability{probability(influenceResult)};
        double logInfluenceProbability{maths::common::CTools::fastLog(influenceProbability)};
        double influence{computeInfluence(logOverallProbability, logInfluenceProbability)};

        LOG_TRACE(<< "log(p) = " << logOverallProbability << ", v(i) = " << influencedValue
                  << ", log(p(i)) = " << logInfluenceProbability << ", weight = " << weights
                  << ", influence = " << influence
                  << ", influencer field value = " << i->first.get());

        if (dimension == 1 && influence >= cutoff) {
            result.emplace_back(description(i->first), influence);
        } else if (dimension == 1) {
            if (includeCutoff) {
                result.emplace_back(description(i->first), influence);
                for (++i; i != influencerValues.end(); ++i) {
                    result.emplace_back(description(i->first), 0.5 * influence);
                }
            }
            break;
        } else if (influence >= cutoff) {
            result.emplace_back(description(i->first), influence);
        } else if (includeCutoff) {
            result.emplace_back(description(i->first), 0.5 * influence);
        }
    }
}

//! Implement the influence calculation for correlates of univariate
//! features using \p computeInfluencedValue to get the statistics and
//! \p computeInfluence to compute the influences from the corresponding
//! probabilities.
template<typename COMPUTE_INFLUENCED_VALUE, typename COMPUTE_INFLUENCE>
void doComputeCorrelateInfluences(model_t::EFeature feature,
                                  COMPUTE_INFLUENCED_VALUE computeInfluencedValue,
                                  COMPUTE_INFLUENCE computeInfluence,
                                  const maths::common::CModel& model,
                                  core_t::TTime elapsedTime,
                                  maths::common::CModelProbabilityParams& computeProbabilityParams,
                                  const TTime2Vec& time,
                                  const TDouble2Vec& value,
                                  const TDouble2Vec& count,
                                  const TOptionalStr& influencerName,
                                  const TStrCRefDouble1VecDouble1VecPrPrVec& influencerValues,
                                  double cutoff,
                                  bool includeCutoff,
                                  TOptionalStrOptionalStrPrDoublePrVec& result) {
    auto description = [&influencerName](const std::string& v) {
        return std::make_pair(influencerName, v);
    };

    auto probability = [feature, elapsedTime](const maths::common::SModelProbabilityResult& r) {
        double p{r.s_Probability};
        p = maths::common::CTools::truncate(
            p, maths::common::CTools::smallestProbability(), 1.0);
        return model_t::adjustProbability(feature, elapsedTime, p);
    };

    if (influencerValues.size() == 1) {
        result.emplace_back(description(influencerValues[0].first), 1.0);
        return;
    }

    maths_t::TDouble2VecWeightsAry1Vec weights(computeProbabilityParams.weights());
    computeProbabilityParams.weights(weights).useMultibucketFeatures(false).useAnomalyModel(false);

    maths::common::SModelProbabilityResult overallResult;
    model.probability(computeProbabilityParams, {time},
                      model_t::stripExtraStatistics(feature, {value}), overallResult);
    double overallProbability{probability(overallResult)};

    if (overallProbability == 1.0) {
        doComputeIndicatorInfluences(influencerName, influencerValues, result);
        return;
    }

    result.reserve(influencerValues.size());

    double logOverallProbability{maths::common::CTools::fastLog(overallProbability)};

    // Declared outside the loop to minimize the number of times they are created.
    TDouble2Vec1Vec influencedValue{TDouble2Vec(2)};
    maths::common::SModelProbabilityResult influenceResult;

    for (const auto& i : influencerValues) {
        const auto& influenceValue = i.second.first;
        const auto& influenceCount = i.second.second;

        computeProbabilityParams.weights(weights);
        computeInfluencedValue(value, count, influenceValue, influenceCount,
                               computeProbabilityParams, influencedValue[0]);
        if (model.probability(computeProbabilityParams, {time}, influencedValue,
                              influenceResult) == false) {
            LOG_ERROR(<< "Failed to compute P(" << influencedValue
                      << " | influencer = " << i << ")");
            continue;
        }

        double influenceProbability{probability(influenceResult)};
        double logInfluenceProbability{maths::common::CTools::fastLog(influenceProbability)};
        double influence{computeInfluence(logOverallProbability, logInfluenceProbability)};

        LOG_TRACE(<< "log(p) = " << logOverallProbability << ", v(i) = " << influencedValue
                  << ", log(p(i)) = " << logInfluenceProbability << ", weight = " << weights
                  << ", influence = " << influence
                  << ", influencer field value = " << i.first.get());

        if (includeCutoff || influence >= cutoff) {
            result.emplace_back(description(i.first), influence);
        }
    }
}
}

CProbabilityAndInfluenceCalculator::CProbabilityAndInfluenceCalculator(double cutoff)
    : m_Cutoff(cutoff), m_InfluenceCalculator(nullptr),
      m_ProbabilityTemplate(CModelTools::CProbabilityAggregator::E_Min),
      m_Probability(CModelTools::CProbabilityAggregator::E_Min),
      m_ExplainingProbabilities{
          {maths::common::SModelProbabilityResult::E_SingleBucketProbability,
           {CModelTools::CProbabilityAggregator::E_Min}},
          {maths::common::SModelProbabilityResult::E_MultiBucketProbability,
           {CModelTools::CProbabilityAggregator::E_Min}}},
      m_ProbabilityCache(nullptr) {
}

bool CProbabilityAndInfluenceCalculator::empty() const {
    return m_Probability.empty();
}

double CProbabilityAndInfluenceCalculator::cutoff() const {
    return m_Cutoff;
}

void CProbabilityAndInfluenceCalculator::plugin(const CInfluenceCalculator& influenceCalculator) {
    m_InfluenceCalculator = &influenceCalculator;
}

void CProbabilityAndInfluenceCalculator::addAggregator(
    const maths::common::CJointProbabilityOfLessLikelySamples& aggregator) {
    m_ProbabilityTemplate.add(aggregator);
    m_Probability.add(aggregator);
    for (auto& ep : m_ExplainingProbabilities) {
        ep.second.add(aggregator);
    }
}

void CProbabilityAndInfluenceCalculator::addAggregator(
    const maths::common::CProbabilityOfExtremeSample& aggregator) {
    m_ProbabilityTemplate.add(aggregator);
    m_Probability.add(aggregator);
    for (auto& ep : m_ExplainingProbabilities) {
        ep.second.add(aggregator);
    }
}

void CProbabilityAndInfluenceCalculator::addCache(CModelTools::CProbabilityCache& cache) {
    m_ProbabilityCache = &cache;
}
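
// add() merges the state of another calculator into this one: the other
// calculator's aggregate, explaining and per-influencer probabilities are added
// with the supplied weight, and if the other calculator is more anomalous (its
// overall probability is smaller) its anomaly score explanation is adopted.
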
void CProbabilityAndInfluenceCalculator::add(const CProbabilityAndInfluenceCalculator& other,
                                             double weight) {
    double p = 0.0;
    if (!other.m_Probability.calculate(p)) {
        return;
    }
    double pThis{1.0};
    m_Probability.calculate(pThis);
    double pOther{p};
    if (!other.m_Probability.empty()) {
        m_Probability.add(p, weight);
    }
    for (const auto& ep : other.m_ExplainingProbabilities) {
        if (ep.second.calculate(p) && !ep.second.empty()) {
            auto ret = m_ExplainingProbabilities.insert(ep);
            if (ret.second == false) {
                ret.first->second.add(p, weight);
            }
        }
    }
    for (const auto& aggregator : other.m_InfluencerProbabilities) {
        if (aggregator.second.calculate(p)) {
            auto& aggregator_ = m_InfluencerProbabilities
                                    .emplace(aggregator.first, other.m_ProbabilityTemplate)
                                    .first->second;
            if (!aggregator.second.empty()) {
                aggregator_.add(p, weight);
            }
        }
    }
    if (pOther < pThis) {
        m_AnomalyScoreExplanation = other.m_AnomalyScoreExplanation;
    }
}

bool CProbabilityAndInfluenceCalculator::addAttributeProbability(const TOptionalStr& attribute,
                                                                 std::size_t cid,
                                                                 SParams& params,
                                                                 CAnnotatedProbabilityBuilder& builder,
                                                                 double weight) {
    model_t::CResultType type;
    TSize1Vec mostAnomalousCorrelate;
    if (this->addProbability(params.s_Feature, cid, *params.s_Model, params.s_ElapsedTime,
                             params.s_ComputeProbabilityParams, params.s_Time,
                             params.s_Value, params.s_Probability, params.s_Tail,
                             type, mostAnomalousCorrelate, weight)) {
        static const TOptionalStr1Vec NO_CORRELATED_ATTRIBUTES;
        static const TSizeDoublePr1Vec NO_CORRELATES;
        builder.addAttributeProbability(cid, attribute, params.s_Probability,
                                        model_t::CResultType::E_Unconditional, params.s_Feature,
                                        NO_CORRELATED_ATTRIBUTES, NO_CORRELATES);
        return true;
    }
    return false;
}

bool CProbabilityAndInfluenceCalculator::addAttributeProbability(const TOptionalStr& attribute,
                                                                 std::size_t cid,
                                                                 SCorrelateParams& params,
                                                                 CAnnotatedProbabilityBuilder& builder,
                                                                 double weight) {
    model_t::CResultType type;
    params.s_MostAnomalousCorrelate.clear();
    if (this->addProbability(params.s_Feature, cid, *params.s_Model, params.s_ElapsedTime,
                             params.s_ComputeProbabilityParams, params.s_Times,
                             params.s_Values, params.s_Probability, params.s_Tail,
                             type, params.s_MostAnomalousCorrelate, weight)) {
        TOptionalStr1Vec correlatedLabels_;
        TSizeDoublePr1Vec correlated_;
        if (!params.s_MostAnomalousCorrelate.empty()) {
            std::size_t i = params.s_MostAnomalousCorrelate[0];
            correlatedLabels_.push_back(params.s_CorrelatedLabels[i]);
            correlated_.emplace_back(params.s_Correlated[i],
                                     params.s_Values[i][params.s_Variables[i][1]]);
        }
        builder.addAttributeProbability(cid, attribute, params.s_Probability, type,
                                        params.s_Feature, correlatedLabels_, correlated_);
        return true;
    }
    return false;
}

bool CProbabilityAndInfluenceCalculator::addProbability(
    model_t::EFeature feature,
    std::size_t id,
    const maths::common::CModel& model,
    core_t::TTime elapsedTime,
    const maths::common::CModelProbabilityParams& computeProbabilityParams,
    const TTime2Vec1Vec& time,
    const TDouble2Vec1Vec& values_,
    double& probability,
    TTail2Vec& tail,
    model_t::CResultType& type,
    TSize1Vec& mostAnomalousCorrelate,
    double weight) {

    if (values_.empty()) {
        return false;
    }

    auto readResult = [&](const maths::common::SModelProbabilityResult& result) {
        for (const auto& fp : result.s_FeatureProbabilities) {
            auto itr = m_ExplainingProbabilities.find(fp.s_Label);
            if (itr != m_ExplainingProbabilities.end()) {
                double featureProbability = fp.s_Probability;
                featureProbability =
                    model_t::adjustProbability(feature, elapsedTime, featureProbability);
                itr->second.add(featureProbability, weight);
            }
        }

        probability = result.s_Probability;
        probability = model_t::adjustProbability(feature, elapsedTime, probability);
        tail = std::move(result.s_Tail);
        type.set(result.s_Conditional ? model_t::CResultType::E_Conditional
                                      : model_t::CResultType::E_Unconditional);
        mostAnomalousCorrelate = std::move(result.s_MostAnomalousCorrelate);
        m_Probability.add(probability, weight);
        m_AnomalyScoreExplanation = result.s_AnomalyScoreExplanation;
    };

    // Check the cache.
    if (model_t::isConstant(feature) == false && m_ProbabilityCache) {
        TDouble2Vec1Vec values(model_t::stripExtraStatistics(feature, values_));
        model.detrend(time, computeProbabilityParams.seasonalConfidenceInterval(), values);
        maths::common::SModelProbabilityResult cached;
        if (m_ProbabilityCache->lookup(feature, id, values, cached)) {
            readResult(cached);
            return true;
        }
    }

    // Either there isn't a cache or the accuracy isn't good enough
    // so fall back to calculating.
    TDouble2Vec1Vec values(model_t::stripExtraStatistics(feature, values_));
    maths::common::SModelProbabilityResult result;
    if (model.probability(computeProbabilityParams, time, values, result)) {
        if (model_t::isConstant(feature) == false) {
            readResult(result);
            if (m_ProbabilityCache) {
                m_ProbabilityCache->addModes(feature, id, model);
                m_ProbabilityCache->addProbability(feature, id, values, result);
            }
        } else {
            probability = result.s_Probability;
            m_AnomalyScoreExplanation = result.s_AnomalyScoreExplanation;
            tail = std::move(result.s_Tail);
            type.set(model_t::CResultType::E_Unconditional);
            mostAnomalousCorrelate.clear();
        }
        return true;
    }

    return false;
}

void CProbabilityAndInfluenceCalculator::addProbability(double probability, double weight) {
    m_Probability.add(probability, weight);
    for (auto& aggregator : m_InfluencerProbabilities) {
        aggregator.second.add(probability, weight);
    }
}

void CProbabilityAndInfluenceCalculator::addInfluences(const std::string& influencerName,
                                                       const TStrCRefDouble1VecDoublePrPrVec& influencerValues,
                                                       SParams& params,
                                                       double weight) {
    if (!m_InfluenceCalculator) {
        LOG_ERROR(<< "No influence calculator plug-in: can't compute influence");
        return;
    }

    const std::string* influencerValue = nullptr;
    if (influencerValues.empty()) {
        for (std::size_t i = 0; i < params.s_PartitioningFields.size(); ++i) {
            if (params.s_PartitioningFields[i].first.get() == influencerName) {
                influencerValue = &(params.s_PartitioningFields[i].second.get());
                break;
            }
        }
        if (!influencerValue) {
            return;
        }
    }

    double logp = std::log(
        std::max(params.s_Probability, maths::common::CTools::smallestProbability()));

    params.s_InfluencerName = influencerName;
    params.s_InfluencerValues = influencerValues;
    params.s_Cutoff = 0.5 / std::max(-logp, 1.0);
    params.s_IncludeCutoff = true;

    m_InfluenceCalculator->computeInfluences(params);

    m_Influences.swap(params.s_Influences);
    if (m_Influences.empty() && influencerValue) {
        m_Influences.emplace_back(
            std::make_pair(params.s_InfluencerName, *influencerValue), 1.0);
    }

    this->commitInfluences(params.s_Feature, logp, weight);
}

void CProbabilityAndInfluenceCalculator::addInfluences(const std::string& influencerName,
                                                       const TStrCRefDouble1VecDouble1VecPrPrVecVec& influencerValues,
                                                       SCorrelateParams& params,
                                                       double weight) {
    if (!m_InfluenceCalculator) {
        LOG_ERROR(<< "No influence calculator plug-in: can't compute influence");
        return;
    }

    const std::string* influencerValue = nullptr;
    if (influencerValues.empty()) {
        for (std::size_t i = 0; i < params.s_PartitioningFields.size(); ++i) {
            if (params.s_PartitioningFields[i].first.get() == influencerName) {
                influencerValue = &(params.s_PartitioningFields[i].second.get());
                break;
            }
        }
        if (!influencerValue) {
            return;
        }
    }

    double logp = std::log(
        std::max(params.s_Probability, maths::common::CTools::smallestProbability()));

    params.s_InfluencerName = influencerName;
    params.s_InfluencerValues = influencerValues[params.s_MostAnomalousCorrelate[0]];
    params.s_Cutoff = 0.5 / std::max(-logp, 1.0);
    params.s_IncludeCutoff = true;

    m_InfluenceCalculator->computeInfluences(params);

    m_Influences.swap(params.s_Influences);
    if (m_Influences.empty() && influencerValue) {
        m_Influences.emplace_back(
            std::make_pair(params.s_InfluencerName, *influencerValue), 1.0);
    }

    this->commitInfluences(params.s_Feature, logp, weight);
}

bool CProbabilityAndInfluenceCalculator::calculate(double& probability) const {
    return m_Probability.calculate(probability);
}

bool CProbabilityAndInfluenceCalculator::calculateExplainingProbabilities(
    TFeatureProbabilityLabelDoubleUMap& explainingProbabilities) const {
    double probability{0.0};
    for (const auto& ep : m_ExplainingProbabilities) {
        if (!ep.second.calculate(probability)) {
            return false;
        } else {
            explainingProbabilities.emplace(ep.first, probability);
        }
    }
    return true;
}

bool CProbabilityAndInfluenceCalculator::calculateMultiBucketImpact(double& multiBucketImpact) const {
    TFeatureProbabilityLabelDoubleUMap explainingProbabilities;
    if (!this->calculateExplainingProbabilities(explainingProbabilities)) {
        LOG_INFO(<< "Failed to compute explaining probabilities");
        return false;
    }

    double sbProbability =
        explainingProbabilities[maths::common::SModelProbabilityResult::E_SingleBucketProbability];
    double mbProbability =
        explainingProbabilities[maths::common::SModelProbabilityResult::E_MultiBucketProbability];

    double ls = std::log(
        std::max(sbProbability, ml::maths::common::CTools::smallestProbability()));
    double lm = std::log(
        std::max(mbProbability, ml::maths::common::CTools::smallestProbability()));

    double scale = CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE *
                   std::min(ls, lm) / std::min(std::max(ls, lm), -0.001) / std::log(1000);

    multiBucketImpact = std::max(
        std::min(scale * (ls - lm),
                 CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE),
        -1.0 * CAnomalyDetectorModelConfig::MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE);

    multiBucketImpact = (std::floor(multiBucketImpact * 100.0)) / 100.0;
    return true;
}

bool CProbabilityAndInfluenceCalculator::calculate(double& probability,
                                                   TOptionalStrOptionalStrPrDoublePrVec& influences) const {
    if (!m_Probability.calculate(probability)) {
        return false;
    }

    LOG_TRACE(<< "probability = " << probability);

    if (m_InfluencerProbabilities.empty()) {
        LOG_TRACE(<< "no influencers");
        return true;
    }

    double logp = std::log(probability);

    influences.reserve(m_InfluencerProbabilities.size());
    for (const auto& aggregator : m_InfluencerProbabilities) {
        double probability_;
        if (!aggregator.second.calculate(probability_)) {
            LOG_ERROR(<< "Couldn't calculate probability for influencer " << aggregator.first);
        }
        LOG_TRACE(<< "influence probability = " << probability_);
        double influence =
            CInfluenceCalculator::intersectionInfluence(logp, std::log(probability_));
        if (influence >= m_Cutoff) {
            influences.emplace_back(aggregator.first, influence);
        }
    }

    std::sort(influences.begin(), influences.end(),
              maths::common::COrderings::SSecondGreater());

    return true;
}

void CProbabilityAndInfluenceCalculator::commitInfluences(model_t::EFeature feature,
                                                          double logp,
                                                          double weight) {
    LOG_TRACE(<< "influences = " << m_Influences);

    for (const auto& influence : m_Influences) {
        CModelTools::CProbabilityAggregator& aggregator =
            m_InfluencerProbabilities.emplace(influence.first, m_ProbabilityTemplate)
                .first->second;
        if (!model_t::isConstant(feature)) {
            double probability = std::exp(influence.second * logp);
            LOG_TRACE(<< "Adding '" << *influence.first.second
                      << "', probability = " << probability
                      << ", influence = " << influence.second);
            aggregator.add(probability, weight);
        }
    }
}

CProbabilityAndInfluenceCalculator::SParams::SParams(const CPartitioningFields& partitioningFields)
    : s_Feature(), s_Model(nullptr), s_ElapsedTime(0), s_Count(0.0), s_Probability(1.0),
      s_PartitioningFields(partitioningFields), s_Cutoff(1.0), s_IncludeCutoff(false) {
}

std::string CProbabilityAndInfluenceCalculator::SParams::describe() const {
    return core::CContainerPrinter::print(s_Value) + " | feature = " + model_t::print(s_Feature) +
           ", @ " + core::CContainerPrinter::print(s_Time) +
           ", elapsedTime = " + core::CStringUtils::typeToString(s_ElapsedTime);
}

CProbabilityAndInfluenceCalculator::SCorrelateParams::SCorrelateParams(const CPartitioningFields& partitioningFields)
    : s_Feature(), s_Model(nullptr), s_ElapsedTime(0), s_Probability(1.0),
      s_PartitioningFields(partitioningFields), s_Cutoff(1.0), s_IncludeCutoff(false) {
}

std::string CProbabilityAndInfluenceCalculator::SCorrelateParams::describe() const {
    return core::CContainerPrinter::print(s_Values) + " | feature = " + model_t::print(s_Feature) +
           ", @ " + core::CContainerPrinter::print(s_Times) +
           ", elapsedTime = " + core::CStringUtils::typeToString(s_ElapsedTime);
}
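
// The remainder of this file implements CInfluenceCalculator and its subclasses.
// The model-based calculators score each influencer value by recomputing the
// feature value's probability with that influencer's contribution either removed
// (set difference) or isolated (set intersection) and comparing log-probabilities:
//   - complementInfluence(log(p), log(p(i))) = 1 - log(p(i)) / log(p), truncated
//     to [0, 1], is used by CLogProbabilityComplementInfluenceCalculator,
//     CMeanInfluenceCalculator and CVarianceInfluenceCalculator;
//   - intersectionInfluence(log(p), log(p(i))) = log(p(i)) / log(p), truncated
//     to [0, 1], is used by CLogProbabilityInfluenceCalculator.
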
////// CInfluenceCalculator //////

CInfluenceCalculator::~CInfluenceCalculator() {
}

double CInfluenceCalculator::intersectionInfluence(double logp, double logpi) {
    return maths::common::CTools::truncate(ratio(logpi, logp, 1.0), 0.0, 1.0);
}

double CInfluenceCalculator::complementInfluence(double logp, double logpi) {
    return maths::common::CTools::truncate(1.0 - ratio(logpi, logp, 0.0), 0.0, 1.0);
}

////// CInfluenceUnavailableCalculator //////

void CInfluenceUnavailableCalculator::computeInfluences(TParams& params) const {
    params.s_Influences.clear();
}

void CInfluenceUnavailableCalculator::computeInfluences(TCorrelateParams& params) const {
    params.s_Influences.clear();
}

////// CIndicatorInfluenceCalculator //////

void CIndicatorInfluenceCalculator::computeInfluences(TParams& params) const {
    params.s_Influences.clear();
    doComputeIndicatorInfluences(params.s_InfluencerName, params.s_InfluencerValues,
                                 params.s_Influences);
}

void CIndicatorInfluenceCalculator::computeInfluences(TCorrelateParams& params) const {
    params.s_Influences.clear();
    doComputeIndicatorInfluences(params.s_InfluencerName, params.s_InfluencerValues,
                                 params.s_Influences);
}

////// CLogProbabilityComplementInfluenceCalculator //////

void CLogProbabilityComplementInfluenceCalculator::computeInfluences(TParams& params) const {
    params.s_Influences.clear();

    maths::common::CModelProbabilityParams computeProbabilityParams;
    for (std::size_t i = 0; i < params.s_Tail.size(); ++i) {
        if (params.s_Tail[i] == maths_t::E_RightTail) {
            computeProbabilityParams.addCalculation(maths_t::E_OneSidedAbove).addCoordinate(i);
        }
    }

    if (computeProbabilityParams.calculations() > 0) {
        computeProbabilityParams
            .seasonalConfidenceInterval(
                params.s_ComputeProbabilityParams.seasonalConfidenceInterval())
            .addWeights(params.s_ComputeProbabilityParams.weights()[0]);

        TStrCRefDouble1VecDoublePrPrVec& influencerValues{params.s_InfluencerValues};
        if (model_t::dimension(params.s_Feature) == 1) {
            std::sort(influencerValues.begin(), influencerValues.end(),
                      CDecreasingValueInfluence(params.s_Tail[0]));
        }
        LOG_TRACE(<< "influencerValues = " << influencerValues);

        doComputeInfluences(params.s_Feature, CValueDifference(), complementInfluence,
                            *params.s_Model, params.s_ElapsedTime, computeProbabilityParams,
                            params.s_Time, params.s_Value[0], params.s_Count,
                            params.s_InfluencerName, params.s_InfluencerValues,
                            params.s_Cutoff, params.s_IncludeCutoff, params.s_Influences);
    }
}
void CLogProbabilityComplementInfluenceCalculator::computeInfluences(TCorrelateParams& params) const {
    params.s_Influences.clear();

    if (params.s_Tail[0] == maths_t::E_RightTail) {
        std::size_t correlate{params.s_MostAnomalousCorrelate[0]};
        maths::common::CModelProbabilityParams computeProbabilityParams;
        computeProbabilityParams.addCalculation(maths_t::E_OneSidedAbove)
            .seasonalConfidenceInterval(
                params.s_ComputeProbabilityParams.seasonalConfidenceInterval())
            .addWeights(params.s_ComputeProbabilityParams.weights()[correlate])
            .mostAnomalousCorrelate(correlate);
        LOG_TRACE(<< "influencerValues = " << params.s_InfluencerValues);

        doComputeCorrelateInfluences(
            params.s_Feature, CValueDifference(), complementInfluence, *params.s_Model,
            params.s_ElapsedTime, computeProbabilityParams, params.s_Times[correlate],
            params.s_Values[correlate], params.s_Counts[correlate], params.s_InfluencerName,
            params.s_InfluencerValues, params.s_Cutoff, params.s_IncludeCutoff,
            params.s_Influences);
    }
}

////// CLogProbabilityInfluenceCalculator //////

namespace {
//! Maybe add \p coordinate and the appropriate calculation to \p params.
void addCoordinate(maths_t::ETail tail,
                   std::size_t coordinate,
                   maths::common::CModelProbabilityParams& params) {
    switch (tail) {
    case maths_t::E_LeftTail: {
        params.addCalculation(maths_t::E_OneSidedBelow).addCoordinate(coordinate);
        break;
    }
    case maths_t::E_RightTail: {
        params.addCalculation(maths_t::E_OneSidedAbove).addCoordinate(coordinate);
        break;
    }
    case maths_t::E_MixedOrNeitherTail:
    case maths_t::E_UndeterminedTail:
        break;
    }
}
}

void CLogProbabilityInfluenceCalculator::computeInfluences(TParams& params) const {
    params.s_Influences.clear();

    maths::common::CModelProbabilityParams computeProbabilityParams;
    for (std::size_t i = 0; i < params.s_Tail.size(); ++i) {
        addCoordinate(params.s_Tail[i], i, computeProbabilityParams);
    }

    if (computeProbabilityParams.calculations() > 0) {
        computeProbabilityParams
            .seasonalConfidenceInterval(
                params.s_ComputeProbabilityParams.seasonalConfidenceInterval())
            .addWeights(params.s_ComputeProbabilityParams.weights()[0]);

        TStrCRefDouble1VecDoublePrPrVec& influencerValues{params.s_InfluencerValues};
        if (model_t::dimension(params.s_Feature) == 1) {
            std::sort(influencerValues.begin(), influencerValues.end(),
                      CDecreasingValueInfluence(params.s_Tail[0]));
        }
        LOG_TRACE(<< "influencerValues = " << influencerValues);

        doComputeInfluences(params.s_Feature, CValueIntersection(), intersectionInfluence,
                            *params.s_Model, params.s_ElapsedTime, computeProbabilityParams,
                            params.s_Time, params.s_Value[0], params.s_Count,
                            params.s_InfluencerName, params.s_InfluencerValues,
                            params.s_Cutoff, params.s_IncludeCutoff, params.s_Influences);
    }
}

void CLogProbabilityInfluenceCalculator::computeInfluences(TCorrelateParams& params) const {
    params.s_Influences.clear();

    maths::common::CModelProbabilityParams computeProbabilityParams;
    addCoordinate(params.s_Tail[0], 0, computeProbabilityParams);

    if (computeProbabilityParams.calculations() > 0) {
        std::size_t correlate{params.s_MostAnomalousCorrelate[0]};
        computeProbabilityParams
            .seasonalConfidenceInterval(
                params.s_ComputeProbabilityParams.seasonalConfidenceInterval())
            .addWeights(params.s_ComputeProbabilityParams.weights()[correlate])
            .mostAnomalousCorrelate(correlate);
        LOG_TRACE(<< "influencerValues = " << params.s_InfluencerValues);

        doComputeCorrelateInfluences(
            params.s_Feature, CValueDifference(), intersectionInfluence, *params.s_Model,
            params.s_ElapsedTime, computeProbabilityParams, params.s_Times[correlate],
            params.s_Values[correlate], params.s_Counts[correlate], params.s_InfluencerName,
            params.s_InfluencerValues, params.s_Cutoff, params.s_IncludeCutoff,
            params.s_Influences);
    }
}

////// CMeanInfluenceCalculator //////

void CMeanInfluenceCalculator::computeInfluences(TParams& params) const {
    params.s_Influences.clear();

    maths::common::CModelProbabilityParams computeProbabilityParams;
    for (std::size_t i = 0; i < params.s_Tail.size(); ++i) {
        addCoordinate(params.s_Tail[i], i, computeProbabilityParams);
    }

    if (computeProbabilityParams.calculations() > 0) {
        computeProbabilityParams
            .seasonalConfidenceInterval(
                params.s_ComputeProbabilityParams.seasonalConfidenceInterval())
            .addWeights(params.s_ComputeProbabilityParams.weights()[0]);

        TStrCRefDouble1VecDoublePrPrVec& influencerValues{params.s_InfluencerValues};
        if (model_t::dimension(params.s_Feature) == 1) {
            std::sort(influencerValues.begin(), influencerValues.end(),
                      CDecreasingMeanInfluence(params.s_Tail[0], params.s_Value[0],
                                               params.s_Count));
        }
        LOG_TRACE(<< "influencerValues = " << params.s_InfluencerValues);

        doComputeInfluences(params.s_Feature, CMeanDifference(), complementInfluence,
                            *params.s_Model, params.s_ElapsedTime, computeProbabilityParams,
                            params.s_Time, params.s_Value[0], params.s_Count,
                            params.s_InfluencerName, params.s_InfluencerValues,
                            params.s_Cutoff, params.s_IncludeCutoff, params.s_Influences);
    }
}

void CMeanInfluenceCalculator::computeInfluences(TCorrelateParams& params) const {
    params.s_Influences.clear();

    maths::common::CModelProbabilityParams computeProbabilityParams;
    addCoordinate(params.s_Tail[0], 0, computeProbabilityParams);

    if (computeProbabilityParams.calculations() > 0) {
        std::size_t correlate{params.s_MostAnomalousCorrelate[0]};
        computeProbabilityParams
            .seasonalConfidenceInterval(
                params.s_ComputeProbabilityParams.seasonalConfidenceInterval())
            .addWeights(params.s_ComputeProbabilityParams.weights()[correlate])
            .mostAnomalousCorrelate(correlate);
        LOG_TRACE(<< "influencerValues = " << params.s_InfluencerValues);

        doComputeCorrelateInfluences(
            params.s_Feature, CMeanDifference(), complementInfluence, *params.s_Model,
            params.s_ElapsedTime, computeProbabilityParams, params.s_Times[correlate],
            params.s_Values[correlate], params.s_Counts[correlate], params.s_InfluencerName,
            params.s_InfluencerValues, params.s_Cutoff, params.s_IncludeCutoff,
            params.s_Influences);
    }
}

////// CVarianceInfluenceCalculator //////

void CVarianceInfluenceCalculator::computeInfluences(TParams& params) const {
    params.s_Influences.clear();

    maths::common::CModelProbabilityParams computeProbabilityParams;
    for (std::size_t i = 0; i < params.s_Tail.size(); ++i) {
        addCoordinate(params.s_Tail[i], i, computeProbabilityParams);
    }

    if (computeProbabilityParams.calculations() > 0) {
        computeProbabilityParams
            .seasonalConfidenceInterval(
                params.s_ComputeProbabilityParams.seasonalConfidenceInterval())
            .addWeights(params.s_ComputeProbabilityParams.weights()[0]);

        TStrCRefDouble1VecDoublePrPrVec& influencerValues{params.s_InfluencerValues};
        if (model_t::dimension(params.s_Feature) == 1) {
            std::sort(influencerValues.begin(), influencerValues.end(),
                      CDecreasingVarianceInfluence(params.s_Tail[0], params.s_Value[0],
                                                   params.s_Count));
        }
        LOG_TRACE(<< "influencerValues = " << influencerValues);

        doComputeInfluences(params.s_Feature, CVarianceDifference(), complementInfluence,
                            *params.s_Model, params.s_ElapsedTime, computeProbabilityParams,
                            params.s_Time, params.s_Value[0], params.s_Count,
                            params.s_InfluencerName, params.s_InfluencerValues,
                            params.s_Cutoff, params.s_IncludeCutoff, params.s_Influences);
    }
}

void CVarianceInfluenceCalculator::computeInfluences(TCorrelateParams& params) const {
    params.s_Influences.clear();

    maths::common::CModelProbabilityParams computeProbabilityParams;
    addCoordinate(params.s_Tail[0], 0, computeProbabilityParams);

    if (computeProbabilityParams.calculations() > 0) {
        std::size_t correlate{params.s_MostAnomalousCorrelate[0]};
        computeProbabilityParams
            .seasonalConfidenceInterval(
                params.s_ComputeProbabilityParams.seasonalConfidenceInterval())
            .addWeights(params.s_ComputeProbabilityParams.weights()[correlate])
            .mostAnomalousCorrelate(correlate);
        LOG_TRACE(<< "influencerValues = " << params.s_InfluencerValues);

        doComputeCorrelateInfluences(
            params.s_Feature, CVarianceDifference(), complementInfluence, *params.s_Model,
            params.s_ElapsedTime, computeProbabilityParams, params.s_Times[correlate],
            params.s_Values[correlate], params.s_Counts[correlate], params.s_InfluencerName,
            params.s_InfluencerValues, params.s_Cutoff, params.s_IncludeCutoff,
            params.s_Influences);
    }
}
}
}