lib/model/CEventRateModelFactory.cc (222 lines of code) (raw):
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <model/CEventRateModelFactory.h>
#include <maths/common/CConstantPrior.h>
#include <maths/common/CGammaRateConjugate.h>
#include <maths/common/CLogNormalMeanPrecConjugate.h>
#include <maths/common/CMultimodalPrior.h>
#include <maths/common/CMultivariatePrior.h>
#include <maths/common/CNormalMeanPrecConjugate.h>
#include <maths/common/COneOfNPrior.h>
#include <maths/common/CPoissonMeanConjugate.h>
#include <maths/common/CXMeansOnline1d.h>
#include <model/CDataGatherer.h>
#include <model/CEventRateModel.h>
#include <model/CSearchKey.h>
#include <memory>
namespace ml {
namespace model {
CEventRateModelFactory::CEventRateModelFactory(const SModelParams& params,
const TInterimBucketCorrectorWPtr& interimBucketCorrector,
model_t::ESummaryMode summaryMode,
const std::string& summaryCountFieldName)
: CModelFactory(params, interimBucketCorrector), m_SummaryMode(summaryMode),
m_SummaryCountFieldName(summaryCountFieldName) {
}
// Create a copy of this factory using the copy constructor.
//
// Returns a raw pointer to a newly heap-allocated object; presumably the
// caller is responsible for deleting it (confirm against the callers).
CEventRateModelFactory* CEventRateModelFactory::clone() const {
    auto* duplicate = new CEventRateModelFactory(*this);
    return duplicate;
}
// Create a new event rate model which uses the data gatherer supplied in
// initData.
//
// Returns nullptr (after logging an error) if initData holds no data
// gatherer, otherwise a raw pointer to a newly allocated CEventRateModel.
CAnomalyDetectorModel*
CEventRateModelFactory::makeModel(const SModelInitializationData& initData) const {
    TDataGathererPtr gatherer = initData.s_DataGatherer;
    if (!gatherer) {
        LOG_ERROR(<< "NULL data gatherer");
        return nullptr;
    }

    const TFeatureVec& modelledFeatures = gatherer->features();

    // One collection of influence calculators per influence field name, in
    // the same order as the names.
    TFeatureInfluenceCalculatorCPtrPrVecVec calculators;
    calculators.reserve(m_InfluenceFieldNames.size());
    for (const auto& influencer : m_InfluenceFieldNames) {
        calculators.push_back(this->defaultInfluenceCalculators(influencer, modelledFeatures));
    }

    const auto bucketLength = gatherer->bucketLength();
    const auto minimumVarianceScale = this->minimumSeasonalVarianceScale();

    return new CEventRateModel(
        this->modelParams(), gatherer,
        this->defaultFeatureModels(modelledFeatures, bucketLength, minimumVarianceScale, true),
        this->defaultCorrelatePriors(modelledFeatures),
        this->defaultCorrelates(modelledFeatures),
        this->defaultCategoricalPrior(), calculators,
        this->interimBucketCorrector());
}
// Create an event rate model whose state is read from traverser, using the
// data gatherer supplied in initData.
//
// Returns nullptr (after logging an error) if initData holds no data
// gatherer, otherwise a raw pointer to a newly allocated CEventRateModel.
// Unlike the overload without a traverser, no categorical prior is passed
// to the model constructor here.
CAnomalyDetectorModel*
CEventRateModelFactory::makeModel(const SModelInitializationData& initData,
                                  core::CStateRestoreTraverser& traverser) const {
    TDataGathererPtr gatherer = initData.s_DataGatherer;
    if (!gatherer) {
        LOG_ERROR(<< "NULL data gatherer");
        return nullptr;
    }

    const TFeatureVec& modelledFeatures = gatherer->features();

    // One collection of influence calculators per influence field name, in
    // the same order as the names.
    TFeatureInfluenceCalculatorCPtrPrVecVec calculators;
    calculators.reserve(m_InfluenceFieldNames.size());
    for (const auto& influencer : m_InfluenceFieldNames) {
        calculators.push_back(this->defaultInfluenceCalculators(influencer, modelledFeatures));
    }

    const auto bucketLength = gatherer->bucketLength();
    const auto minimumVarianceScale = this->minimumSeasonalVarianceScale();

    return new CEventRateModel(
        this->modelParams(), gatherer,
        this->defaultFeatureModels(modelledFeatures, bucketLength, minimumVarianceScale, true),
        this->defaultCorrelatePriors(modelledFeatures),
        this->defaultCorrelates(modelledFeatures), calculators,
        this->interimBucketCorrector(), traverser);
}
// Create a new event rate data gatherer for the partition named in
// initData, starting at initData.s_StartTime and using this factory's
// field names, features, summary mode and search key.
CModelFactory::TDataGathererPtr
CEventRateModelFactory::makeDataGatherer(const SGathererInitializationData& initData) const {
    // The third entry is always the empty string: presumably it is the
    // attribute (over) field name, which this factory does not use -
    // confirm against SBucketGathererInitData.
    CBucketGatherer::SBucketGathererInitData gathererInit{
        m_SummaryCountFieldName, m_PersonFieldName,     EMPTY_STRING,
        m_ValueFieldName,        m_InfluenceFieldNames, initData.s_StartTime,
        initData.s_SampleOverrideCount};

    return std::make_shared<CDataGatherer>(model_t::E_EventRate, m_SummaryMode,
                                           this->modelParams(),
                                           initData.s_PartitionFieldValue,
                                           this->searchKey(), m_Features, gathererInit);
}
// Create an event rate data gatherer for partitionFieldValue whose state
// is read from traverser.
//
// The last two entries of the bucket gatherer initialisation data (the
// start time and sample override count in the other overload) are passed
// as zero; presumably the real values come from the persisted state -
// confirm against CDataGatherer.
CModelFactory::TDataGathererPtr
CEventRateModelFactory::makeDataGatherer(const std::string& partitionFieldValue,
                                         core::CStateRestoreTraverser& traverser) const {
    CBucketGatherer::SBucketGathererInitData gathererInit{
        m_SummaryCountFieldName, m_PersonFieldName,     EMPTY_STRING,
        m_ValueFieldName,        m_InfluenceFieldNames, 0,
        0};

    return std::make_shared<CDataGatherer>(model_t::E_EventRate, m_SummaryMode,
                                           this->modelParams(), partitionFieldValue,
                                           this->searchKey(), gathererInit, traverser);
}
// Build the default univariate prior for feature.
//
// Returns nullptr for categorical features, a constant prior for constant
// features, the time-of-day prior for diurnal features and otherwise a
// one-of-n prior over gamma, log-normal, normal and Poisson models, plus a
// multimodal model when params.s_MinimumModeFraction is at most 0.5.
CEventRateModelFactory::TPriorPtr
CEventRateModelFactory::defaultPrior(model_t::EFeature feature,
                                     const SModelParams& params) const {
    // Categorical features all use the multinomial prior, whose creation
    // is handled by defaultCategoricalPrior, so nothing is created here.
    if (model_t::isCategorical(feature)) {
        return nullptr;
    }

    // A feature which only ever takes one value gets a special lightweight
    // prior.
    if (model_t::isConstant(feature)) {
        return std::make_unique<maths::common::CConstantPrior>();
    }

    // Time-of-day and time-of-week are modelled by a Gaussian mixture.
    if (model_t::isDiurnal(feature)) {
        return this->timeOfDayPrior(params);
    }

    // What remains are counts of events in an interval, i.e. values which
    // are greater than or equal to zero. The log-normal p.d.f. is zero at
    // zero and the gamma p.d.f. is badly behaved there (either zero or
    // infinity). NOTE(review): the offset argument passed to both of these
    // priors below is 0.0, whereas earlier commentary spoke of a small
    // non-zero offset; presumably the priors adjust the offset themselves
    // so that counts of zero can be modelled - confirm.
    const maths_t::EDataType type = this->dataType();
    const auto decayRate = params.s_DecayRate;
    const bool includeMultimodal = params.s_MinimumModeFraction <= 0.5;

    auto gamma = maths::common::CGammaRateConjugate::nonInformativePrior(type, 0.0, decayRate);
    auto logNormal = maths::common::CLogNormalMeanPrecConjugate::nonInformativePrior(
        type, 0.0, decayRate);
    auto normal = maths::common::CNormalMeanPrecConjugate::nonInformativePrior(type, decayRate);
    auto poisson = maths::common::CPoissonMeanConjugate::nonInformativePrior(0.0, decayRate);

    // The candidate models which the one-of-n prior selects between.
    maths::common::COneOfNPrior::TPriorPtrVec candidates;
    candidates.reserve(includeMultimodal ? 5 : 4);
    candidates.emplace_back(gamma.clone());
    candidates.emplace_back(logNormal.clone());
    candidates.emplace_back(normal.clone());
    candidates.emplace_back(poisson.clone());

    if (includeMultimodal) {
        // Each mode of the multimodal prior is itself a one-of-n prior
        // over the gamma, log-normal and normal models.
        maths::common::COneOfNPrior::TPriorPtrVec modeCandidates;
        modeCandidates.reserve(3);
        modeCandidates.emplace_back(gamma.clone());
        modeCandidates.emplace_back(logNormal.clone());
        modeCandidates.emplace_back(normal.clone());
        maths::common::COneOfNPrior modeSeed(modeCandidates, type, decayRate);

        maths::common::CXMeansOnline1d modeClusterer(
            type, maths::common::CAvailableModeDistributions::ALL,
            maths_t::E_ClustersFractionWeight, decayRate,
            params.s_MinimumModeFraction, params.s_MinimumModeCount,
            params.minimumCategoryCount());

        maths::common::CMultimodalPrior multimodal(type, modeClusterer, modeSeed, decayRate);
        candidates.emplace_back(multimodal.clone());
    }

    return std::make_unique<maths::common::COneOfNPrior>(candidates, type, decayRate);
}
// Build the default multivariate prior for feature.
//
// The result is a one-of-n prior over a multivariate normal model and,
// when params.s_MinimumModeFraction is at most 0.5, a multimodal model
// seeded with that normal model. The dimension is model_t::dimension(feature).
CEventRateModelFactory::TMultivariatePriorUPtr
CEventRateModelFactory::defaultMultivariatePrior(model_t::EFeature feature,
                                                 const SModelParams& params) const {
    const bool includeMultimodal = params.s_MinimumModeFraction <= 0.5;
    const std::size_t dimension = model_t::dimension(feature);

    TMultivariatePriorUPtrVec candidates;
    candidates.reserve(includeMultimodal ? 2 : 1);

    TMultivariatePriorUPtr normalPrior(this->multivariateNormalPrior(dimension, params));
    candidates.push_back(std::move(normalPrior));

    if (includeMultimodal) {
        // The normal prior added above is the template for each mode.
        auto& modeSeed = *candidates.back();
        candidates.push_back(this->multivariateMultimodalPrior(dimension, params, modeSeed));
    }

    return this->multivariateOneOfNPrior(dimension, params, candidates);
}
// Build the default prior for modelling a pair of correlated values.
//
// The result is a two dimensional one-of-n prior over a multivariate
// normal model and, when params.s_MinimumModeFraction is at most 0.5, a
// multimodal model seeded with that normal model. The feature argument is
// not used.
CEventRateModelFactory::TMultivariatePriorUPtr
CEventRateModelFactory::defaultCorrelatePrior(model_t::EFeature /*feature*/,
                                              const SModelParams& params) const {
    const bool includeMultimodal = params.s_MinimumModeFraction <= 0.5;
    const std::size_t dimension = 2;

    TMultivariatePriorUPtrVec candidates;
    candidates.reserve(includeMultimodal ? 2 : 1);

    TMultivariatePriorUPtr normalPrior(this->multivariateNormalPrior(dimension, params));
    candidates.push_back(std::move(normalPrior));

    if (includeMultimodal) {
        // The normal prior added above is the template for each mode.
        auto& modeSeed = *candidates.back();
        candidates.push_back(this->multivariateMultimodalPrior(dimension, params, modeSeed));
    }

    return this->multivariateOneOfNPrior(dimension, params, candidates);
}
// Get the search key for this factory's configuration.
//
// The key is built on first use from the detector index, function, null
// handling, exclude frequent setting and field names, then cached. The
// setters in this file which change any of those inputs clear the cache.
// No locking is performed here when the cache is filled.
const CSearchKey& CEventRateModelFactory::searchKey() const {
    if (!m_SearchKeyCache.has_value()) {
        const auto function = function_t::function(m_Features);
        m_SearchKeyCache.emplace(m_DetectorIndex, function, m_UseNull,
                                 this->modelParams().s_ExcludeFrequent,
                                 m_ValueFieldName, m_PersonFieldName, "",
                                 m_PartitionFieldName, m_InfluenceFieldNames);
    }
    return *m_SearchKeyCache;
}
// Check whether this factory's configuration is a simple count, as decided
// by CSearchKey::isSimpleCount from the function implied by the features
// and the person field name.
bool CEventRateModelFactory::isSimpleCount() const {
    const auto function = function_t::function(m_Features);
    return CSearchKey::isSimpleCount(function, m_PersonFieldName);
}
// Get the summary mode which was supplied to the constructor.
model_t::ESummaryMode CEventRateModelFactory::summaryMode() const {
return m_SummaryMode;
}
// Get the type of data the priors created by this factory model. This is
// always integer data; defaultPrior treats the modelled values as counts
// of events.
maths_t::EDataType CEventRateModelFactory::dataType() const {
return maths_t::E_IntegerData;
}
// Set the detector index. The index is part of the search key so the
// cached key is cleared and will be rebuilt on the next call to searchKey.
void CEventRateModelFactory::detectorIndex(int detectorIndex) {
m_DetectorIndex = detectorIndex;
m_SearchKeyCache.reset();
}
// Set the field names used by the models and gatherers this factory
// creates. The over field name is ignored and the by field name is stored
// as the person field name. The field names are part of the search key so
// the cached key is cleared and will be rebuilt on the next call to
// searchKey.
void CEventRateModelFactory::fieldNames(const std::string& partitionFieldName,
const std::string& /*overFieldName*/,
const std::string& byFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames) {
m_PartitionFieldName = partitionFieldName;
m_PersonFieldName = byFieldName;
m_ValueFieldName = valueFieldName;
m_InfluenceFieldNames = influenceFieldNames;
m_SearchKeyCache.reset();
}
// Set the use-null flag. The flag is part of the search key so the cached
// key is cleared and will be rebuilt on the next call to searchKey.
void CEventRateModelFactory::useNull(bool useNull) {
m_UseNull = useNull;
m_SearchKeyCache.reset();
}
// Set the features to model. The function used in the search key is
// derived from the features so the cached key is cleared and will be
// rebuilt on the next call to searchKey.
void CEventRateModelFactory::features(const TFeatureVec& features) {
m_Features = features;
m_SearchKeyCache.reset();
}
// Get the minimum seasonal variance scale, which makeModel passes to
// defaultFeatureModels. The value 0.4 is hard-coded here; its rationale is
// not documented in this file.
double CEventRateModelFactory::minimumSeasonalVarianceScale() const {
return 0.4;
}
// Get references to the non-empty partitioning field names: the partition
// field name first, then the person field name. The references refer to
// this factory's members.
CEventRateModelFactory::TStrCRefVec CEventRateModelFactory::partitioningFields() const {
    TStrCRefVec fields;
    fields.reserve(2);

    auto appendIfSet = [&fields](const std::string& fieldName) {
        if (!fieldName.empty()) {
            fields.emplace_back(fieldName);
        }
    };
    appendIfSet(m_PartitionFieldName);
    appendIfSet(m_PersonFieldName);

    return fields;
}
}
}