lib/model/CEventRatePopulationModelFactory.cc (234 lines of code) (raw):
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#include <model/CEventRatePopulationModelFactory.h>
#include <core/CMemoryDefStd.h>
#include <maths/common/CConstantPrior.h>
#include <maths/common/CGammaRateConjugate.h>
#include <maths/common/CLogNormalMeanPrecConjugate.h>
#include <maths/common/CMultimodalPrior.h>
#include <maths/common/CMultivariatePrior.h>
#include <maths/common/CNormalMeanPrecConjugate.h>
#include <maths/common/COneOfNPrior.h>
#include <maths/common/CPoissonMeanConjugate.h>
#include <maths/common/CXMeansOnline1d.h>
#include <model/CDataGatherer.h>
#include <model/CEventRatePopulationModel.h>
#include <memory>
namespace ml {
namespace model {
CEventRatePopulationModelFactory::CEventRatePopulationModelFactory(
const SModelParams& params,
const TInterimBucketCorrectorWPtr& interimBucketCorrector,
model_t::ESummaryMode summaryMode,
const std::string& summaryCountFieldName)
: CModelFactory(params, interimBucketCorrector), m_SummaryMode(summaryMode),
m_SummaryCountFieldName(summaryCountFieldName) {
}
// Returns a copy of this factory, created with the copy constructor and
// allocated with new. The result is handed back as a raw pointer.
CEventRatePopulationModelFactory* CEventRatePopulationModelFactory::clone() const {
    auto* copy = new CEventRatePopulationModelFactory(*this);
    return copy;
}
// Builds a new population event rate model around the data gatherer
// carried in initData.
//
// Returns nullptr, after logging an error, when initData holds no data
// gatherer. Otherwise returns a raw pointer to a model allocated with new.
CAnomalyDetectorModel*
CEventRatePopulationModelFactory::makeModel(const SModelInitializationData& initData) const {
    TDataGathererPtr gatherer = initData.s_DataGatherer;
    if (!gatherer) {
        LOG_ERROR(<< "NULL data gatherer");
        return nullptr;
    }

    const TFeatureVec& gathererFeatures = gatherer->features();

    // One entry of influence calculators per configured influence field.
    TFeatureInfluenceCalculatorCPtrPrVecVec calculators;
    calculators.reserve(m_InfluenceFieldNames.size());
    for (const auto& influencer : m_InfluenceFieldNames) {
        calculators.push_back(this->defaultInfluenceCalculators(influencer, gathererFeatures));
    }

    return new CEventRatePopulationModel(
        this->modelParams(), gatherer,
        this->defaultFeatureModels(gathererFeatures, gatherer->bucketLength(),
                                   this->minimumSeasonalVarianceScale(), false),
        this->defaultCorrelatePriors(gathererFeatures),
        this->defaultCorrelates(gathererFeatures), calculators,
        this->interimBucketCorrector());
}
// Builds a population event rate model around the data gatherer carried
// in initData, additionally passing the state traverser through to the
// model constructor.
//
// Returns nullptr, after logging an error, when initData holds no data
// gatherer. Otherwise returns a raw pointer to a model allocated with new.
CAnomalyDetectorModel*
CEventRatePopulationModelFactory::makeModel(const SModelInitializationData& initData,
                                            core::CStateRestoreTraverser& traverser) const {
    TDataGathererPtr gatherer = initData.s_DataGatherer;
    if (!gatherer) {
        LOG_ERROR(<< "NULL data gatherer");
        return nullptr;
    }

    const TFeatureVec& gathererFeatures = gatherer->features();

    // One entry of influence calculators per configured influence field.
    TFeatureInfluenceCalculatorCPtrPrVecVec calculators;
    calculators.reserve(m_InfluenceFieldNames.size());
    for (const auto& influencer : m_InfluenceFieldNames) {
        calculators.push_back(this->defaultInfluenceCalculators(influencer, gathererFeatures));
    }

    return new CEventRatePopulationModel(
        this->modelParams(), gatherer,
        this->defaultFeatureModels(gathererFeatures, gatherer->bucketLength(),
                                   this->minimumSeasonalVarianceScale(), false),
        this->defaultCorrelatePriors(gathererFeatures),
        this->defaultCorrelates(gathererFeatures), calculators,
        this->interimBucketCorrector(), traverser);
}
// Creates a new data gatherer of category E_PopulationEventRate.
//
// The bucket gatherer settings are assembled from the field names stored
// on this factory; the start time comes from initData and the sample
// override count is fixed at zero.
CModelFactory::TDataGathererPtr
CEventRatePopulationModelFactory::makeDataGatherer(const SGathererInitializationData& initData) const {
    const CBucketGatherer::SBucketGathererInitData gathererConfig{
        .s_SummaryCountFieldName = m_SummaryCountFieldName,
        .s_PersonFieldName = m_PersonFieldName,
        .s_AttributeFieldName = m_AttributeFieldName,
        .s_ValueFieldName = m_ValueFieldName,
        .s_InfluenceFieldNames = m_InfluenceFieldNames,
        .s_StartTime = initData.s_StartTime,
        .s_SampleOverrideCount = 0};

    return std::make_shared<CDataGatherer>(
        model_t::E_PopulationEventRate, m_SummaryMode, this->modelParams(),
        initData.s_PartitionFieldValue, this->searchKey(), m_Features, gathererConfig);
}
// Creates a data gatherer of category E_PopulationEventRate, passing the
// state traverser to the CDataGatherer constructor (presumably so it can
// restore persisted state; confirm against CDataGatherer).
//
// The bucket gatherer settings are assembled from the field names stored
// on this factory, with the start time and the sample override count both
// set to zero.
CModelFactory::TDataGathererPtr
CEventRatePopulationModelFactory::makeDataGatherer(const std::string& partitionFieldValue,
                                                   core::CStateRestoreTraverser& traverser) const {
    const CBucketGatherer::SBucketGathererInitData gathererConfig{
        .s_SummaryCountFieldName = m_SummaryCountFieldName,
        .s_PersonFieldName = m_PersonFieldName,
        .s_AttributeFieldName = m_AttributeFieldName,
        .s_ValueFieldName = m_ValueFieldName,
        .s_InfluenceFieldNames = m_InfluenceFieldNames,
        .s_StartTime = 0,
        .s_SampleOverrideCount = 0};

    return std::make_shared<CDataGatherer>(
        model_t::E_PopulationEventRate, m_SummaryMode, this->modelParams(),
        partitionFieldValue, this->searchKey(), gathererConfig, traverser);
}
// Creates the default univariate prior for a feature.
//
// Returns nullptr for categorical features, a CConstantPrior for constant
// features and the time-of-day prior for diurnal features. For every other
// feature the result is a one-of-n prior over gamma, log-normal, normal and
// Poisson components, plus a multimodal component when
// params.s_MinimumModeFraction is at most 0.5.
CEventRatePopulationModelFactory::TPriorPtr
CEventRatePopulationModelFactory::defaultPrior(model_t::EFeature feature,
                                               const SModelParams& params) const {
    namespace common = maths::common;

    // Categorical features are not handled here, so no prior is returned
    // for them. (The original note says their priors are managed by
    // defaultCategoricalPrior.)
    if (model_t::isCategorical(feature)) {
        return nullptr;
    }

    // A feature which only ever takes a single value gets the dedicated
    // constant prior.
    if (model_t::isConstant(feature)) {
        return std::make_unique<common::CConstantPrior>();
    }

    if (model_t::isDiurnal(feature)) {
        return this->timeOfDayPrior(params);
    }

    // What remains is modelled as event counts per interval, i.e. values
    // greater than or equal to zero. The log-normal p.d.f. is zero at zero
    // and the gamma p.d.f. is badly behaved there, which is why these two
    // priors take an extra argument after the data type.
    // NOTE(review): the original comment speaks of a "small non-zero
    // offset" but the value passed is 0.0; presumably the priors adjust it
    // internally. Confirm against the prior implementations.
    maths_t::EDataType dataType = this->dataType();
    const bool withMultimodal = params.s_MinimumModeFraction <= 0.5;

    common::CGammaRateConjugate gamma =
        common::CGammaRateConjugate::nonInformativePrior(dataType, 0.0, params.s_DecayRate);
    common::CLogNormalMeanPrecConjugate logNormal =
        common::CLogNormalMeanPrecConjugate::nonInformativePrior(
            dataType, 0.0, params.s_DecayRate);
    common::CNormalMeanPrecConjugate normal =
        common::CNormalMeanPrecConjugate::nonInformativePrior(dataType, params.s_DecayRate);
    common::CPoissonMeanConjugate poisson =
        common::CPoissonMeanConjugate::nonInformativePrior(0.0, params.s_DecayRate);

    // The candidate models the one-of-n prior selects between.
    common::COneOfNPrior::TPriorPtrVec candidates;
    candidates.reserve(withMultimodal ? 5 : 4);
    candidates.emplace_back(gamma.clone());
    candidates.emplace_back(logNormal.clone());
    candidates.emplace_back(normal.clone());
    candidates.emplace_back(poisson.clone());

    if (withMultimodal) {
        // Each mode of the multimodal prior is itself a one-of-n prior
        // over the gamma, log-normal and normal models.
        common::COneOfNPrior::TPriorPtrVec perModeCandidates;
        perModeCandidates.reserve(3);
        perModeCandidates.emplace_back(gamma.clone());
        perModeCandidates.emplace_back(logNormal.clone());
        perModeCandidates.emplace_back(normal.clone());
        common::COneOfNPrior perModePrior(perModeCandidates, dataType, params.s_DecayRate);

        common::CXMeansOnline1d modeClusterer(
            dataType, common::CAvailableModeDistributions::ALL,
            maths_t::E_ClustersFractionWeight, params.s_DecayRate,
            params.s_MinimumModeFraction, params.s_MinimumModeCount,
            params.minimumCategoryCount());

        common::CMultimodalPrior multimodal(dataType, modeClusterer, perModePrior,
                                            params.s_DecayRate);
        candidates.emplace_back(multimodal.clone());
    }

    return std::make_unique<common::COneOfNPrior>(candidates, dataType, params.s_DecayRate);
}
// Creates the default multivariate prior for a feature.
//
// The dimension is taken from model_t::dimension(feature). The result is a
// multivariate one-of-n prior over a normal prior and, when
// params.s_MinimumModeFraction is at most 0.5, a multimodal prior which is
// built from that normal prior.
CEventRatePopulationModelFactory::TMultivariatePriorUPtr
CEventRatePopulationModelFactory::defaultMultivariatePrior(model_t::EFeature feature,
                                                           const SModelParams& params) const {
    const std::size_t dimension = model_t::dimension(feature);
    const bool withMultimodal = params.s_MinimumModeFraction <= 0.5;

    TMultivariatePriorUPtrVec candidates;
    candidates.reserve(withMultimodal ? 2 : 1);
    candidates.push_back(
        TMultivariatePriorUPtr{this->multivariateNormalPrior(dimension, params)});

    if (withMultimodal) {
        // The normal prior added just above is handed to the multimodal prior.
        auto& normal = *candidates.back();
        candidates.push_back(this->multivariateMultimodalPrior(dimension, params, normal));
    }

    return this->multivariateOneOfNPrior(dimension, params, candidates);
}
// Creates the default prior for correlated pairs; the feature argument is
// unused and the dimension is fixed at two.
//
// The result is a multivariate one-of-n prior over a normal prior and,
// when params.s_MinimumModeFraction is at most 0.5, a multimodal prior
// which is built from that normal prior.
CEventRatePopulationModelFactory::TMultivariatePriorUPtr
CEventRatePopulationModelFactory::defaultCorrelatePrior(model_t::EFeature /*feature*/,
                                                        const SModelParams& params) const {
    constexpr std::size_t dimension{2};
    const bool withMultimodal = params.s_MinimumModeFraction <= 0.5;

    TMultivariatePriorUPtrVec candidates;
    candidates.reserve(withMultimodal ? 2u : 1u);
    candidates.push_back(
        TMultivariatePriorUPtr{this->multivariateNormalPrior(dimension, params)});

    if (withMultimodal) {
        // The normal prior added just above is handed to the multimodal prior.
        auto& normal = *candidates.back();
        candidates.push_back(this->multivariateMultimodalPrior(dimension, params, normal));
    }

    return this->multivariateOneOfNPrior(dimension, params, candidates);
}
// Returns the search key for this factory's current configuration.
//
// The key is built on first use and kept in m_SearchKeyCache; the setters
// which change any of its inputs reset that cache so the key is rebuilt on
// the next call.
const CSearchKey& CEventRatePopulationModelFactory::searchKey() const {
    if (m_SearchKeyCache.has_value()) {
        return *m_SearchKeyCache;
    }

    m_SearchKeyCache.emplace(m_DetectorIndex, function_t::function(m_Features),
                             m_UseNull, this->modelParams().s_ExcludeFrequent,
                             m_ValueFieldName, m_AttributeFieldName, m_PersonFieldName,
                             m_PartitionFieldName, m_InfluenceFieldNames);
    return *m_SearchKeyCache;
}
// Always reports false for this factory.
bool CEventRatePopulationModelFactory::isSimpleCount() const {
    constexpr bool simpleCount{false};
    return simpleCount;
}
// Returns the summary mode supplied when the factory was constructed.
model_t::ESummaryMode CEventRatePopulationModelFactory::summaryMode() const {
    return this->m_SummaryMode;
}
// Always reports integer data for this factory.
maths_t::EDataType CEventRatePopulationModelFactory::dataType() const {
    constexpr maths_t::EDataType type{maths_t::E_IntegerData};
    return type;
}
// Sets the detector index. The cached search key is discarded because the
// detector index is one of the values it is built from.
void CEventRatePopulationModelFactory::detectorIndex(int detectorIndex) {
    m_SearchKeyCache.reset();
    m_DetectorIndex = detectorIndex;
}
// Sets the field names used by this factory.
//
// The "over" field is stored as the person field and the "by" field as the
// attribute field. The cached search key is discarded because it is built
// from these names.
void CEventRatePopulationModelFactory::fieldNames(const std::string& partitionFieldName,
                                                  const std::string& overFieldName,
                                                  const std::string& byFieldName,
                                                  const std::string& valueFieldName,
                                                  const TStrVec& influenceFieldNames) {
    m_SearchKeyCache.reset();

    m_PartitionFieldName = partitionFieldName;
    m_PersonFieldName = overFieldName;
    m_AttributeFieldName = byFieldName;
    m_ValueFieldName = valueFieldName;
    m_InfluenceFieldNames = influenceFieldNames;
}
// Sets the use-null flag. The cached search key is discarded because the
// flag is one of the values it is built from.
void CEventRatePopulationModelFactory::useNull(bool useNull) {
    m_SearchKeyCache.reset();
    m_UseNull = useNull;
}
// Sets the features to model. The cached search key is discarded because
// the function it stores is derived from the features.
void CEventRatePopulationModelFactory::features(const TFeatureVec& features) {
    m_SearchKeyCache.reset();
    m_Features = features;
}
// Returns the non-empty field names among the partition, person and
// attribute fields, in that order.
CEventRatePopulationModelFactory::TStrCRefVec
CEventRatePopulationModelFactory::partitioningFields() const {
    TStrCRefVec fields;
    fields.reserve(3);

    // Fields which have not been configured (empty names) are left out.
    const auto appendIfSet = [&fields](const auto& fieldName) {
        if (!fieldName.empty()) {
            fields.emplace_back(fieldName);
        }
    };
    appendIfSet(m_PartitionFieldName);
    appendIfSet(m_PersonFieldName);
    appendIfSet(m_AttributeFieldName);

    return fields;
}
// Always returns 1.0; makeModel passes this value to defaultFeatureModels.
double CEventRatePopulationModelFactory::minimumSeasonalVarianceScale() const {
    constexpr double scale{1.0};
    return scale;
}
}
}