include/model/CModelFactory.h (197 lines of code) (raw):
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#ifndef INCLUDED_ml_model_CModelFactory_h
#define INCLUDED_ml_model_CModelFactory_h
#include <core/CoreTypes.h>
#include <maths/common/COrderings.h>
#include <maths/common/MathsTypes.h>
#include <model/ImportExport.h>
#include <model/ModelTypes.h>
#include <model/SModelParams.h>
#include <map>
#include <memory>
#include <optional>
#include <vector>
namespace ml {
// Forward declaration: CStateRestoreTraverser is only used by reference
// in this header (the state-restoring makeModel / makeDataGatherer
// overloads).
namespace core {
class CStateRestoreTraverser;
}
// Forward declarations of the maths types named by this header. They
// appear below only inside smart pointer aliases, as a const reference
// parameter, or as function return types in declarations.
namespace maths {
namespace common {
class CModel;
class CMultinomialConjugate;
class CMultivariatePrior;
class CPrior;
}
namespace time_series {
class CTimeSeriesCorrelations;
class CTimeSeriesDecompositionInterface;
}
}
namespace model {
// Forward declarations of the model types named by CModelFactory. They
// appear below only via pointers, references, smart pointers or alias
// declarations.
class CAnomalyDetectorModel;
class CDataGatherer;
class CInfluenceCalculator;
class CInterimBucketCorrector;
class CSearchKey;
//! \brief A factory class interface for the CAnomalyDetectorModel hierarchy.
//!
//! DESCRIPTION:\n
//! The interface for the factory classes for making concrete objects
//! in the CAnomalyDetectorModel hierarchy.
//!
//! IMPLEMENTATION DECISIONS:\n
//! The CModelConfig object is able to dynamically configure the
//! anomaly detection classes:
//! -# CAnomalyDetector,
//! -# CAnomalyDetectorModel,
//! ...
//!
//! to either compute online or delta probabilities for log messages,
//! metric values, etc. This hierarchy implements the factory pattern
//! for the CAnomalyDetectorModel hierarchy for this purpose.
class MODEL_EXPORT CModelFactory {
public:
// Aliases for the feature lists, smart pointers and (feature, object)
// pair collections used throughout the factory interface.
using TFeatureVec = std::vector<model_t::EFeature>;
using TStrVec = std::vector<std::string>;
using TOptionalUInt = std::optional<unsigned int>;
using TStrCRef = std::reference_wrapper<const std::string>;
using TStrCRefVec = std::vector<TStrCRef>;
using TDataGathererPtr = std::shared_ptr<CDataGatherer>;
using TPriorPtr = std::unique_ptr<maths::common::CPrior>;
using TMultivariatePriorSPtr = std::shared_ptr<maths::common::CMultivariatePrior>;
using TMultivariatePriorUPtr = std::unique_ptr<maths::common::CMultivariatePrior>;
using TFeatureMultivariatePriorSPtrPr = std::pair<model_t::EFeature, TMultivariatePriorSPtr>;
using TFeatureMultivariatePriorSPtrPrVec = std::vector<TFeatureMultivariatePriorSPtrPr>;
using TDecompositionCPtr =
std::shared_ptr<const maths::time_series::CTimeSeriesDecompositionInterface>;
using TMathsModelPtr = std::shared_ptr<maths::common::CModel>;
using TFeatureMathsModelPtrPr = std::pair<model_t::EFeature, TMathsModelPtr>;
using TFeatureMathsModelPtrPrVec = std::vector<TFeatureMathsModelPtrPr>;
using TCorrelationsPtr = std::unique_ptr<maths::time_series::CTimeSeriesCorrelations>;
using TFeatureCorrelationsPtrPr = std::pair<model_t::EFeature, TCorrelationsPtr>;
using TFeatureCorrelationsPtrPrVec = std::vector<TFeatureCorrelationsPtrPr>;
using TModelPtr = std::shared_ptr<CAnomalyDetectorModel>;
using TModelCPtr = std::shared_ptr<const CAnomalyDetectorModel>;
using TInfluenceCalculatorCPtr = std::shared_ptr<const CInfluenceCalculator>;
using TFeatureInfluenceCalculatorCPtrPr =
std::pair<model_t::EFeature, TInfluenceCalculatorCPtr>;
using TFeatureInfluenceCalculatorCPtrPrVec = std::vector<TFeatureInfluenceCalculatorCPtrPr>;
using TFeatureInfluenceCalculatorCPtrPrVecVec =
std::vector<TFeatureInfluenceCalculatorCPtrPrVec>;
using TInterimBucketCorrectorWPtr = std::weak_ptr<CInterimBucketCorrector>;
using TInterimBucketCorrectorPtr = std::shared_ptr<CInterimBucketCorrector>;
using TDetectionRuleVec = std::vector<CDetectionRule>;
using TDetectionRuleVecCRef = std::reference_wrapper<const TDetectionRuleVec>;
using TStrDetectionRulePr = std::pair<std::string, model::CDetectionRule>;
using TStrDetectionRulePrVec = std::vector<TStrDetectionRulePr>;
using TStrDetectionRulePrVecCRef = std::reference_wrapper<const TStrDetectionRulePrVec>;
public:
//! Wrapper around the model initialization data.
//!
//! IMPLEMENTATION DECISIONS:\n
//! We wrap up the initialization data in an object so we don't
//! need to change the signature of every factory function each
//! time we need extra data to initialize a model.
struct MODEL_EXPORT SModelInitializationData {
//! \param[in] dataGatherer The data gatherer for the model.
//! \note This constructor is not explicit, so a TDataGathererPtr
//! converts implicitly to this type.
SModelInitializationData(const TDataGathererPtr& dataGatherer);
//! Shared pointer to the data gatherer (presumably a copy of the
//! constructor argument - confirm in the implementation).
TDataGathererPtr s_DataGatherer;
};
//! Wrapper around the data gatherer initialization data.
//!
//! IMPLEMENTATION DECISIONS:\n
//! We wrap up the initialization data in an object so we don't
//! need to change the signature of every factory function each
//! time we need extra data to initialize a data gatherer.
//!
//! \warning s_PartitionFieldValue is a reference member, so the string
//! it is bound to must outlive this object. Presumably it is bound to
//! the \p partitionFieldValue constructor argument, in which case a
//! temporary string must not be passed - confirm in the implementation.
struct MODEL_EXPORT SGathererInitializationData {
//! \param[in] startTime The start time for the data gatherer.
//! \param[in] partitionFieldValue The partition field value.
//! \param[in] sampleOverrideCount The sample count override; this
//! defaults to zero.
SGathererInitializationData(core_t::TTime startTime,
const std::string& partitionFieldValue,
unsigned int sampleOverrideCount = 0u);
//! This constructor is to simplify unit testing.
//!
//! \note It is not explicit, so a core_t::TTime converts implicitly
//! to this type.
SGathererInitializationData(const core_t::TTime startTime);
//! The start time.
core_t::TTime s_StartTime;
//! The partition field value, held by reference (see the warning
//! on this struct).
const std::string& s_PartitionFieldValue;
//! The sample count override.
unsigned int s_SampleOverrideCount;
};
public:
//! A string constant defined in the implementation file (by its name,
//! presumably the empty string).
static const std::string EMPTY_STRING;
public:
//! \param[in] params The global model configuration parameters.
//! \param[in] interimBucketCorrector The interim bucket correction
//! calculator, passed as a weak pointer.
//! \warning The user must ensure that \p interimBucketCorrector
//! outlives this object. If model factories are obtained from
//! CModelConfig this is ensured for you.
CModelFactory(const SModelParams& params,
const TInterimBucketCorrectorWPtr& interimBucketCorrector);
virtual ~CModelFactory() = default;
//! Create a copy of the factory owned by the calling code.
//!
//! \note This returns a raw pointer, so the caller is responsible
//! for deleting the copy.
virtual CModelFactory* clone() const = 0;
//! \name Factory Methods
//@{
//! Make a new model.
//!
//! \param[in] initData The parameters needed to initialize the model.
//! \warning It is owned by the calling code.
virtual CAnomalyDetectorModel*
makeModel(const SModelInitializationData& initData) const = 0;
//! Make a new model from part of a state document.
//!
//! \param[in] initData Additional parameters needed to initialize
//! the model.
//! \param[in,out] traverser A state document traverser.
//! \warning It is owned by the calling code.
virtual CAnomalyDetectorModel*
makeModel(const SModelInitializationData& initData,
core::CStateRestoreTraverser& traverser) const = 0;
//! Make a new data gatherer.
//!
//! \param[in] initData The parameters needed to initialize the
//! data gatherer.
//! \warning It is owned by the calling code.
virtual TDataGathererPtr
makeDataGatherer(const SGathererInitializationData& initData) const = 0;
//! Make a new data gatherer from part of a state document.
//!
//! \param[in,out] traverser A state document traverser.
//! \param[in] partitionFieldValue The partition field value.
//! \warning It is owned by the calling code.
virtual TDataGathererPtr
makeDataGatherer(const std::string& partitionFieldValue,
core::CStateRestoreTraverser& traverser) const = 0;
//@}
//! \name Defaults
//@{
//! Get the default models to use for \p features and \p bucketLength.
//!
//! \param[in] features The features for which to get the models.
//! \param[in] bucketLength The data bucketing length.
//! \param[in] minimumSeasonalVarianceScale The minimum seasonal
//! variance scale to use for the models.
//! \param[in] modelAnomalies Whether the models should model anomalies.
//! \note This returns a const reference. Presumably it refers to an
//! entry in this object's model cache and so is only valid while the
//! factory is alive - confirm in the implementation.
const TFeatureMathsModelPtrPrVec& defaultFeatureModels(const TFeatureVec& features,
core_t::TTime bucketLength,
double minimumSeasonalVarianceScale,
bool modelAnomalies) const;
//! Get the default model to use for \p feature and \p bucketLength.
//!
//! \param[in] feature The feature for which to get the model.
//! \param[in] bucketLength The data bucketing length.
//! \param[in] minimumSeasonalVarianceScale The minimum seasonal
//! variance scale to use for the model.
//! \param[in] modelAnomalies Whether the model should model anomalies.
TMathsModelPtr defaultFeatureModel(model_t::EFeature feature,
core_t::TTime bucketLength,
double minimumSeasonalVarianceScale,
bool modelAnomalies) const;
//! Get the default correlate priors to use for correlated pairs of time
//! series of \p features.
//!
//! \note This returns a const reference. Presumably it refers to an
//! entry in this object's correlate prior cache - confirm in the
//! implementation.
const TFeatureMultivariatePriorSPtrPrVec&
defaultCorrelatePriors(const TFeatureVec& features) const;
//! Get the default models for correlations of \p features.
TFeatureCorrelationsPtrPrVec defaultCorrelates(const TFeatureVec& features) const;
//! Get the default prior to use for \p feature.
TPriorPtr defaultPrior(model_t::EFeature feature) const;
//! Get the default prior to use for multivariate \p feature.
TMultivariatePriorUPtr defaultMultivariatePrior(model_t::EFeature feature) const;
//! Get the default prior to use for correlated pairs of time
//! series for univariate \p feature.
TMultivariatePriorUPtr defaultCorrelatePrior(model_t::EFeature feature) const;
//! Get the default prior for \p feature.
//!
//! \param[in] feature The feature for which to get the prior.
//! \param[in] params The model parameters.
virtual TPriorPtr defaultPrior(model_t::EFeature feature,
const SModelParams& params) const = 0;
//! Get the default prior for multivariate \p feature.
//!
//! \param[in] feature The feature for which to get the prior.
//! \param[in] params The model parameters.
virtual TMultivariatePriorUPtr
defaultMultivariatePrior(model_t::EFeature feature, const SModelParams& params) const = 0;
//! Get the default prior for pairs of correlated time series
//! of \p feature.
//!
//! \param[in] feature The feature for which to get the prior.
//! \param[in] params The model parameters.
virtual TMultivariatePriorUPtr
defaultCorrelatePrior(model_t::EFeature feature, const SModelParams& params) const = 0;
//! Get the default prior to use for categorical data.
maths::common::CMultinomialConjugate defaultCategoricalPrior() const;
//! Get the default time series decomposition.
//!
//! \param[in] feature The feature for which to get the decomposition.
//! \param[in] bucketLength The data bucketing length.
TDecompositionCPtr defaultDecomposition(model_t::EFeature feature,
core_t::TTime bucketLength) const;
//! Get the influence calculators to use for each feature in \p features.
//!
//! \param[in] influencerName The name of the influencer field.
//! \param[in] features The features for which to get the calculators.
//! \note This returns a const reference. Presumably it refers to an
//! entry in this object's influence calculator cache - confirm in
//! the implementation.
const TFeatureInfluenceCalculatorCPtrPrVec&
defaultInfluenceCalculators(const std::string& influencerName,
const TFeatureVec& features) const;
//@}
//! Get the search key corresponding to this factory.
virtual const CSearchKey& searchKey() const = 0;
//! Check if this makes the model used for a simple counting search.
virtual bool isSimpleCount() const = 0;
//! Check the pre-summarisation mode for this factory.
virtual model_t::ESummaryMode summaryMode() const = 0;
//! Get the default data type for models from this factory.
virtual maths_t::EDataType dataType() const = 0;
//! \name Customization by a specific search
//@{
//! Set the identifier of the search for which this generates models.
virtual void detectorIndex(int detectorIndex) = 0;
//! Set the record field names which will be modeled.
virtual void fieldNames(const std::string& partitionFieldName,
const std::string& overFieldName,
const std::string& byFieldName,
const std::string& valueFieldName,
const TStrVec& influenceFieldNames) = 0;
//! Set whether the model should process missing field values.
virtual void useNull(bool useNull) = 0;
//! Set the features which will be modeled.
virtual void features(const TFeatureVec& features) = 0;
//! Set the amount by which metric sample count is reduced for
//! fine-grained sampling when there is latency.
void sampleCountFactor(std::size_t sampleCountFactor);
//! Set whether the model should exclude frequent hitters from the
//! calculations.
void excludeFrequent(model_t::EExcludeFrequent excludeFrequent);
//! Set the detection rules for a detector.
//!
//! \note \p detectionRules is a reference wrapper; presumably the
//! referenced vector must outlive its use by this factory - confirm
//! in the implementation.
void detectionRules(TDetectionRuleVecCRef detectionRules);
//@}
//! Set the scheduled events.
//!
//! \note \p scheduledEvents is a reference wrapper; presumably the
//! referenced vector must outlive its use by this factory - confirm
//! in the implementation.
void scheduledEvents(TStrDetectionRulePrVecCRef scheduledEvents);
//! Set the interim bucket corrector.
//!
//! \warning The caller must ensure that \p interimBucketCorrector
//! outlives this object. If model factories are obtained from
//! CModelConfig this is ensured for you.
void interimBucketCorrector(const TInterimBucketCorrectorWPtr& interimBucketCorrector);
//! \name Customization
//@{
//! Set the learn rate used for initializing models.
void learnRate(double learnRate);
//! Set the decay rate used for initializing the models.
void decayRate(double decayRate);
//! Set the initial decay rate multiplier used for initializing
//! models.
void initialDecayRateMultiplier(double multiplier);
//! Set the maximum number of times we'll update a person's model
//! in a bucketing interval.
void maximumUpdatesPerBucket(double maximumUpdatesPerBucket);
//! Set the prune window scale factor minimum.
void pruneWindowScaleMinimum(double factor);
//! Set the prune window scale factor maximum.
void pruneWindowScaleMaximum(double factor);
//! Set the window length to use for multibucket features.
//!
//! \note A length of zero disables modeling of multibucket features altogether.
void multibucketFeaturesWindowLength(std::size_t length);
//! Set whether multivariate analysis of correlated 'by' fields should
//! be performed.
void multivariateByFields(bool enabled);
//! Set the minimum mode fraction used for initializing the models.
void minimumModeFraction(double minimumModeFraction);
//! Set the minimum mode count used for initializing the models.
void minimumModeCount(double minimumModeCount);
//! Set the number of points we'll use to model each of the seasonal
//! components in the data.
void componentSize(std::size_t componentSize);
//@}
//! Update the bucket length.
void updateBucketLength(core_t::TTime length);
//! Set whether model annotations should be reported.
void annotationsEnabled(bool enabled);
//! Get global model configuration parameters.
const SModelParams& modelParams() const;
//! Get the minimum mode fraction used for initializing the models.
double minimumModeFraction() const;
//! Get the minimum mode count used for initializing the models.
double minimumModeCount() const;
//! Get the number of points to use for approximating each seasonal
//! component.
std::size_t componentSize() const;
//! Get the minimum seasonal variance scale, specific to the model.
virtual double minimumSeasonalVarianceScale() const = 0;
protected:
using TMultivariatePriorUPtrVec = std::vector<TMultivariatePriorUPtr>;
using TOptionalSearchKey = std::optional<CSearchKey>;
protected:
//! Get the singleton interim bucket correction calculator.
//!
//! \note This returns a shared pointer although the factory stores
//! a weak pointer. Presumably the result is null if the corrector
//! has already been destroyed - confirm in the implementation.
TInterimBucketCorrectorPtr interimBucketCorrector() const;
//! Get a multivariate normal prior with dimension \p dimension.
//!
//! \param[in] dimension The dimension.
//! \param[in] params The model parameters.
//! \warning Up to ten dimensions are supported.
TMultivariatePriorUPtr multivariateNormalPrior(std::size_t dimension,
const SModelParams& params) const;
//! Get a multivariate multimodal prior with dimension \p dimension.
//!
//! \param[in] dimension The dimension.
//! \param[in] params The model parameters.
//! \param[in] modePrior The prior to use for the modes (presumably
//! as the template for each mode - confirm in the implementation).
//! \warning Up to ten dimensions are supported.
TMultivariatePriorUPtr
multivariateMultimodalPrior(std::size_t dimension,
const SModelParams& params,
const maths::common::CMultivariatePrior& modePrior) const;
//! Get a multivariate 1-of-n prior with dimension \p dimension.
//!
//! \param[in] dimension The dimension.
//! \param[in] params The model parameters.
//! \param[in] models The component models to select between.
TMultivariatePriorUPtr multivariateOneOfNPrior(std::size_t dimension,
const SModelParams& params,
const TMultivariatePriorUPtrVec& models) const;
//! Get the default prior for time-of-day and time-of-week modeling.
//! This is just a mixture of normals which allows more modes than
//! we typically do.
//!
//! \param[in] params The model parameters.
TPriorPtr timeOfDayPrior(const SModelParams& params) const;
//! Get the default prior for latitude and longitude modeling.
//! This is just a mixture of correlate normals which allows more
//! modes than we typically do.
//!
//! \param[in] params The model parameters.
TMultivariatePriorUPtr latLongPrior(const SModelParams& params) const;
private:
// Map types for the caches below. The first two are keyed by a feature
// collection; the third by an (influencer name, feature collection) pair.
using TFeatureVecMathsModelMap = std::map<TFeatureVec, TFeatureMathsModelPtrPrVec>;
using TFeatureVecMultivariatePriorMap =
std::map<TFeatureVec, TFeatureMultivariatePriorSPtrPrVec>;
using TStrFeatureVecPr = std::pair<std::string, TFeatureVec>;
using TStrFeatureVecPrInfluenceCalculatorCPtrMap =
std::map<TStrFeatureVecPr, TFeatureInfluenceCalculatorCPtrPrVec, maths::common::COrderings::SLess>;
private:
//! Get the field values which partition the data for modeling.
virtual TStrCRefVec partitioningFields() const = 0;
private:
//! The global model configuration parameters.
SModelParams m_ModelParams;
//! A reference to the singleton interim bucket correction calculator.
//!
//! \note It is the responsibility of the user of the factory class
//! to ensure that the interim bucket corrector is not deleted whilst
//! still in use. We store it here by weak pointer since we don't want
//! this to update the reference count so we properly account for its
//! memory usage in the objects this creates.
TInterimBucketCorrectorWPtr m_InterimBucketCorrector;
//! A cache of models for collections of features.
//!
//! \note The caches are mutable so that const member functions can
//! modify them.
mutable TFeatureVecMathsModelMap m_MathsModelCache;
//! A cache of priors for correlate pairs of collections of features.
mutable TFeatureVecMultivariatePriorMap m_CorrelatePriorCache;
//! A cache of influence calculators for collections of features.
mutable TStrFeatureVecPrInfluenceCalculatorCPtrMap m_InfluenceCalculatorCache;
};
}
}
#endif // INCLUDED_ml_model_CModelFactory_h