include/model/CAnomalyDetectorModelConfig.h
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#ifndef INCLUDED_ml_model_CAnomalyDetectorModelConfig_h
#define INCLUDED_ml_model_CAnomalyDetectorModelConfig_h
#include <core/CoreTypes.h>
#include <model/CSearchKey.h>
#include <model/FunctionTypes.h>
#include <model/ImportExport.h>
#include <model/ModelTypes.h>
#include <boost/property_tree/ptree_fwd.hpp>
#include <boost/unordered_map.hpp>
#include <cstddef>
#include <map>
#include <memory>
#include <set>
#include <utility>
#include <vector>
namespace ml {
namespace model {
class CDetectionRule;
class CInterimBucketCorrector;
class CSearchKey;
class CModelFactory;
//! \brief Responsible for configuring anomaly detection models.
//!
//! DESCRIPTION:\n
//! Responsible for configuring classes for performing anomaly detection.
//! It also defines all parameter defaults.
//!
//! IMPLEMENTATION DECISIONS:\n
//! This wraps up the configuration of anomaly detection to encapsulate
//! the details from calling code. It is anticipated that:
//! -# Some of this information will be exposed to the user via a
//! configuration file,
//! -# Some may be calculated from data characteristics and so on.
class MODEL_EXPORT CAnomalyDetectorModelConfig {
public:
//! The possible factory types.
enum EFactoryType {
E_EventRateFactory = 0,
E_MetricFactory = 1,
E_EventRatePopulationFactory = 2,
E_MetricPopulationFactory = 3,
E_CountingFactory = 4,
E_UnknownFactory,
E_BadFactory
};
using TStrSet = std::set<std::string>;
using TSizeVec = std::vector<std::size_t>;
using TTimeVec = std::vector<core_t::TTime>;
using TTimeVecCItr = TTimeVec::const_iterator;
using TDoubleDoublePr = std::pair<double, double>;
using TDoubleDoublePrVec = std::vector<TDoubleDoublePr>;
using TFeatureVec = model_t::TFeatureVec;
using TStrVec = std::vector<std::string>;
using TStrVecCItr = TStrVec::const_iterator;
using TInterimBucketCorrectorPtr = std::shared_ptr<CInterimBucketCorrector>;
using TModelFactoryPtr = std::shared_ptr<CModelFactory>;
using TModelFactoryCPtr = std::shared_ptr<const CModelFactory>;
using TFactoryTypeFactoryPtrMap = std::map<EFactoryType, TModelFactoryPtr>;
using TFactoryTypeFactoryPtrMapItr = TFactoryTypeFactoryPtrMap::iterator;
using TFactoryTypeFactoryPtrMapCItr = TFactoryTypeFactoryPtrMap::const_iterator;
using TSearchKeyFactoryCPtrMap = std::map<CSearchKey, TModelFactoryCPtr>;
// Const ref to detection rules map
using TDetectionRuleVec = std::vector<CDetectionRule>;
using TDetectionRuleVecCRef = std::reference_wrapper<const TDetectionRuleVec>;
using TIntDetectionRuleVecUMap = boost::unordered_map<int, TDetectionRuleVec>;
using TIntDetectionRuleVecUMapCRef = std::reference_wrapper<const TIntDetectionRuleVecUMap>;
using TIntDetectionRuleVecUMapCItr = TIntDetectionRuleVecUMap::const_iterator;
using TStrDetectionRulePr = std::pair<std::string, model::CDetectionRule>;
using TStrDetectionRulePrVec = std::vector<TStrDetectionRulePr>;
using TStrDetectionRulePrVecCRef = std::reference_wrapper<const TStrDetectionRulePrVec>;
public:
//! \name Data Gathering
//@{
//! The default value used to separate components of a multivariate feature
//! in its string value.
static const std::string DEFAULT_MULTIVARIATE_COMPONENT_DELIMITER;
//! Bucket length if none is specified on the command line.
static const core_t::TTime DEFAULT_BUCKET_LENGTH;
//! Default maximum number of buckets for receiving out of order records.
static const std::size_t DEFAULT_LATENCY_BUCKETS;
//! Default amount by which metric sample count is reduced for fine-grained
//! sampling when there is no latency.
static const std::size_t DEFAULT_SAMPLE_COUNT_FACTOR_NO_LATENCY;
//! Default amount by which metric sample count is reduced for fine-grained
//! sampling when there is latency.
static const std::size_t DEFAULT_SAMPLE_COUNT_FACTOR_WITH_LATENCY;
//! Default amount by which the metric sample queue expands when it is full.
static const double DEFAULT_SAMPLE_QUEUE_GROWTH_FACTOR;
//! Bucket length corresponding to the default decay and learn rates.
static const core_t::TTime STANDARD_BUCKET_LENGTH;
//@}
//! \name Modelling
//@{
//! The default rate at which the model priors decay to non-informative
//! per standard bucket length.
static const double DEFAULT_DECAY_RATE;
//! The initial rate, as a multiple of the default decay rate, at which
//! the model priors decay to non-informative per standard bucket length.
static const double DEFAULT_INITIAL_DECAY_RATE_MULTIPLIER;
//! The rate at which information accrues in the model per standard
//! bucket length elapsed.
static const double DEFAULT_LEARN_RATE;
//! The default minimum permitted fraction of points in a distribution
//! mode for individual modeling.
static const double DEFAULT_INDIVIDUAL_MINIMUM_MODE_FRACTION;
//! The default minimum permitted fraction of points in a distribution
//! mode for population modeling.
static const double DEFAULT_POPULATION_MINIMUM_MODE_FRACTION;
//! The default minimum count we'll permit in a cluster.
static const double DEFAULT_MINIMUM_CLUSTER_SPLIT_COUNT;
//! The default proportion of initial count at which we'll delete a
//! category from the sketch used for clustering.
static const double DEFAULT_CATEGORY_DELETE_FRACTION;
//! The default size of the seasonal components we will model.
static const std::size_t DEFAULT_COMPONENT_SIZE;
//! The default minimum time to detect a change point in a time series.
static const core_t::TTime DEFAULT_MINIMUM_TIME_TO_DETECT_CHANGE;
//! The default maximum time to test for a change point in a time series.
static const core_t::TTime DEFAULT_MAXIMUM_TIME_TO_TEST_FOR_CHANGE;
//! The default number of time buckets used to generate multibucket features
//! for anomaly detection.
static const std::size_t MULTIBUCKET_FEATURES_WINDOW_LENGTH;
//! The maximum value that the multi_bucket_impact can take.
static const double MAXIMUM_MULTI_BUCKET_IMPACT_MAGNITUDE;
//! The default maximum number of times we'll update a model in a bucketing
//! interval. This only applies to our metric statistics, which are
//! computed on a fixed number of measurements rather than a fixed
//! time interval. A value of zero implies no constraint.
static const double DEFAULT_MAXIMUM_UPDATES_PER_BUCKET;
//! The default minimum influence at which an influencing field value
//! is judged to have any influence on a feature value.
static const double DEFAULT_INFLUENCE_CUTOFF;
//! The default scale factor of the decayRate that determines the minimum
//! size of the sliding prune window for purging older entries from the
//! model.
static const double DEFAULT_PRUNE_WINDOW_SCALE_MINIMUM;
//! The default scale factor of the decayRate that determines the maximum
//! size of the sliding prune window for purging older entries from the
//! model.
static const double DEFAULT_PRUNE_WINDOW_SCALE_MAXIMUM;
//! The default factor increase in priors used to model correlations.
static const double DEFAULT_CORRELATION_MODELS_OVERHEAD;
//! The default threshold for the Pearson correlation coefficient at
//! which a correlate will be modeled.
static const double DEFAULT_MINIMUM_SIGNIFICANT_CORRELATION;
//@}
//! \name Anomaly Score Calculation
//@{
//! The default values for the aggregation styles' parameters.
static const double DEFAULT_AGGREGATION_STYLE_PARAMS[model_t::NUMBER_AGGREGATION_STYLES][model_t::NUMBER_AGGREGATION_PARAMS];
//! The default maximum probability which is deemed to be anomalous.
static const double DEFAULT_MAXIMUM_ANOMALOUS_PROBABILITY;
//@}
//! \name Anomaly Score Normalization
//@{
//! The default historic anomaly score percentile for which lower
//! values are classified as noise.
static const double DEFAULT_NOISE_PERCENTILE;
//! The default multiplier applied to the noise level score in
//! order to be classified as anomalous.
static const double DEFAULT_NOISE_MULTIPLIER;
//! We use a piecewise linear mapping between the raw anomaly score
//! and the normalized anomaly score with these default knot points.
//! In particular, if we define the percentile of a raw score \f$s\f$
//! as \f$f_q(s)\f$ and \f$a = \max\{x \le f_q(s)\}\f$ and
//! \f$b = \min\{x \ge f_q(s)\}\f$ where \f$x\f$ ranges over the knot
//! point X-values, then the normalized score is:\n
//! <pre class="fragment">
//! \f$\displaystyle \bar{s} = y(a) + \frac{(y(b) - y(a))(f_q(s) - a)}{b - a}\f$
//! </pre>
//! Here, \f$y(\cdot)\f$ denotes the knot point Y-value corresponding to
//! a given X-value.
static const TDoubleDoublePr DEFAULT_NORMALIZED_SCORE_KNOT_POINTS[9];
//@}
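// Illustration only: a minimal sketch (not part of this interface) of the
// piecewise linear knot point interpolation described above. The helper name
// interpolateKnotPoints is hypothetical and the knots are assumed to be
// sorted by their X-values; the real mapping uses
// DEFAULT_NORMALIZED_SCORE_KNOT_POINTS or the points supplied via
// normalizedScoreKnotPoints().
//
//   double interpolateKnotPoints(const TDoubleDoublePrVec& knots, double fq) {
//       // Find the first knot whose X-value is not less than the raw
//       // score percentile fq.
//       auto b = std::lower_bound(knots.begin(), knots.end(),
//                                 TDoubleDoublePr{fq, 0.0});
//       if (b == knots.begin()) { return knots.front().second; }
//       if (b == knots.end())   { return knots.back().second; }
//       auto a = b - 1;
//       // Linearly interpolate between the bracketing knot points.
//       return a->second + (b->second - a->second) * (fq - a->first) /
//                          (b->first - a->first);
//   }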
public:
//! Create the default configuration.
//!
//! \param[in] bucketLength The bucketing interval length.
//! \param[in] summaryMode Indicates whether the data being gathered
//! are already summarized by an external aggregation process.
//! \param[in] summaryCountFieldName If \p summaryMode is E_Manual
//! then this is the name of the field holding the summary count.
//! \param[in] latency The amount of time records are buffered for, to
//! allow out-of-order records to be seen by the models in order.
//! \param[in] multivariateByFields Should multivariate analysis of
//! correlated 'by' fields be performed?
static CAnomalyDetectorModelConfig defaultConfig(core_t::TTime bucketLength,
model_t::ESummaryMode summaryMode,
const std::string& summaryCountFieldName,
core_t::TTime latency,
bool multivariateByFields);
//! Overload using defaults.
static CAnomalyDetectorModelConfig
defaultConfig(core_t::TTime bucketLength = DEFAULT_BUCKET_LENGTH,
model_t::ESummaryMode summaryMode = model_t::E_None,
const std::string& summaryCountFieldName = "") {
return defaultConfig(bucketLength, summaryMode, summaryCountFieldName,
DEFAULT_LATENCY_BUCKETS * bucketLength, false);
}
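// Usage sketch (illustrative only): the 900 second bucket length is an
// arbitrary example value; all remaining parameters take the defaults
// documented above.
//
//   CAnomalyDetectorModelConfig config =
//       CAnomalyDetectorModelConfig::defaultConfig(900);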
//! Get the factor to normalize all bucket lengths to the default
//! bucket length.
static double bucketNormalizationFactor(core_t::TTime bucketLength);
//! Get the decay rate to use for the time series decomposition given
//! the model decay rate \p modelDecayRate.
static double trendDecayRate(double modelDecayRate, core_t::TTime bucketLength);
public:
CAnomalyDetectorModelConfig();
//! Set the data bucketing interval.
void bucketLength(core_t::TTime length);
//! Set the single interim bucket correction calculator.
void interimBucketCorrector(const TInterimBucketCorrectorPtr& interimBucketCorrector);
//! Set whether to model multibucket features.
void useMultibucketFeatures(bool enabled);
//! Set whether multivariate analysis of correlated 'by' fields should
//! be performed.
void multivariateByFields(bool enabled);
//! Set the model factories.
void factories(const TFactoryTypeFactoryPtrMap& factories);
//! Set the style and parameter value for raw score aggregation.
bool aggregationStyleParams(model_t::EAggregationStyle style,
model_t::EAggregationParam param,
double value);
//! Set the maximum anomalous probability.
void maximumAnomalousProbability(double probability);
//! Set the noise level as a percentile of historic raw anomaly scores.
bool noisePercentile(double percentile);
//! Set the noise multiplier to use when derating normalized scores
//! based on the noise score level.
bool noiseMultiplier(double multiplier);
//! Set the normalized score knot points for the piecewise linear curve
//! between historic raw score percentiles and normalized scores.
bool normalizedScoreKnotPoints(const TDoubleDoublePrVec& points);
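// Usage sketch (illustrative only): the knot points shown are arbitrary
// example values, not the defaults; a raw score percentile of 0 maps to a
// normalized score of 0, 90 maps to 10 and 100 maps to 100. 'config' denotes
// a previously constructed CAnomalyDetectorModelConfig.
//
//   config.normalizedScoreKnotPoints({{0.0, 0.0}, {90.0, 10.0}, {100.0, 100.0}});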
//! Populate the parameters from a configuration file.
bool init(const std::string& configFile);
//! Populate the parameters from a configuration file, also retrieving
//! the raw property tree created from the config file. (The raw
//! property tree is only valid if the method returns true.)
bool init(const std::string& configFile, boost::property_tree::ptree& propTree);
//! Populate the parameters from a property tree.
bool init(const boost::property_tree::ptree& propTree);
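// Usage sketch (illustrative only): the file name "mlmodel.conf" is
// hypothetical. 'config' denotes a previously constructed
// CAnomalyDetectorModelConfig.
//
//   if (config.init("mlmodel.conf") == false) {
//       // Handle an invalid or unreadable configuration file.
//   }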
//! Get the factory for new models.
//!
//! \param[in] key The key of the detector for which the factory will be
//! used.
TModelFactoryCPtr factory(const CSearchKey& key) const;
//! Get the factory for new models.
//!
//! \param[in] identifier The identifier of the search for which to get a model
//! factory.
//! \param[in] function The function being invoked.
//! \param[in] useNull If true then we will process missing fields as if their
//! value is equal to the empty string where possible.
//! \param[in] excludeFrequent Whether to discard frequent results.
//! \param[in] partitionFieldName The name of the partition field.
//! \param[in] personFieldName The name of the over field.
//! \param[in] attributeFieldName The name of the by field.
//! \param[in] valueFieldName The name of the field containing metric values.
//! \param[in] influenceFieldNames The list of influence field names.
TModelFactoryCPtr
factory(int identifier,
function_t::EFunction function,
bool useNull = false,
model_t::EExcludeFrequent excludeFrequent = model_t::E_XF_None,
const std::string& partitionFieldName = std::string(),
const std::string& personFieldName = std::string(),
const std::string& attributeFieldName = std::string(),
const std::string& valueFieldName = std::string(),
const CSearchKey::TStrVec& influenceFieldNames = CSearchKey::TStrVec()) const;
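// Usage sketch (illustrative only): the detector identifier is arbitrary and
// function_t::E_IndividualCount is assumed to be one of the
// function_t::EFunction enumerators. 'config' denotes a previously constructed
// CAnomalyDetectorModelConfig; all unspecified arguments take their defaults.
//
//   TModelFactoryCPtr countFactory =
//       config.factory(1 /*identifier*/, function_t::E_IndividualCount);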
//! Set the rate at which the models lose information.
void decayRate(double value);
//! Get the rate at which the models lose information.
double decayRate() const;
//! Get the length of the baseline.
core_t::TTime baselineLength() const;
//! Get the bucket length.
core_t::TTime bucketLength() const;
//! Get the period of time at which to perform a potential prune of the models
//! expressed in number of seconds.
core_t::TTime modelPruneWindow() const;
//! Set the period of time at which to perform a potential prune of the models
//! expressed in number of seconds.
void modelPruneWindow(core_t::TTime modelPruneWindow);
//! Get the maximum latency in the arrival of out of order data.
core_t::TTime latency() const;
//! Get the maximum latency in the arrival of out of order data in
//! numbers of buckets.
std::size_t latencyBuckets() const;
//! Get the single interim bucket correction calculator.
const CInterimBucketCorrector& interimBucketCorrector() const;
//! Should multivariate analysis of correlated 'by' fields be performed?
bool multivariateByFields() const;
//! \name Model Plot
//@{
//! Configure modelPlotConfig params from file
bool configureModelPlot(const std::string& modelPlotConfigFile);
//! Configure modelPlotConfig params from a property tree
//! expected to contain three properties: 'boundsPercentile', 'annotationsEnabled'
//! and 'terms'
bool configureModelPlot(const boost::property_tree::ptree& propTree);
//! Configure modelPlotConfig params directly, from the three properties
//! 'modelPlotEnabled', 'annotationsEnabled' and 'terms'.
//! This initialisation method does not allow setting the value of the
//! 'boundsPercentile' property; instead, a default value is used when
//! 'modelPlotEnabled' is true and a value of -1.0 is used otherwise.
void configureModelPlot(bool modelPlotEnabled,
bool annotationsEnabled,
const std::string& terms);
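// Usage sketch (illustrative only): enable model plot with annotations and
// no term filtering. 'config' denotes a previously constructed
// CAnomalyDetectorModelConfig.
//
//   config.configureModelPlot(true /*modelPlotEnabled*/,
//                             true /*annotationsEnabled*/,
//                             "" /*terms*/);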
//! Set the central confidence interval for the model debug plot
//! to \p percentile.
//!
//! This controls upper and lower confidence interval error bars
//! returned by the model debug plot.
//! \note \p percentile should be in the range [0.0, 100.0).
void modelPlotBoundsPercentile(double percentile);
//! Get the central confidence interval for the model debug plot.
double modelPlotBoundsPercentile() const;
//! Is model plot enabled?
bool modelPlotEnabled() const;
//! Are annotations enabled for each of the models?
bool modelPlotAnnotationsEnabled() const;
//! Set terms (by, over, or partition field values) to filter
//! model debug data. When empty, no filtering is applied.
void modelPlotTerms(TStrSet terms);
//! Get the terms (by, over, or partition field values)
//! used to filter model debug data. Empty when no filtering applies.
const TStrSet& modelPlotTerms() const;
//@}
//! \name Anomaly Score Calculation
//@{
//! Get the value of the aggregation style parameter identified by
//! \p style and \p param.
double aggregationStyleParam(model_t::EAggregationStyle style,
model_t::EAggregationParam param) const;
//! Get the maximum anomalous probability.
double maximumAnomalousProbability() const;
//@}
//! \name Anomaly Score Normalization
//@{
//! Get the historic anomaly score percentile for which lower
//! values are classified as noise.
double noisePercentile() const;
//! Get the multiplier applied to the noise level score in order
//! to be classified as anomalous.
double noiseMultiplier() const;
//! Get the normalized anomaly score knot points.
const TDoubleDoublePrVec& normalizedScoreKnotPoints() const;
//@}
//! Sets the reference to the detection rules map
void detectionRules(TIntDetectionRuleVecUMapCRef detectionRules);
//! Sets the reference to the scheduled events vector
void scheduledEvents(TStrDetectionRulePrVecCRef scheduledEvents);
//! Process the stanza properties corresponding to \p stanzaName.
//!
//! \param[in] propertyTree The properties of the stanza called
//! \p stanzaName.
bool processStanza(const boost::property_tree::ptree& propertyTree);
//! Get the factor to normalize all bucket lengths to the default
//! bucket length.
double bucketNormalizationFactor() const;
//! The time window during which samples are accepted.
core_t::TTime samplingAgeCutoff() const;
private:
//! Bucket length.
core_t::TTime m_BucketLength{0};
//! Prune window length (in seconds)
core_t::TTime m_ModelPruneWindow{0};
//! Should multivariate analysis of correlated 'by' fields be performed?
bool m_MultivariateByFields{false};
//! The single interim bucket correction calculator.
TInterimBucketCorrectorPtr m_InterimBucketCorrector;
//! The new model factories for each data type.
TFactoryTypeFactoryPtrMap m_Factories;
//! A cache of customized factories requested from this config.
mutable TSearchKeyFactoryCPtrMap m_FactoryCache;
//! \name Model Plot
//@{
//! Is model plot enabled?
bool m_ModelPlotEnabled{false};
//! Are annotations enabled for each of the models?
bool m_ModelPlotAnnotationsEnabled{false};
//! The central confidence interval for the model debug plot.
double m_ModelPlotBoundsPercentile;
//! Terms (by, over, or partition field values) used to filter model
//! debug data. Empty when no filtering applies.
TStrSet m_ModelPlotTerms;
//@}
//! \name Anomaly Score Calculation
//@{
//! The values for the aggregation styles' parameters.
double m_AggregationStyleParams[model_t::NUMBER_AGGREGATION_STYLES][model_t::NUMBER_AGGREGATION_PARAMS];
//! The maximum probability which is deemed to be anomalous.
double m_MaximumAnomalousProbability;
//@}
//! \name Anomaly Score Normalization
//@{
//! The historic anomaly score percentile for which lower values
//! are classified as noise.
double m_NoisePercentile;
//! The multiplier applied to the noise level score in order to
//! be classified as anomalous.
double m_NoiseMultiplier;
//! We use a piecewise linear mapping between the raw anomaly score
//! and the normalized anomaly score with these knot points.
//! \see DEFAULT_NORMALIZED_SCORE_KNOT_POINTS for details.
TDoubleDoublePrVec m_NormalizedScoreKnotPoints;
//@}
//! A reference to the map containing detection rules per
//! detector key. Note that the owner of the map is CAnomalyJobConfig::CAnalysisConfig.
TIntDetectionRuleVecUMapCRef m_DetectionRules;
//! A reference to the vector of scheduled events.
//! The owner of the vector is CAnomalyJobConfig::CAnalysisConfig.
TStrDetectionRulePrVecCRef m_ScheduledEvents;
};
}
}
#endif // INCLUDED_ml_model_CAnomalyDetectorModelConfig_h