include/api/CAnomalyJobConfig.h (430 lines of code) (raw):
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#ifndef INCLUDED_ml_api_CAnomalyJobConfig_h
#define INCLUDED_ml_api_CAnomalyJobConfig_h
#include <core/CLogger.h>
#include <api/CDetectionRulesJsonParser.h>
#include <api/ImportExport.h>
#include <model/CLimits.h>
#include <model/FunctionTypes.h>
#include <boost/unordered_map.hpp>
#include <set>
#include <string>
#include <vector>
class CTestAnomalyJob;
namespace json = boost::json;
namespace ml {
namespace model {
class CAnomalyDetectorModelConfig;
}
namespace api {
//! \brief A parser to convert JSON configuration of an anomaly job JSON into an object
class API_EXPORT CAnomalyJobConfig {
public:
using TStrDetectionRulePr = std::pair<std::string, model::CDetectionRule>;
using TStrDetectionRulePrVec = std::vector<TStrDetectionRulePr>;
public:
class API_EXPORT CAnalysisConfig {
public:
class API_EXPORT CDetectorConfig {
public:
static const std::string DETECTOR_RULES;
static const std::string FUNCTION;
static const std::string FIELD_NAME;
static const std::string BY_FIELD_NAME;
static const std::string OVER_FIELD_NAME;
static const std::string PARTITION_FIELD_NAME;
static const std::string DETECTOR_DESCRIPTION;
static const std::string DETECTOR_INDEX;
static const std::string EXCLUDE_FREQUENT;
static const std::string CUSTOM_RULES;
static const std::string USE_NULL;
static const std::string ALL_TOKEN;
static const std::string BY_TOKEN;
static const std::string NONE_TOKEN;
static const std::string OVER_TOKEN;
//! Strings that define the type of analysis to run
static const std::string FUNCTION_COUNT;
static const std::string FUNCTION_COUNT_ABBREV;
static const std::string FUNCTION_LOW_COUNT;
static const std::string FUNCTION_LOW_COUNT_ABBREV;
static const std::string FUNCTION_HIGH_COUNT;
static const std::string FUNCTION_HIGH_COUNT_ABBREV;
static const std::string FUNCTION_DISTINCT_COUNT;
static const std::string FUNCTION_DISTINCT_COUNT_ABBREV;
static const std::string FUNCTION_LOW_DISTINCT_COUNT;
static const std::string FUNCTION_LOW_DISTINCT_COUNT_ABBREV;
static const std::string FUNCTION_HIGH_DISTINCT_COUNT;
static const std::string FUNCTION_HIGH_DISTINCT_COUNT_ABBREV;
static const std::string FUNCTION_NON_ZERO_COUNT;
static const std::string FUNCTION_NON_ZERO_COUNT_ABBREV;
static const std::string FUNCTION_RARE_NON_ZERO_COUNT;
static const std::string FUNCTION_RARE_NON_ZERO_COUNT_ABBREV;
static const std::string FUNCTION_RARE;
static const std::string FUNCTION_RARE_COUNT;
static const std::string FUNCTION_FREQ_RARE;
static const std::string FUNCTION_FREQ_RARE_ABBREV;
static const std::string FUNCTION_FREQ_RARE_COUNT;
static const std::string FUNCTION_FREQ_RARE_COUNT_ABBREV;
static const std::string FUNCTION_LOW_NON_ZERO_COUNT;
static const std::string FUNCTION_LOW_NON_ZERO_COUNT_ABBREV;
static const std::string FUNCTION_HIGH_NON_ZERO_COUNT;
static const std::string FUNCTION_HIGH_NON_ZERO_COUNT_ABBREV;
static const std::string FUNCTION_INFO_CONTENT;
static const std::string FUNCTION_LOW_INFO_CONTENT;
static const std::string FUNCTION_HIGH_INFO_CONTENT;
static const std::string FUNCTION_METRIC;
static const std::string FUNCTION_AVERAGE;
static const std::string FUNCTION_MEAN;
static const std::string FUNCTION_LOW_MEAN;
static const std::string FUNCTION_HIGH_MEAN;
static const std::string FUNCTION_LOW_AVERAGE;
static const std::string FUNCTION_HIGH_AVERAGE;
static const std::string FUNCTION_MEDIAN;
static const std::string FUNCTION_LOW_MEDIAN;
static const std::string FUNCTION_HIGH_MEDIAN;
static const std::string FUNCTION_MIN;
static const std::string FUNCTION_MAX;
static const std::string FUNCTION_VARIANCE;
static const std::string FUNCTION_LOW_VARIANCE;
static const std::string FUNCTION_HIGH_VARIANCE;
static const std::string FUNCTION_SUM;
static const std::string FUNCTION_LOW_SUM;
static const std::string FUNCTION_HIGH_SUM;
static const std::string FUNCTION_NON_NULL_SUM;
static const std::string FUNCTION_NON_NULL_SUM_ABBREV;
static const std::string FUNCTION_LOW_NON_NULL_SUM;
static const std::string FUNCTION_LOW_NON_NULL_SUM_ABBREV;
static const std::string FUNCTION_HIGH_NON_NULL_SUM;
static const std::string FUNCTION_HIGH_NON_NULL_SUM_ABBREV;
static const std::string FUNCTION_TIME_OF_DAY;
static const std::string FUNCTION_TIME_OF_WEEK;
static const std::string FUNCTION_LAT_LONG;
static const std::string FUNCTION_MAX_VELOCITY;
static const std::string FUNCTION_MIN_VELOCITY;
static const std::string FUNCTION_MEAN_VELOCITY;
static const std::string FUNCTION_SUM_VELOCITY;
public:
CDetectorConfig() {}
// Convenience ctor intended for use by the unit tests only
CDetectorConfig(const std::string& functionName,
const std::string& fieldName,
const std::string& byFieldName,
const std::string& overFieldName,
const std::string& partitionFieldName)
: m_FunctionName(functionName),
m_FieldName(fieldName), m_ByFieldName{byFieldName},
m_OverFieldName{overFieldName}, m_PartitionFieldName{partitionFieldName} {
this->determineFunction(false);
this->decipherExcludeFrequentSetting();
}
void parse(const json::value& detectorConfig,
const CDetectionRulesJsonParser::TStrPatternSetUMap& ruleFilters,
bool haveSummaryCountField,
int detectorIndex,
CDetectionRulesJsonParser::TDetectionRuleVec& detectionRules);
int detectorIndex() const { return m_DetectorIndex; }
std::string functionName() const { return m_FunctionName; }
model::function_t::EFunction function() const { return m_Function; }
std::string fieldName() const { return m_FieldName; }
std::string byFieldName() const { return m_ByFieldName; }
std::string overFieldName() const { return m_OverFieldName; }
std::string partitionFieldName() const {
return m_PartitionFieldName;
}
model_t::EExcludeFrequent excludeFrequent() const;
std::string detectorDescription() const {
return m_DetectorDescription;
}
bool useNull() const { return m_UseNull; }
bool isPopulation() const {
return m_OverFieldName.empty() == false;
}
private:
bool determineFunction(bool haveSummaryCountField);
bool decipherExcludeFrequentSetting();
private:
std::string m_FunctionName{};
model::function_t::EFunction m_Function;
std::string m_FieldName{};
std::string m_ByFieldName{};
std::string m_OverFieldName{};
std::string m_PartitionFieldName{};
std::string m_ExcludeFrequent{};
bool m_ByHasExcludeFrequent{false};
bool m_OverHasExcludeFrequent{false};
std::string m_DetectorDescription{};
int m_DetectorIndex{};
bool m_UseNull{false};
};
public:
static const std::string BUCKET_SPAN;
static const std::string MODEL_PRUNE_WINDOW;
static const std::string SUMMARY_COUNT_FIELD_NAME;
static const std::string CATEGORIZATION_FIELD_NAME;
static const std::string CATEGORIZATION_FILTERS;
static const std::string DETECTORS;
static const std::string INFLUENCERS;
static const std::string LATENCY;
static const std::string MULTIVARIATE_BY_FIELDS;
static const std::string PER_PARTITION_CATEGORIZATION;
static const std::string ENABLED;
static const std::string STOP_ON_WARN;
static const core_t::TTime DEFAULT_BUCKET_SPAN;
static const core_t::TTime DEFAULT_LATENCY;
static const std::string CLEAR;
static const char SUFFIX_SEPARATOR;
static const std::string SCHEDULED_EVENT_PREFIX;
static const std::string DESCRIPTION_SUFFIX;
static const std::string RULES_SUFFIX;
public:
using TStrVec = std::vector<std::string>;
using TDetectorConfigVec = std::vector<CDetectorConfig>;
using TIntDetectionRuleVecUMap =
boost::unordered_map<int, CDetectionRulesJsonParser::TDetectionRuleVec>;
//! Used to maintain a list of all unique config keys
using TIntSet = std::set<int>;
public:
//! Default constructor
CAnalysisConfig() {}
//! Construct with just a categorization field. (In the case of a
//! categorization job, this is all that is needed for this config.)
CAnalysisConfig(const std::string& categorizationFieldName)
: m_CategorizationFieldName{categorizationFieldName} {}
void init(const CDetectionRulesJsonParser::TStrPatternSetUMap& ruleFilters,
const TStrDetectionRulePrVec& scheduledEvents) {
m_RuleFilters = ruleFilters;
m_ScheduledEvents = scheduledEvents;
}
void initRuleFilters(const CDetectionRulesJsonParser::TStrPatternSetUMap& ruleFilters) {
// Update or insert values that are in the new map - we never delete filters at this level.
// Note that we can't simply assign "m_RuleFilters = ruleFilters", as that would result in
// the pattern set objects being destroyed and, as they are referenced by the anomaly detector models,
// this is a bad thing.
for (const auto& kv : ruleFilters) {
CDetectionRulesJsonParser::TStrPatternSetUMap::iterator itr =
m_RuleFilters.find(kv.first);
if (itr != m_RuleFilters.end()) {
itr->second = kv.second;
} else {
m_RuleFilters.insert(kv);
}
}
}
void initScheduledEvents(const TStrDetectionRulePrVec& scheduledEvents) {
m_ScheduledEvents = scheduledEvents;
}
//! Parse a JSON value representing an entire analysis config object.
void parse(const json::value& json);
core_t::TTime bucketSpan() const { return m_BucketSpan; }
//! Return the size of the model prune window expressed as a whole number of seconds.
//! Note that throughout the code the model prune window may sometimes be expressed in
//! seconds, as here, and sometimes as number of buckets. Where any doubt exists
//! a comment will explain which is in use.
core_t::TTime modelPruneWindow() const { return m_ModelPruneWindow; }
std::string summaryCountFieldName() const {
return m_SummaryCountFieldName;
}
std::string categorizationFieldName() const {
return m_CategorizationFieldName;
}
std::string categorizationPartitionFieldName() const {
return m_CategorizationPartitionFieldName;
}
const TStrVec& categorizationFilters() const {
return m_CategorizationFilters;
}
bool perPartitionCategorizationEnabled() const {
return m_PerPartitionCategorizationEnabled;
}
bool perPartitionCategorizationStopOnWarn() const {
return m_PerPartitionCategorizationStopOnWarn;
}
const TDetectorConfigVec& detectorsConfig() const {
return m_Detectors;
}
const TStrVec& influencers() const { return m_Influencers; }
core_t::TTime latency() const { return m_Latency; }
bool multivariateByFields() const { return m_MultivariateByFields; }
const TIntDetectionRuleVecUMap& detectionRules() const {
return m_DetectorRules;
}
const CDetectionRulesJsonParser::TStrPatternSetUMap& ruleFilters() const {
return m_RuleFilters;
}
//! Get the scheduled events
const TStrDetectionRulePrVec& scheduledEvents() const {
return m_ScheduledEvents;
}
ml::model::CAnomalyDetectorModelConfig makeModelConfig() const;
static core_t::TTime durationSeconds(const std::string& durationString,
core_t::TTime defaultDuration);
bool parseRules(int detectorIndex, const std::string& rules);
bool parseRules(int detectorIndex, const json::value& rules);
bool parseRulesUpdate(const json::value& rulesUpdateConfig);
private:
// Convenience method intended for use by the unit tests only
void addDetector(const std::string& functionName,
const std::string& fieldName,
const std::string& byFieldName,
const std::string& overFieldName,
const std::string& partitionFieldName,
const TStrVec& influencers,
const std::string& summaryCountFieldName) {
m_Influencers = influencers;
m_SummaryCountFieldName = summaryCountFieldName;
m_Detectors.emplace_back(functionName, fieldName, byFieldName,
overFieldName, partitionFieldName);
}
bool parseRules(CDetectionRulesJsonParser::TDetectionRuleVec& detectionRules,
const json::value& rules);
bool parseRules(CDetectionRulesJsonParser::TDetectionRuleVec& detectionRules,
const std::string& rules);
void parseDetectorsConfig(const json::value& detectorsConfig);
private:
core_t::TTime m_BucketSpan{DEFAULT_BUCKET_SPAN};
//! The size of the model prune window expressed as a whole number of seconds.
core_t::TTime m_ModelPruneWindow{0};
std::string m_SummaryCountFieldName{};
std::string m_CategorizationFieldName{};
std::string m_CategorizationPartitionFieldName{};
TStrVec m_CategorizationFilters{};
bool m_PerPartitionCategorizationEnabled{false};
bool m_PerPartitionCategorizationStopOnWarn{false};
TDetectorConfigVec m_Detectors{};
TStrVec m_Influencers{};
core_t::TTime m_Latency{DEFAULT_LATENCY};
bool m_MultivariateByFields{false};
//! The detection rules per detector index.
TIntDetectionRuleVecUMap m_DetectorRules{};
//! The filters per id used by categorical rule conditions.
CDetectionRulesJsonParser::TStrPatternSetUMap m_RuleFilters{};
//! The scheduled events (events apply to all detectors).
//! Events consist of a description and a detection rule
TStrDetectionRulePrVec m_ScheduledEvents{};
std::string m_AnalysisConfigString;
friend class ::CTestAnomalyJob;
};
class API_EXPORT CDataDescription {
public:
static const std::string TIME_FIELD;
// The time format present in the job config is in Java format and
// should be ignored in the C++ code. In any case, in production, the
// time field is always specified in seconds since epoch.
static const std::string TIME_FORMAT;
static const std::string DEFAULT_TIME_FIELD;
public:
//! Default constructor
CDataDescription() {}
void parse(const json::value& json);
std::string timeField() const { return m_TimeField; }
private:
std::string m_TimeField; // e.g. timestamp
std::string m_TimeFormat; // e.g. epoch_ms
};
class API_EXPORT CModelPlotConfig {
public:
static const std::string ANNOTATIONS_ENABLED;
static const std::string ENABLED;
static const std::string TERMS;
public:
//! Default constructor
CModelPlotConfig() {}
void parse(const json::value& modelPlotConfig);
bool annotationsEnabled() const { return m_AnnotationsEnabled; }
bool enabled() const { return m_Enabled; }
// The terms string is a comma separated list of partition or
// by field values, but with no form of escaping.
// TODO improve this to be a more robust format
std::string terms() const { return m_Terms; }
private:
bool m_AnnotationsEnabled{false};
bool m_Enabled{false};
std::string m_Terms;
};
class API_EXPORT CAnalysisLimits {
public:
static const std::string MODEL_MEMORY_LIMIT;
static const std::string CATEGORIZATION_EXAMPLES_LIMIT;
static const std::size_t DEFAULT_MEMORY_LIMIT_BYTES;
public:
//! Default constructor
CAnalysisLimits() {}
void parse(const json::value& analysLimits);
//! Size of the memory limit for the resource monitor
//! as a whole number of MB.
std::size_t modelMemoryLimitMb() const { return m_ModelMemoryLimitMb; }
std::size_t categorizationExamplesLimit() const {
return m_CategorizationExamplesLimit;
}
static std::size_t modelMemoryLimitMb(const std::string& memoryLimitStr);
private:
std::size_t m_CategorizationExamplesLimit{model::CLimits::DEFAULT_RESULTS_MAX_EXAMPLES};
std::size_t m_ModelMemoryLimitMb{model::CResourceMonitor::DEFAULT_MEMORY_LIMIT_MB};
};
class API_EXPORT CFilterConfig {
public:
static const std::string FILTER_ID;
static const std::string ITEMS;
public:
CFilterConfig() {}
void parse(const json::value& filterConfig,
CDetectionRulesJsonParser::TStrPatternSetUMap& ruleFilters);
std::string name() const { return m_FilterName; }
CAnomalyJobConfig::CAnalysisConfig::TStrVec filterList() const {
return m_FilterList;
}
private:
using TStrVec = std::vector<std::string>;
std::string m_FilterName;
TStrVec m_FilterList;
};
class API_EXPORT CEventConfig {
public:
static const std::string DESCRIPTION;
static const std::string RULES;
public:
void parse(const json::value& filterConfig,
const CDetectionRulesJsonParser::TStrPatternSetUMap& ruleFilters,
TStrDetectionRulePrVec& scheduledEvents);
private:
std::string m_Description;
CDetectionRulesJsonParser::TDetectionRuleVec m_DetectionRules;
};
public:
static const std::string JOB_ID;
static const std::string JOB_TYPE;
static const std::string ANALYSIS_CONFIG;
static const std::string DATA_DESCRIPTION;
static const std::string BACKGROUND_PERSIST_INTERVAL;
static const std::string MODEL_PLOT_CONFIG;
static const std::string ANALYSIS_LIMITS;
static const std::string FILTERS;
static const std::string EVENTS;
// Roughly how often should the quantiles be output when no
// anomalies are being detected? A staggering factor that varies by job is
// added to this.
static const core_t::TTime BASE_MAX_QUANTILE_INTERVAL;
// Roughly how often should the state be persisted? A staggering
// factor that varies by job is added to this.
static const core_t::TTime DEFAULT_BASE_PERSIST_INTERVAL;
public:
//! Default constructor
CAnomalyJobConfig() {}
explicit CAnomalyJobConfig(const std::string& categorizationFieldName)
: m_AnalysisConfig(categorizationFieldName) {}
bool readFile(const std::string& fileName, std::string& fileContents);
bool initFromFile(const std::string& configFile);
bool initFromFiles(const std::string& configFile,
const std::string& filtersConfigFile,
const std::string& eventsConfigFile);
bool parse(const std::string& json);
bool parseFilterConfig(const std::string& json);
bool parseEventConfig(const std::string& json);
// Generate a random time of up to 1 hour to be added to intervals at which we
// perform periodic operations. This means that when there are many jobs
// there is a certain amount of staggering of their periodic operations.
// A given job will always be given the same staggering interval.
core_t::TTime intervalStagger();
void initRuleFilters() { m_AnalysisConfig.initRuleFilters(m_RuleFilters); }
void initScheduledEvents() {
m_AnalysisConfig.initScheduledEvents(m_ScheduledEvents);
}
std::string jobId() const { return m_JobId; }
std::string jobType() const { return m_JobType; }
const CDetectionRulesJsonParser::TStrPatternSetUMap& ruleFilters() const {
return m_RuleFilters;
}
CDetectionRulesJsonParser::TStrPatternSetUMap& ruleFilters() {
return m_RuleFilters;
}
const TStrDetectionRulePrVec& scheduledEvents() const {
return m_ScheduledEvents;
}
CAnalysisConfig& analysisConfig() { return m_AnalysisConfig; }
const CAnalysisConfig& analysisConfig() const { return m_AnalysisConfig; }
const CDataDescription& dataDescription() const {
return m_DataDescription;
}
const CModelPlotConfig& modelPlotConfig() const { return m_ModelConfig; }
CModelPlotConfig& modelPlotConfig() { return m_ModelConfig; }
const CAnalysisLimits& analysisLimits() const { return m_AnalysisLimits; }
bool isInitialized() const { return m_IsInitialized; }
core_t::TTime persistInterval() const {
return m_BackgroundPersistInterval;
}
core_t::TTime quantilePersistInterval() const {
return m_MaxQuantilePersistInterval;
}
private:
std::string m_JobId;
std::string m_JobType;
bool m_IsInitialized{false};
CDetectionRulesJsonParser::TStrPatternSetUMap m_RuleFilters;
TStrDetectionRulePrVec m_ScheduledEvents;
CAnalysisConfig m_AnalysisConfig;
CDataDescription m_DataDescription;
CModelPlotConfig m_ModelConfig;
CAnalysisLimits m_AnalysisLimits;
std::vector<CFilterConfig> m_Filters{};
std::vector<CEventConfig> m_Events{};
core_t::TTime m_BackgroundPersistInterval{DEFAULT_BASE_PERSIST_INTERVAL};
core_t::TTime m_MaxQuantilePersistInterval{BASE_MAX_QUANTILE_INTERVAL};
};
}
}
#endif // INCLUDED_ml_api_CAnomalyJobConfig_h