include/model/CHierarchicalResults.h (165 lines of code) (raw):
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the following additional limitation. Functionality enabled by the
* files subject to the Elastic License 2.0 may only be used in production when
* invoked by an Elasticsearch process with a license key installed that permits
* use of machine learning features. You may not use this file except in
* compliance with the Elastic License 2.0 and the foregoing additional
* limitation.
*/
#ifndef INCLUDED_ml_model_CHierarchicalResults_h
#define INCLUDED_ml_model_CHierarchicalResults_h
#include <core/CSmallVector.h>
#include <maths/common/COrderings.h>
#include <model/CAnnotatedProbability.h>
#include <model/FunctionTypes.h>
#include <model/ImportExport.h>
#include <model/ModelTypes.h>
#include <boost/unordered_map.hpp>
#include <cstddef>
#include <deque>
#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>
//! Forward declaration of the unit test which is declared a friend of
//! ml::model::CHierarchicalResultsVisitor later in this file.
namespace CHierarchicalResultsTest {
struct testShouldWritePartition;
}
namespace ml {
namespace model {
// Forward declarations: this header only uses these types via pointer
// (CAnomalyDetectorModel) or reference (CLimits).
class CAnomalyDetectorModel;
class CLimits;
namespace hierarchical_results_detail {
//! An optional string: the type of the field names and values held by
//! SResultSpec, any of which may be unset.
using TOptionalStr = std::optional<std::string>;
//! A collection of optional strings.
using TOptionalStrVec = std::vector<TOptionalStr>;
//! A pair of optional strings. CHierarchicalResults keys its pivot nodes
//! on this type (presumably as (influencer name, influencer value) -
//! TODO confirm against the implementation).
using TOptionalStrOptionalStrPr = std::pair<TOptionalStr, TOptionalStr>;
//! A pair of optional strings together with a double.
using TOptionalStrOptionalStrPrDoublePr = std::pair<TOptionalStrOptionalStrPr, double>;
//! A collection of (pair of optional strings, double) pairs.
using TOptionalStrOptionalStrPrDoublePrVec = std::vector<TOptionalStrOptionalStrPrDoublePr>;
//! A core::CSmallVector of strings with size parameter 1; used for the
//! scheduled event descriptions of a result.
using TStr1Vec = core::CSmallVector<std::string, 1>;
//! \brief The data fully describing a result node.
//!
//! DESCRIPTION:\n
//! A result is fully described by the complete set of field
//! names and values in a simple search. If one or more of
//! these are unset then the node corresponds to some aggregation
//! of simple results.
//!
//! A simple search corresponds to a single clause in the search
//! config file. It corresponds to the following command line
//! syntax:
//! <pre>
//! autodetect <function>[(X)] [by Y] [over Z]
//! </pre>
//!
//! So, examples include
//! -# autodetect count
//! -# autodetect count by status
//! -# autodetect sum(bytes) over host
//! -# autodetect rare by uri_path over clientip
//! -# and so on.
struct MODEL_EXPORT SResultSpec {
//! Default constructor (defined out of line).
SResultSpec();
//! Print the specification for debugging.
std::string print() const;
//! A unique identifier for the search's detector.
int s_Detector;
//! True if this is a simple counting detector result.
bool s_IsSimpleCount;
//! True if this is a population search result.
bool s_IsPopulation;
//! True if the model was configured to use null values.
bool s_UseNull;
//! The name of the partitioning field.
TOptionalStr s_PartitionFieldName;
//! The value of the partitioning field.
TOptionalStr s_PartitionFieldValue;
//! The person field name. This is the name of the field identifying
//! a person, i.e. the by field name if there is no over field
//! and the over field name otherwise.
TOptionalStr s_PersonFieldName;
//! The value of the person field if applicable or an empty string
//! otherwise.
//! NOTE(review): the member is an optional string, so "empty string"
//! may mean an unset optional - confirm against the implementation.
TOptionalStr s_PersonFieldValue;
//! The name of the field identifying the metric value if this is
//! metric analysis and an empty string otherwise.
//! NOTE(review): as above, confirm whether "empty" means an unset
//! optional or an empty string.
TOptionalStr s_ValueFieldName;
//! The optional function name. Only leaf nodes have populated
//! function names.
TOptionalStr s_FunctionName;
//! The "by" field name.
TOptionalStr s_ByFieldName;
//! The function identifier.
function_t::EFunction s_Function;
//! The list of scheduled event descriptions, if any occurred.
TStr1Vec s_ScheduledEventDescriptions;
};
//! \brief A node of the hierarchical results tree.
//!
//! DESCRIPTION:\n
//! A node of the hierarchical results tree. By default we build a tree
//! on top of our simple search results which allows us to obtain partial
//! aggregate results for each object of interest.
//!
//! For example, if the following searches are run:
//! - autodetect count by host
//! - autodetect sum(bytes) over host
//! - autodetect rare(process) by host
//!
//! The common field of interest is "host" and we obtain aggregate
//! results for each host as well as an overall aggregate result.
//!
//! This is used to represent a node of the tree corresponding to this
//! aggregation process.
//!
//! \see CHierarchicalResults::buildHierarchy for more details.
struct MODEL_EXPORT SNode {
//! A collection of attribute probabilities.
using TAttributeProbabilityVec = std::vector<SAttributeProbability>;
//! A non-owning pointer to a constant node; nodes refer to their parent
//! and children through this type.
using TNodeCPtr = const SNode*;
//! A collection of non-owning node pointers.
using TNodeCPtrVec = std::vector<TNodeCPtr>;
//! A map from node pointer to an index.
using TNodePtrSizeUMap = boost::unordered_map<TNodeCPtr, std::size_t>;
//! A map from an index to node pointer.
using TSizeNodePtrUMap = boost::unordered_map<std::size_t, TNodeCPtr>;
//! Default constructor (defined out of line).
SNode();
//! Construct a node for the simple search \p simpleSearch with
//! probability \p annotatedProbability.
//!
//! \note \p annotatedProbability is taken by non-constant reference;
//! presumably its contents are swapped into the node as described for
//! CHierarchicalResults::addModelResult - TODO confirm.
SNode(const SResultSpec& simpleSearch, SAnnotatedProbability& annotatedProbability);
//! Returns the aggregate probability for the node.
double probability() const;
//! Propagate consistent field names and values from the node's children.
void propagateFields();
//! Print the node for debugging.
std::string print() const;
//! Efficient swap.
void swap(SNode& other) noexcept;
//! \name Connectivity
//@{
//! The node's parent (non-owning).
TNodeCPtr s_Parent;
//! The node's children (non-owning).
TNodeCPtrVec s_Children;
//@}
//! Data describing the common field of the simple searches below
//! this node. (Note that for internal nodes the field names which
//! are not equal are set to empty strings.)
SResultSpec s_Spec;
// The members below are declared mutable. CHierarchicalResultsVisitor::visit
// receives nodes by constant reference, so this is presumably what allows
// visitors to update them - TODO confirm.
//! The aggregate annotated probability of the node.
mutable SAnnotatedProbability s_AnnotatedProbability;
//! The detector identifier.
mutable int s_Detector;
//! The aggregation style to use for this probability.
mutable int s_AggregationStyle;
//! The smallest aggregate probability of this node's children.
mutable double s_SmallestChildProbability;
//! The smallest aggregate probability of any of this node's descendants.
mutable double s_SmallestDescendantProbability;
//! The raw anomaly score of the node.
mutable double s_RawAnomalyScore;
//! The normalized anomaly score of the node.
mutable double s_NormalizedAnomalyScore;
//! \name Extra State for Results Output
//@{
//! The model which generated the result (non-owning).
const CAnomalyDetectorModel* s_Model;
//! The start time of the bucket generating the anomaly.
core_t::TTime s_BucketStartTime;
//! The length of the bucket for this result.
core_t::TTime s_BucketLength;
//@}
};
//! Non-member node swap, provided so that SNode works with standard
//! algorithms. (Presumably forwards to SNode::swap - TODO confirm in
//! the implementation.)
MODEL_EXPORT
void swap(SNode& node1, SNode& node2) noexcept;
} // hierarchical_results_detail::
// Forward declaration: the CHierarchicalResults visiting methods take the
// visitor by reference and the visitor is defined after that class.
class CHierarchicalResultsVisitor;
//! \brief Represents the bucket result of running a full analysis.
//!
//! DESCRIPTION:\n
//! This wraps up the logic to build a hierarchy on top of a collection
//! of simple search results. A simple search would, for example, be
//! one clause of a model configuration file and has the command line
//! syntax:
//! <pre>
//! [partitionfield = w] function[(x)] [by y] [over z]
//! </pre>
//!
//! An abstract visitor pattern is implemented, with the intention of
//! factoring out logic to, for example, output hierarchical results and
//! aggregate the probabilities up the tree. Both bottom up depth and
//! breadth first visiting strategies have been implemented.
//!
//! IMPLEMENTATION DECISIONS:\n
//! This loosely implements a builder pattern: each simple search result
//! is added and the intention is that all results are first added and
//! then the hierarchical object is built (although buildHierarchy can
//! be called repeatedly).
//!
//! Most of the state of this class is held by reference and could become
//! invalid if it is kept longer than to output a single result. This is
//! to minimize the amount of state that needs to be copied when outputting
//! results (to minimize both runtime and transient memory usage).
class MODEL_EXPORT CHierarchicalResults {
public:
//! A collection of doubles.
using TDoubleVec = std::vector<double>;
//! A collection of attribute probabilities.
using TAttributeProbabilityVec = std::vector<SAttributeProbability>;
//! The specification of a simple search result.
using TResultSpec = hierarchical_results_detail::SResultSpec;
//! An optional string.
using TOptionalStr = std::optional<std::string>;
//! A pair of optional strings: the key type of the pivot nodes.
using TOptionalStrOptionalStrPr = hierarchical_results_detail::TOptionalStrOptionalStrPr;
using TOptionalStrOptionalStrPrDoublePr =
hierarchical_results_detail::TOptionalStrOptionalStrPrDoublePr;
using TOptionalStrOptionalStrPrDoublePrVec =
hierarchical_results_detail::TOptionalStrOptionalStrPrDoublePrVec;
//! A node of the results tree.
using TNode = hierarchical_results_detail::SNode;
using TNodePtrSizeUMap = hierarchical_results_detail::SNode::TNodePtrSizeUMap;
using TSizeNodePtrUMap = hierarchical_results_detail::SNode::TSizeNodePtrUMap;
//! The node storage type.
//! NOTE(review): nodes refer to each other by raw pointer, so this
//! presumably relies on std::deque not invalidating references to
//! existing elements on insertion at its ends - TODO confirm.
using TNodeDeque = std::deque<TNode>;
//! Pivot nodes ordered by their pair of optional strings key.
using TOptionalStrOptionalStrPrNodeMap =
std::map<TOptionalStrOptionalStrPr, TNode, maths::common::COrderings::SLess>;
//! Pivot root nodes ordered by their optional string key.
using TOptionalStrNodeMap =
std::map<TOptionalStr, TNode, maths::common::COrderings::SLess>;
public:
CHierarchicalResults();
//! Add a dummy result for a simple count detector.
//!
//! \param[in,out] annotatedProbability The annotated probability of
//! the result. This is passed by non-constant reference (see the note
//! on addModelResult).
//! \param[in] model The model which generated the result; defaults
//! to null.
//! \param[in] bucketStartTime The bucket start time for the result;
//! defaults to 0.
void addSimpleCountResult(SAnnotatedProbability& annotatedProbability,
const CAnomalyDetectorModel* model = nullptr,
core_t::TTime bucketStartTime = 0);
//! Add a simple search result.
//!
//! The general syntax for a simple search is
//! <pre>
//! [partitionfield = w] function[(x)] [by y] [over z]
//! </pre>
//!
//! Examples include:
//! -# count
//! -# rare by x
//! -# partitionfield = x mean(y)
//! -# min(x) over z
//! -# partitionfield = x dc(y) over z
//! -# partitionfield = w max(x) by y over z
//!
//! If a given search doesn't have a field pass the empty string.
//!
//! \param[in] detector An identifier of the detector generating this
//! result.
//! \param[in] isPopulation True if this is a population result and
//! false otherwise.
//! \param[in] functionName The name of the function of the model's search.
//! \param[in] function The function of the model's search.
//! \param[in] partitionFieldName The partition field name or empty.
//! \param[in] partitionFieldValue The partition field value or empty.
//! \param[in] personFieldName The over field name or empty.
//! NOTE(review): SResultSpec documents the person field as the by
//! field when there is no over field - confirm which applies here.
//! \param[in] personFieldValue The over field value or empty.
//! \param[in] valueFieldName The name of the field containing the
//! metric value.
//! \param[in,out] annotatedProbability A struct containing the probability,
//! the smallest attribute probabilities, the influencers,
//! and any extra descriptive data
//! \param[in] model The model which generated the result; defaults
//! to null.
//! \param[in] bucketStartTime The bucket start time for the result;
//! defaults to 0.
//! \note Values which are passed by non-constant reference are swapped
//! in to place.
void addModelResult(int detector,
bool isPopulation,
const std::string& functionName,
function_t::EFunction function,
const std::string& partitionFieldName,
const std::string& partitionFieldValue,
const std::string& personFieldName,
const std::string& personFieldValue,
const std::string& valueFieldName,
SAnnotatedProbability& annotatedProbability,
const CAnomalyDetectorModel* model = nullptr,
core_t::TTime bucketStartTime = 0);
//! Add the influencer called \p name.
void addInfluencer(const std::string& name);
//! Build a hierarchy from the current flat node list using the
//! default aggregation rules.
//!
//! The aggregation rules in priority order are:
//! -# Only aggregate searches with the same partition field name
//! and value.
//! -# Subject to 1, aggregate searches with the same person field
//! name and value: this is the by field name and value if no
//! over field is specified otherwise it is the over field name
//! and value.
void buildHierarchy();
//! Creates the pivot nodes for influencing field values.
void createPivots();
//! Get the root node of the hierarchy.
//! NOTE(review): returned by pointer; presumably null when there is
//! no root - TODO confirm.
const TNode* root() const;
//! Get the influencer identified by \p influencerName and
//! \p influencerValue if one exists.
//! NOTE(review): returned by pointer; presumably null when no such
//! influencer exists - TODO confirm.
const TNode* influencer(const TOptionalStr& influencerName,
const TOptionalStr& influencerValue) const;
//! Bottom up breadth first visit the tree.
void bottomUpBreadthFirst(CHierarchicalResultsVisitor& visitor) const;
//! Top down breadth first visit the tree.
void topDownBreadthFirst(CHierarchicalResultsVisitor& visitor) const;
//! Post-order depth first visit the tree.
void postorderDepthFirst(CHierarchicalResultsVisitor& visitor) const;
//! Visit all the pivot nodes bottom up breadth first.
void pivotsBottomUpBreadthFirst(CHierarchicalResultsVisitor& visitor) const;
//! Visit all the pivot nodes top down breadth first.
void pivotsTopDownBreadthFirst(CHierarchicalResultsVisitor& visitor) const;
//! Check if there are no results at all including the simple
//! count result.
bool empty() const;
//! Get the count of leaf (search) results, i.e. excluding the
//! simple count result.
std::size_t resultCount() const;
//! Sets the result to be interim.
void setInterim();
//! Get the type of the result.
model_t::CResultType resultType() const;
//! Print the results for debug.
std::string print() const;
private:
//! Create a new node.
TNode& newNode();
//! Create a new leaf node for the simple search \p simpleSearch.
TNode& newLeaf(const TResultSpec& simpleSearch, SAnnotatedProbability& annotatedProbability);
//! Create or retrieve a pivot node for the \p key.
TNode& newPivot(TOptionalStrOptionalStrPr key);
//! Create or retrieve a pivot root node for the \p key.
TNode& newPivotRoot(const TOptionalStr& key);
//! Post-order depth first visit the tree starting from \p node.
void postorderDepthFirst(const TNode* node, CHierarchicalResultsVisitor& visitor) const;
private:
//! Storage for the nodes.
TNodeDeque m_Nodes;
//! Storage for the pivot nodes.
TOptionalStrOptionalStrPrNodeMap m_PivotNodes;
//! Pivot root nodes.
TOptionalStrNodeMap m_PivotRootNodes;
//! Is the result final or interim?
//! This field is transient - does not get persisted because interim results
//! never get persisted.
model_t::CResultType m_ResultType;
};
//! \brief Interface for visiting the results.
//!
//! DESCRIPTION:\n
//! The CHierarchicalResults visiting methods take an object of this type
//! by reference. Implementations override visit; the protected static
//! functions are helpers for classifying the nodes they are passed.
class MODEL_EXPORT CHierarchicalResultsVisitor {
public:
using TNode = CHierarchicalResults::TNode;
public:
virtual ~CHierarchicalResultsVisitor() = default;
//! Visit a node.
//!
//! \param[in] results The results object being visited.
//! \param[in] node The node to visit.
//! \param[in] pivot Presumably true if \p node is a pivot node and
//! false otherwise - TODO confirm against the callers.
virtual void visit(const CHierarchicalResults& results, const TNode& node, bool pivot) = 0;
protected:
//! Check if this node is the root node.
static bool isRoot(const TNode& node);
//! Check if the node is a leaf.
static bool isLeaf(const TNode& node);
//! Check if the node is partitioned, i.e. if its children are
//! one or more partitions.
static bool isPartitioned(const TNode& node);
//! Check if this is a named partition.
static bool isPartition(const TNode& node);
//! Check if the node is a named person.
static bool isPerson(const TNode& node);
//! Check if the node is an attribute of a person.
static bool isAttribute(const TNode& node);
//! Check if the node is a simple counting result.
static bool isSimpleCount(const TNode& node);
//! Check if the node is a population result.
static bool isPopulation(const TNode& node);
//! Check if we can ever write a result for the node.
static bool isTypeForWhichWeWriteResults(const TNode& node, bool pivot);
//! Get the nearest ancestor of the node for which we write results.
static const TNode* nearestAncestorForWhichWeWriteResults(const TNode& node);
//! Check if we'll write a result for the node.
static bool shouldWriteResult(const CLimits& limits,
const CHierarchicalResults& results,
const TNode& node,
bool pivot);
//! Gives the unit test access to the protected static helpers.
friend struct CHierarchicalResultsTest::testShouldWritePartition;
};
}
}
#endif // INCLUDED_ml_model_CHierarchicalResults_h