include/model/CMetricBucketGatherer.h (88 lines of code) (raw):

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #ifndef INCLUDED_ml_model_CMetricBucketGatherer_h #define INCLUDED_ml_model_CMetricBucketGatherer_h #include <core/CMemoryUsage.h> #include <core/CoreTypes.h> #include <model/CBucketGatherer.h> #include <model/ImportExport.h> #include <model/ModelTypes.h> #include <any> #include <map> #include <string> #include <vector> namespace ml { namespace core { class CStatePersistInserter; class CStateRestoreTraverser; } namespace model { class CDataGatherer; class CResourceMonitor; //! \brief Metric series data gathering class. //! //! DESCRIPTION:\n //! This performs all pre-processing of the data which we model in order //! to characterize metric time series. //! //! \sa CDataGatherer. class MODEL_EXPORT CMetricBucketGatherer final : public CBucketGatherer { public: using TCategorySizePr = std::pair<model_t::EMetricCategory, std::size_t>; using TCategorySizePrAnyMap = std::map<TCategorySizePr, std::any>; using TCategorySizePrAnyMapItr = TCategorySizePrAnyMap::iterator; using TCategorySizePrAnyMapCItr = TCategorySizePrAnyMap::const_iterator; public: //! \name Life-cycle //@{ //! Create a new population metric data gatherer. //! //! \param[in] dataGatherer The owning data gatherer. //! \param[in] initData The parameter initialization object for the bucket //! gatherer. CMetricBucketGatherer(CDataGatherer& dataGatherer, const SBucketGathererInitData& initData); //! Construct from a state document. CMetricBucketGatherer(CDataGatherer& dataGatherer, const SBucketGathererInitData& initData, core::CStateRestoreTraverser& traverser); //! Create a copy that will result in the same persisted state as the //! original. This is effectively a copy constructor that creates a //! copy that's only valid for a single purpose. The boolean flag is //! redundant except to create a signature that will not be mistaken for //! a general purpose copy constructor. CMetricBucketGatherer(bool isForPersistence, const CMetricBucketGatherer& other); //@} //! \name Persistence //@{ //! Persist state by passing information to the supplied inserter void acceptPersistInserter(core::CStatePersistInserter& inserter) const override; //! Fill in the state from \p traverser. bool acceptRestoreTraverser(core::CStateRestoreTraverser& traverser); //! Create a clone of this data gatherer that will result in the same //! persisted state. The clone may be incomplete in ways that do not //! affect the persisted representation, and must not be used for any //! other purpose. //! \warning The caller owns the object returned. CBucketGatherer* cloneForPersistence() const override; //! The persistence tag name of this derived class. const std::string& persistenceTag() const override; private: //! Internal restore function. bool acceptRestoreTraverserInternal(core::CStateRestoreTraverser& traverser, bool isCurrentVersion); //@} public: //! \name Fields //@{ //! Get the person field name. //! //! This is the common field in all searches "along" which the //! probabilities are aggregated, i.e. the "over" field name for //! population searches and the "by" field name for individual //! searches. const std::string& personFieldName() const override; //! Get the attribute field name if one exists, i.e. the "by" for //! population searches, field name and returns empty otherwise. const std::string& attributeFieldName() const override; //! Returns an empty string. const std::string& valueFieldName() const override; //! Get an iterator at the beginning the influencing field names. TStrVecCItr beginInfluencers() const override; //! Get an iterator at the end of the influencing field names. TStrVecCItr endInfluencers() const override; //! Get the fields for which to gather data. //! //! For individual searches this gets the field which defines the //! categories whose counts are being analyzed. For population //! searches this gets the fields identifying the people and person //! attributes which are being analyzed. An empty string acts like //! a wild card and matches all records. This is used for analysis //! which is attribute independent such as total count. const TStrVec& fieldsOfInterest() const override; //@} //! Get a description of the search. std::string description() const override; //! \name Update //@{ //! Process the specified fields. //! //! \note For individual searches \p fieldValues should contain two //! fields. The first field should contain the by clause field value //! or a generic name if none was specified. The second field should //! contain a number corresponding to the metric value. For population //! searches \p fieldValues should contain three fields. The first //! field should contain the over clause field value. The second field //! should the by clause field value or a generic name if none was //! specified. The third field should contain a number corresponding //! to the metric value. bool processFields(const TStrCPtrVec& fieldValues, CEventData& result, CResourceMonitor& resourceMonitor) override; //@} //! \name Person //@{ //! Stop gathering data on the people identified by \p peopleToRemove. void recyclePeople(const TSizeVec& peopleToRemove) override; //! Remove all traces of people whose identifiers are greater than //! or equal to \p lowestPersonToRemove. void removePeople(std::size_t lowestPersonToRemove) override; //@} //! \name Attribute //@{ //! Stop gathering data on the attributes identified by \p attributesToRemove. void recycleAttributes(const TSizeVec& attributesToRemove) override; //! Remove all traces of attributes whose identifiers are greater //! than or equal to \p lowestAttributeToRemove. void removeAttributes(std::size_t lowestAttributeToRemove) override; //@} //! Get the checksum of this gatherer. uint64_t checksum() const override; //! Debug the memory used by this object. void debugMemoryUsage(const core::CMemoryUsage::TMemoryUsagePtr& mem) const override; //! Get the memory used by this object. std::size_t memoryUsage() const override; //! Get the static size of this object - used for virtual hierarchies std::size_t staticSize() const override; //! Clear this data gatherer. void clear() override; //! Reset bucket and return true if bucket was successfully reset or false otherwise. bool resetBucket(core_t::TTime bucketStart) override; //! Release memory that is no longer needed void releaseMemory(core_t::TTime samplingCutoffTime) override; //! \name Features //@{ //! Get the raw data for all features for the bucketing time interval //! containing \p time. //! //! \param[in] time The time of interest. //! \param[out] result Filled in with the feature data at \p time. void featureData(core_t::TTime time, core_t::TTime bucketLength, TFeatureAnyPrVec& result) const override; //@} private: //! Create samples if possible for the bucket pointed out by \p time. void sample(core_t::TTime time) override; //! Resize the necessary data structures so they can accommodate //! the person and attribute identified by \p pid and \p cid, //! respectively. //! //! \param[in] pid The identifier of the person to accommodate. //! \param[in] cid The identifier of the attribute to accommodate. void resize(std::size_t pid, std::size_t cid) override; //! Record the arrival of \p values for attribute identified by //! \p cid and person identified by \p pid. //! //! \param[in] pid The identifier of the person who generated //! the value. //! \param[in] cid The identifier of the value's attribute. //! \param[in] time The time of the \p values. //! \param[in] values The metric statistic value(s) //! \param[in] count The number of measurements in the metric //! statistic. //! \param[in] stringValue Ignored. //! \param[in] influences The influencing field values which //! label the value. void addValue(std::size_t pid, std::size_t cid, core_t::TTime time, const CEventData::TDouble1VecArray& values, std::size_t count, const CEventData::TOptionalStr& stringValue, const TOptionalStrVec& influences) override; //! Start a new bucket. void startNewBucket(core_t::TTime time, bool skipUpdates) override; //! Initialize the field names collection. //! initializeFieldNamesPart2() must be called after this. //! In the event that the data gatherer is being restored from persisted //! state, the sequence must be: //! 1) initializeFieldNamesPart1() //! 2) restore state //! 3) initializeFieldNamesPart2() void initializeFieldNamesPart1(const SBucketGathererInitData& initData); //! Initialize the field names collection. //! initializeFieldNamesPart1() must be called before this. //! In the event that the data gatherer is being restored from persisted //! state, the sequence must be: //! 1) initializeFieldNamesPart1() //! 2) restore state //! 3) initializeFieldNamesPart2() void initializeFieldNamesPart2(const SBucketGathererInitData& initData); //! Initialize the feature data gatherers. void initializeFeatureData(); private: //! The metric value field name. This is held separately to //! m_FieldNames because in the case of summarization the field //! names holding the summarized values will be mangled. std::string m_ValueFieldName; //! The names of the fields of interest. //! //! The entries in order are: //! -# The name of the field which identifies people, //! -# For population models only, the name of the field which //! identifies people's attributes, //! -# The name of zero or more influencing fields, //! -# The name of the field holding the count followed by the //! field name(s) of the field(s) which hold the statistics //! themselves, which must (for those that are present) be //! ordered mean, min, max, sum. //! -# For the API with user defined pre-summarisation, the name //! of the field which holds the count then the name of the field //! which holds the statistic value, //! -# Otherwise the name of the field which holds the metric value. TStrVec m_FieldNames; //! The position of the first influencing field. std::size_t m_BeginInfluencingFields{0}; //! The position of the first count/value field. std::size_t m_BeginValueFields{0}; //! For summarized values, this stores the metric categories //! corresponding to the summarized field names in m_FieldNames; //! for non-summarized input this will be empty TMetricCategoryVec m_FieldMetricCategories; //! The data features we are gathering. TCategorySizePrAnyMap m_FeatureData; }; } } #endif // INCLUDED_ml_model_CMetricBucketGatherer_h