include/api/CInferenceModelDefinition.h (430 lines of code) (raw):

/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the Elastic License * 2.0 and the following additional limitation. Functionality enabled by the * files subject to the Elastic License 2.0 may only be used in production when * invoked by an Elasticsearch process with a license key installed that permits * use of machine learning features. You may not use this file except in * compliance with the Elastic License 2.0 and the foregoing additional * limitation. */ #ifndef INCLUDED_ml_api_CInferenceModelDefinition_h #define INCLUDED_ml_api_CInferenceModelDefinition_h #include <api/CSerializableToJson.h> #include <api/ImportExport.h> #include <boost/json.hpp> #include <boost/unordered_map.hpp> #include <map> #include <optional> #include <sstream> #include <string> #include <vector> namespace json = boost::json; namespace ml { namespace core { class CBoostJsonConcurrentLineWriter; } namespace api { //! Abstract class for output aggregation. class API_EXPORT CAggregateOutput : public CSerializableToJsonStream { public: static const std::string JSON_WEIGHTS_TAG; public: //! Aggregation type as a string. virtual const std::string& stringType() const = 0; ~CAggregateOutput() override = default; }; //! Allows to use (weighted) majority vote for classification. class API_EXPORT CWeightedMode final : public CAggregateOutput { public: using TDoubleVec = std::vector<double>; static const std::string JSON_WEIGHTED_MODE_TAG; public: ~CWeightedMode() override = default; //! Construct with the \p weights vector. explicit CWeightedMode(TDoubleVec&& weights); //! Construct with a weight vector of \p size with all entries equal to \p weight. CWeightedMode(std::size_t size, double weight); void addToJsonStream(TGenericLineWriter& writer) const override; const std::string& stringType() const override; private: TDoubleVec m_Weights; }; //! Allows to use (weighted) sum for regression. 
class API_EXPORT CWeightedSum final : public CAggregateOutput { public: using TDoubleVec = std::vector<double>; static const std::string JSON_WEIGHTED_SUM_TAG; public: ~CWeightedSum() override = default; //! Construct with the \p weights vector. explicit CWeightedSum(TDoubleVec&& weights); //! Construct with a weight vector of \p size with all entries equal to \p weight. CWeightedSum(std::size_t size, double weight); void addToJsonStream(TGenericLineWriter& writer) const override; const std::string& stringType() const override; private: TDoubleVec m_Weights; }; //! \brief Logistic regression aggregation. //! //! DESCRIPTION:\n //! Given a weights vector $\vec{w}$ as a parameter and an output vector from //! the ensemble $\vec{x}$, it computes the logistic regression function //! \f$1/(1 + \exp(-\vec{w}^T \vec{x}))\f$. class API_EXPORT CLogisticRegression final : public CAggregateOutput { public: using TDoubleVec = std::vector<double>; static const std::string JSON_LOGISTIC_REGRESSION_TAG; public: ~CLogisticRegression() override = default; //! Construct with the \p weights vector. explicit CLogisticRegression(TDoubleVec&& weights); //! Construct with a weight vector of \p size with all entries equal to \p weight. CLogisticRegression(std::size_t size, double weight); void addToJsonStream(TGenericLineWriter& writer) const override; const std::string& stringType() const override; private: TDoubleVec m_Weights; }; //! \brief Exponent aggregation. //! //! DESCRIPTION:\n //! Given a weights vector $\vec{w}$ as a parameter and an output vector from the //! ensemble $\vec{x}$, it computes the exponent function \f$\exp(\vec{w}^T \vec{x})\f$. class API_EXPORT CExponent final : public CAggregateOutput { public: using TDoubleVec = std::vector<double>; static const std::string JSON_EXPONENT_TAG; public: ~CExponent() override = default; //! Construct with the \p weights vector. explicit CExponent(TDoubleVec&& weights); //! 
Construct with a weight vector of \p size with all entries equal to \p weight. CExponent(std::size_t size, double weight); void addToJsonStream(TGenericLineWriter& writer) const override; const std::string& stringType() const override; private: TDoubleVec m_Weights; }; //! List of support numeric relationships. It's only "<" at the moment. enum ENumericRelationship { E_LT }; class API_EXPORT CTrainedModel : public CSerializableToJsonStream { public: using TDoubleVec = std::vector<double>; using TStringVec = std::vector<std::string>; using TOptionalDoubleVec = std::optional<TDoubleVec>; using TOptionalStringVec = std::optional<TStringVec>; enum ETargetType { E_Classification, E_Regression }; //! \brief Provides feature names. //! //! DESCRIPTION:\n //! Trained model features include any input feature and any synthetic features //! which training adds, which include, for example, category encodings. Any code //! which references trained model features by name needs to use consistent naming. //! We standardise by using this class to encapsulate naming trained model features //! from the input feature names, the category names and the type of operation used //! to generate the feature. class CFeatureNameProvider { public: using TStrVec = std::vector<std::string>; using TStrVecVec = std::vector<TStrVec>; public: CFeatureNameProvider(TStrVec fieldNames, TStrVecVec categoryNames); const std::string& fieldName(std::size_t inputColumnIndex) const; const std::string& category(std::size_t inputColumnIndex, std::size_t hotCategory) const; std::string identityEncodingName(std::size_t inputColumnIndex) const; std::string oneHotEncodingName(std::size_t inputColumnIndex, std::size_t hotCategory) const; std::string targetMeanEncodingName(std::size_t inputColumnIndex) const; std::string frequencyEncodingName(std::size_t inputColumnIndex) const; private: TStrVec m_FieldNames; TStrVecVec m_CategoryNames; }; //! \brief A measure of the model complexity. 
class CSizeInfo : public CSerializableToJsonDocument { public: static const std::string JSON_NUM_CLASSES_TAG; static const std::string JSON_NUM_CLASSIFICATION_WEIGHTS_TAG; public: explicit CSizeInfo(const CTrainedModel& trainedModel); void addToJsonDocument(json::object& parentObject, TBoostJsonWriter& writer) const override; //! \return Expected number of operation for the model evaluation. virtual std::size_t numOperations() const = 0; private: const CTrainedModel& m_TrainedModel; }; using TSizeInfoUPtr = std::unique_ptr<CSizeInfo>; static const std::string JSON_CLASSIFICATION_LABELS_TAG; static const std::string JSON_CLASSIFICATION_WEIGHTS_TAG; static const std::string JSON_FEATURE_NAMES_TAG; static const std::string JSON_TARGET_TYPE_CLASSIFICATION; static const std::string JSON_TARGET_TYPE_REGRESSION; static const std::string JSON_TARGET_TYPE_TAG; public: void addToJsonStream(TGenericLineWriter& writer) const override; //! Names of the features used by the model. const TStringVec& featureNames() const; //! Names of the features used by the model. virtual void featureNames(TStringVec featureNames); //! Sets target type (regression or classification). virtual void targetType(ETargetType targetType); //! Returns target type (regression or classification). virtual ETargetType targetType() const; //! Adjust the feature names, e.g. to exclude not used feature names like the target column. virtual const TStringVec& removeUnusedFeatures() = 0; //! Set the labels to use for each class. virtual void classificationLabels(const TStringVec& classificationLabels); //! Get the labels to use for each class. virtual const TOptionalStringVec& classificationLabels() const; //! Set weights by which to multiply classes when doing label assignment. virtual void classificationWeights(TDoubleVec classificationWeights); //! Get weights by which to multiply classes when doing label assignment. virtual const TOptionalDoubleVec& classificationWeights() const; //! 
Get the object for model size with information for estimation. virtual TSizeInfoUPtr sizeInfo() const = 0; protected: TStringVec& featureNames(); private: TStringVec m_FeatureNames; ETargetType m_TargetType; TOptionalStringVec m_ClassificationLabels; TOptionalDoubleVec m_ClassificationWeights; }; //! Classification and regression trees. class API_EXPORT CTree final : public CTrainedModel { public: class CTreeNode : public CSerializableToJsonStream { public: using TDoubleVec = std::vector<double>; using TNodeIndex = std::uint32_t; using TOptionalNodeIndex = std::optional<TNodeIndex>; using TOptionalDouble = std::optional<double>; static const std::string JSON_DECISION_TYPE_TAG; static const std::string JSON_DEFAULT_LEFT_TAG; static const std::string JSON_LEAF_VALUE_TAG; static const std::string JSON_LEFT_CHILD_TAG; static const std::string JSON_LT; static const std::string JSON_NODE_INDEX_TAG; static const std::string JSON_NUMBER_SAMPLES_TAG; static const std::string JSON_RIGHT_CHILD_TAG; static const std::string JSON_SPLIT_FEATURE_TAG; static const std::string JSON_SPLIT_GAIN_TAG; static const std::string JSON_THRESHOLD_TAG; public: CTreeNode(TNodeIndex nodeIndex, double threshold, bool defaultLeft, TDoubleVec leafValue, std::size_t splitFeature, std::size_t numberSamples, const TOptionalNodeIndex& leftChild, const TOptionalNodeIndex& rightChild, const TOptionalDouble& splitGain); void addToJsonStream(TGenericLineWriter& writer) const override; size_t splitFeature() const; void splitFeature(size_t splitFeature); bool leaf() const; private: bool m_DefaultLeft; ENumericRelationship m_DecisionType = E_LT; TNodeIndex m_NodeIndex; TOptionalNodeIndex m_LeftChild; TOptionalNodeIndex m_RightChild; std::size_t m_SplitFeature; std::size_t m_NumberSamples; double m_Threshold; TDoubleVec m_LeafValue; TOptionalDouble m_SplitGain; }; class CSizeInfo : public CTrainedModel::CSizeInfo { public: static const std::string JSON_NUM_NODES_TAG; static const std::string 
JSON_NUM_LEAVES_TAG; public: explicit CSizeInfo(const CTree& tree); void addToJsonDocument(json::object& parentObject, TBoostJsonWriter& writer) const override; std::size_t numOperations() const override; private: const CTree& m_Tree; }; using TTreeNodeVec = std::vector<CTreeNode>; static const std::string JSON_TREE_TAG; static const std::string JSON_TREE_STRUCTURE_TAG; public: void addToJsonStream(TGenericLineWriter& writer) const override; //! Total number of tree nodes. std::size_t size() const; const TStringVec& removeUnusedFeatures() override; TTreeNodeVec& treeStructure(); //! Get the object for model size with information for estimation. TSizeInfoUPtr sizeInfo() const override; private: TTreeNodeVec m_TreeStructure; }; //! Ensemble of a collection of trained models class API_EXPORT CEnsemble final : public CTrainedModel { public: using TAggregateOutputUPtr = std::unique_ptr<CAggregateOutput>; using TTrainedModelUPtr = std::unique_ptr<CTrainedModel>; using TTrainedModelUPtrVec = std::vector<TTrainedModelUPtr>; class CSizeInfo : public CTrainedModel::CSizeInfo { public: static const std::string JSON_FEATURE_NAME_LENGTHS_TAG; static const std::string JSON_NUM_OPERATIONS_TAG; static const std::string JSON_NUM_OUTPUT_PROCESSOR_WEIGHTS_TAG; static const std::string JSON_TREE_SIZES_TAG; public: explicit CSizeInfo(const CEnsemble& ensemble); void addToJsonDocument(json::object& parentObject, TBoostJsonWriter& writer) const override; std::size_t numOperations() const override; private: const CEnsemble* m_Ensemble; }; static const std::string JSON_AGGREGATE_OUTPUT_TAG; static const std::string JSON_ENSEMBLE_TAG; static const std::string JSON_TRAINED_MODELS_TAG; public: void addToJsonStream(TGenericLineWriter& writer) const override; //! Aggregation mechanism for the output from individual models. void aggregateOutput(TAggregateOutputUPtr&& aggregateOutput); const TAggregateOutputUPtr& aggregateOutput() const; void featureNames(TStringVec featureNames) override; //! 
List of trained models withing this ensemble. TTrainedModelUPtrVec& trainedModels(); //! Number of models in the ensemble. std::size_t size() const; const TStringVec& removeUnusedFeatures() override; void targetType(ETargetType targetType) override; //! Set the labels to use for each class. void classificationLabels(const TStringVec& classificationLabels) override; //! Set weights by which to multiply classes when doing label assignment. void classificationWeights(TDoubleVec classificationWeights) override; //! Get the object for model size with information for estimation. TSizeInfoUPtr sizeInfo() const override; using CTrainedModel::classificationLabels; using CTrainedModel::classificationWeights; using CTrainedModel::featureNames; using CTrainedModel::targetType; private: TTrainedModelUPtrVec m_TrainedModels; TAggregateOutputUPtr m_AggregateOutput; }; class API_EXPORT CEncoding : public CSerializableToJsonStream { public: class CSizeInfo : public CSerializableToJsonDocument { public: static const std::string JSON_FEATURE_NAME_LENGTH_TAG; static const std::string JSON_FIELD_VALUE_LENGTHS_TAG; static const std::string JSON_FIELD_LENGTH_TAG; public: void addToJsonDocument(json::object& parentObject, TBoostJsonWriter& writer) const override; virtual const std::string& typeString() const = 0; const CEncoding* encoding() const; protected: using TSizeVec = std::vector<std::size_t>; protected: explicit CSizeInfo(const CEncoding* encoding); private: const CEncoding* m_Encoding; }; using TSizeInfoUPtr = std::unique_ptr<CSizeInfo>; static const std::string JSON_FIELD_TAG; static const std::string JSON_FEATURE_NAME_TAG; public: ~CEncoding() override = default; explicit CEncoding(std::string field); void addToJsonStream(TGenericLineWriter& writer) const override; //! Input field name. Must be defined in the input section. void field(const std::string& field); const std::string& field() const; //! Encoding type as string. virtual const std::string& typeString() const = 0; //! 
Get the object for model size with information for estimation. virtual TSizeInfoUPtr sizeInfo() const = 0; private: //! Input field name. Must be defined in the input section. std::string m_Field; }; class API_EXPORT CCustomEncoding : public CSerializableToJsonStream {}; //! \brief Mapping from categorical columns to numerical values related to categorical value distribution. class API_EXPORT CFrequencyEncoding final : public CEncoding { public: class CSizeInfo final : public CEncoding::CSizeInfo { public: explicit CSizeInfo(const CFrequencyEncoding& encoding); void addToJsonDocument(json::object& parentObject, TBoostJsonWriter& writer) const override; const std::string& typeString() const override; private: const CFrequencyEncoding& m_Encoding; }; using TStringDoubleUMap = const boost::unordered_map<std::string, double>; static const std::string JSON_FREQUENCY_MAP_TAG; static const std::string JSON_FREQUENCY_ENCODING_TAG; public: ~CFrequencyEncoding() override = default; CFrequencyEncoding(const std::string& field, std::string featureName, TStringDoubleUMap frequencyMap); void addToJsonStream(TGenericLineWriter& writer) const override; //! Feature name after pre-processing. const std::string& featureName() const; //! Map from the category names to the frequency values. const TStringDoubleUMap& frequencyMap() const; const std::string& typeString() const override; //! Get the object for model size with information for estimation. TSizeInfoUPtr sizeInfo() const override; private: std::string m_FeatureName; TStringDoubleUMap m_FrequencyMap; }; //! \brief Application of the one-hot encoding function on a single column. 
class API_EXPORT COneHotEncoding final : public CEncoding {
public:
    //! Size information for a one-hot encoding.
    class CSizeInfo final : public CEncoding::CSizeInfo {
    public:
        static const std::string JSON_FEATURE_NAME_LENGTHS_TAG;

    public:
        explicit CSizeInfo(const COneHotEncoding& encoding);
        void addToJsonDocument(json::object& parentObject,
                               TBoostJsonWriter& writer) const override;
        //! Encoding type as string.
        const std::string& typeString() const override;

    private:
        //! Held by reference: the encoding must outlive this object.
        const COneHotEncoding& m_Encoding;
    };
    using TStrStrMap = std::map<std::string, std::string>;
    static const std::string JSON_HOT_MAP_TAG;
    static const std::string JSON_ONE_HOT_ENCODING_TAG;

public:
    ~COneHotEncoding() override = default;
    //! Construct for the input \p field with the category to field name
    //! mapping \p hotMap.
    COneHotEncoding(const std::string& field, TStrStrMap hotMap);
    void addToJsonStream(TGenericLineWriter& writer) const override;
    //! Read-only view of the map from the category names of the original
    //! field to the new field names.
    const TStrStrMap& hotMap() const;
    //! Mutable access to the same map.
    TStrStrMap& hotMap();
    //! Encoding type as string.
    const std::string& typeString() const override;
    //! Create the size information object used for estimation.
    TSizeInfoUPtr sizeInfo() const override;

private:
    TStrStrMap m_HotMap;
};

//! \brief Mapping from categorical columns to numerical values related to the target value.
class API_EXPORT CTargetMeanEncoding final : public CEncoding {
public:
    //! Size information for a target mean encoding.
    class CSizeInfo final : public CEncoding::CSizeInfo {
    public:
        explicit CSizeInfo(const CTargetMeanEncoding& encoding);
        void addToJsonDocument(json::object& parentObject,
                               TBoostJsonWriter& writer) const override;
        //! Encoding type as string.
        const std::string& typeString() const override;

    private:
        //! Held by reference: the encoding must outlive this object.
        const CTargetMeanEncoding& m_Encoding;
    };
    using TStringDoubleUMap = boost::unordered_map<std::string, double>;
    static const std::string JSON_TARGET_MAP_TAG;
    static const std::string JSON_TARGET_MEAN_ENCODING_TAG;
    static const std::string JSON_DEFAULT_VALUE_TAG;

public:
    ~CTargetMeanEncoding() override = default;
    //! Construct for the input \p field from the fallback \p defaultValue,
    //! the output \p featureName and the category to target value mapping
    //! \p targetMap.
    CTargetMeanEncoding(const std::string& field,
                        double defaultValue,
                        std::string featureName,
                        TStringDoubleUMap&& targetMap);
    void addToJsonStream(TGenericLineWriter& writer) const override;
    //! Value used for categories that have not been seen before.
    double defaultValue() const;
    //! Name of the feature after pre-processing.
    const std::string& featureName() const;
    //! Map from the category names to the target values.
    const TStringDoubleUMap& targetMap() const;
    //! Encoding type as string.
    const std::string& typeString() const override;
    //! Create the size information object used for estimation.
    TSizeInfoUPtr sizeInfo() const override;

private:
    double m_DefaultValue;
    std::string m_FeatureName;
    TStringDoubleUMap m_TargetMap;
};

//! \brief A JSON blob defining a custom encoding or an array of custom encodings.
class API_EXPORT COpaqueEncoding final : public CCustomEncoding {
public:
    //! Construct from the JSON value \p object.
    explicit COpaqueEncoding(const json::value& object);
    void addToJsonStream(TGenericLineWriter& writer) const override;

private:
    //! The JSON value held by this encoding.
    json::value m_Object;
};

//! \brief Technical details required for model evaluation.
class API_EXPORT CInferenceModelDefinition : public CSerializableToCompressedChunkedJson { public: using TApiEncodingUPtr = std::unique_ptr<api::CEncoding>; using TApiEncodingUPtrVec = std::vector<TApiEncodingUPtr>; using TApiCustomEncodingUPtr = std::unique_ptr<api::CCustomEncoding>; using TApiCustomEncodingUPtrVec = std::vector<TApiCustomEncodingUPtr>; using TBoostJsonWriter = core::CBoostJsonConcurrentLineWriter; using TSizeStringUMap = boost::unordered_map<std::size_t, std::string>; using TSizeStringUMapVec = std::vector<TSizeStringUMap>; using TStringSizeUMap = boost::unordered_map<std::string, std::size_t>; using TStringSizeUMapVec = std::vector<TStringSizeUMap>; using TStringVec = std::vector<std::string>; using TTrainedModelUPtr = CEnsemble::TTrainedModelUPtr; class API_EXPORT CSizeInfo final : public CSerializableToJsonDocument { public: static const std::string JSON_ENSEMBLE_MODEL_SIZE_TAG; static const std::string JSON_MODEL_SIZE_INFO_TAG; static const std::string JSON_TRAINED_MODEL_SIZE_TAG; public: explicit CSizeInfo(const CInferenceModelDefinition& definition); void addToJsonDocument(json::object& parentObject, TBoostJsonWriter& writer) const override; const std::string& typeString() const; std::string jsonString(); private: const CInferenceModelDefinition& m_Definition; }; using TSizeInfoUPtr = std::unique_ptr<CSizeInfo>; static const std::string JSON_COMPRESSED_INFERENCE_MODEL_TAG; static const std::string JSON_DEFINITION_TAG; static const std::string JSON_PREPROCESSORS_TAG; static const std::string JSON_TRAINED_MODEL_TAG; public: TApiEncodingUPtrVec& preprocessors(); const TApiEncodingUPtrVec& preprocessors() const { return m_Preprocessors; } TApiCustomEncodingUPtrVec& customPreprocessors(); const TApiCustomEncodingUPtrVec& customPreprocessors() const { return m_CustomPreprocessors; } void trainedModel(TTrainedModelUPtr&& trainedModel); TTrainedModelUPtr& trainedModel(); const TTrainedModelUPtr& trainedModel() const; void 
addToJsonStream(TGenericLineWriter& writer) const final; void addCompressedToJsonStream(TBoostJsonWriter& writer) const final; void fieldNames(TStringVec&& fieldNames); const TStringVec& fieldNames() const; const std::string& typeString() const; std::size_t dependentVariableColumnIndex() const; void dependentVariableColumnIndex(size_t dependentVariableColumnIndex); //! Get the object for model size with information for estimation. TSizeInfoUPtr sizeInfo() const; private: //! Optional step for pre-processing data, e.g. vector embedding, one-hot-encoding, etc. TApiEncodingUPtrVec m_Preprocessors; //! Optional step for custom pre-processing data, supplied from analytics config TApiCustomEncodingUPtrVec m_CustomPreprocessors; //! Details of the model evaluation step with a trained_model. TTrainedModelUPtr m_TrainedModel; TStringVec m_FieldNames; std::string m_TypeString; std::size_t m_DependentVariableColumnIndex; }; } } #endif //INCLUDED_ml_api_CInferenceModelDefinition_h