From 06007cd9618d36391ad64354adbb1cb8022598e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Przemys=C5=82aw=20Witek?= Date: Fri, 4 Oct 2019 07:05:37 +0200 Subject: [PATCH] Introduce classification analysis runner. (#701) --- include/api/CDataFrameAnalysisRunner.h | 10 +- include/api/CDataFrameAnalysisSpecification.h | 4 + include/api/CDataFrameAnalyzer.h | 2 + include/api/CDataFrameBoostedTreeRunner.h | 36 ++--- include/api/CDataFrameClassificationRunner.h | 67 +++++++++ include/api/CDataFrameOutliersRunner.h | 6 +- include/api/CDataFrameRegressionRunner.h | 59 ++++++++ lib/api/CDataFrameAnalysisRunner.cc | 10 +- lib/api/CDataFrameAnalysisSpecification.cc | 15 +- lib/api/CDataFrameAnalyzer.cc | 41 +++++- lib/api/CDataFrameBoostedTreeRunner.cc | 52 ++----- lib/api/CDataFrameClassificationRunner.cc | 134 +++++++++++++++++ lib/api/CDataFrameOutliersRunner.cc | 12 +- lib/api/CDataFrameRegressionRunner.cc | 85 +++++++++++ lib/api/Makefile.first | 2 + .../unittest/CDataFrameAnalysisRunnerTest.cc | 36 +++++ .../unittest/CDataFrameAnalysisRunnerTest.h | 2 + .../CDataFrameAnalysisSpecificationTest.cc | 8 +- lib/api/unittest/CDataFrameAnalyzerTest.cc | 135 +++++++++++++----- lib/api/unittest/CDataFrameAnalyzerTest.h | 1 + .../unittest/CDataFrameMockAnalysisRunner.cc | 1 + .../unittest/CDataFrameMockAnalysisRunner.h | 1 + 22 files changed, 610 insertions(+), 109 deletions(-) create mode 100644 include/api/CDataFrameClassificationRunner.h create mode 100644 include/api/CDataFrameRegressionRunner.h create mode 100644 lib/api/CDataFrameClassificationRunner.cc create mode 100644 lib/api/CDataFrameRegressionRunner.cc diff --git a/include/api/CDataFrameAnalysisRunner.h b/include/api/CDataFrameAnalysisRunner.h index 2a3214d41b..7e28048ee9 100644 --- a/include/api/CDataFrameAnalysisRunner.h +++ b/include/api/CDataFrameAnalysisRunner.h @@ -59,7 +59,9 @@ class CMemoryUsageEstimationResultJsonWriter; //! early to determine how to implement a good cooperative interrupt scheme. class API_EXPORT CDataFrameAnalysisRunner { public: + using TBoolVec = std::vector; using TStrVec = std::vector; + using TStrVecVec = std::vector; using TRowRef = core::data_frame_detail::CRowRef; using TProgressRecorder = std::function; @@ -98,6 +100,9 @@ class API_EXPORT CDataFrameAnalysisRunner { //! \return The number of columns this analysis appends. virtual std::size_t numberExtraColumns() const = 0; + //! \return Indicator of columns for which empty value should be treated as missing. + virtual TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const; + //! Write the extra columns of \p row added by the analysis to \p writer. //! //! This should create a new object of the form: @@ -114,6 +119,7 @@ class API_EXPORT CDataFrameAnalysisRunner { //! \param[in] row The row to write the columns added by this analysis. //! \param[in,out] writer The stream to which to write the extra columns. virtual void writeOneRow(const TStrVec& featureNames, + const TStrVecVec& categoricalFieldValues, TRowRef row, core::CRapidJsonConcurrentLineWriter& writer) const = 0; @@ -189,12 +195,12 @@ class API_EXPORT CDataFrameAnalysisRunnerFactory { TRunnerUPtr make(const CDataFrameAnalysisSpecification& spec) const; TRunnerUPtr make(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const; + const rapidjson::Value& jsonParameters) const; private: virtual TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec) const = 0; virtual TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const = 0; + const rapidjson::Value& jsonParameters) const = 0; }; } } diff --git a/include/api/CDataFrameAnalysisSpecification.h b/include/api/CDataFrameAnalysisSpecification.h index 706b3ff9f7..da8f211f59 100644 --- a/include/api/CDataFrameAnalysisSpecification.h +++ b/include/api/CDataFrameAnalysisSpecification.h @@ -43,6 +43,7 @@ namespace api { //! performance statistics. class API_EXPORT CDataFrameAnalysisSpecification { public: + using TBoolVec = std::vector; using TStrVec = std::vector; using TDataFrameUPtr = std::unique_ptr; using TTemporaryDirectoryPtr = std::shared_ptr; @@ -173,6 +174,9 @@ class API_EXPORT CDataFrameAnalysisSpecification { //! 2. disk is used (only one partition needs to be loaded to main memory) void estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const; + //! \return Indicator of columns for which empty value should be treated as missing. + TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const; + //! \return shared pointer to the persistence stream. TDataAdderUPtr persister() const; diff --git a/include/api/CDataFrameAnalyzer.h b/include/api/CDataFrameAnalyzer.h index e76ddfde3d..2e3516fb01 100644 --- a/include/api/CDataFrameAnalyzer.h +++ b/include/api/CDataFrameAnalyzer.h @@ -34,6 +34,7 @@ class CDataFrameAnalysisSpecification; //! class API_EXPORT CDataFrameAnalyzer { public: + using TBoolVec = std::vector; using TStrVec = std::vector; using TJsonOutputStreamWrapperUPtr = std::unique_ptr; using TJsonOutputStreamWrapperUPtrSupplier = @@ -104,6 +105,7 @@ class API_EXPORT CDataFrameAnalyzer { TDataFrameAnalysisSpecificationUPtr m_AnalysisSpecification; TStrVec m_CategoricalFieldNames; TStrSizeUMapVec m_CategoricalFieldValues; + TBoolVec m_EmptyAsMissing; TDataFrameUPtr m_DataFrame; TStrVec m_FieldNames; TTemporaryDirectoryPtr m_DataFrameDirectory; diff --git a/include/api/CDataFrameBoostedTreeRunner.h b/include/api/CDataFrameBoostedTreeRunner.h index 13b0a6863f..e11cccbdf2 100644 --- a/include/api/CDataFrameBoostedTreeRunner.h +++ b/include/api/CDataFrameBoostedTreeRunner.h @@ -9,6 +9,7 @@ #include +#include #include #include #include @@ -25,11 +26,11 @@ class CBoostedTreeFactory; namespace api { //! \brief Runs boosted tree regression on a core::CDataFrame. -class API_EXPORT CDataFrameBoostedTreeRunner final : public CDataFrameAnalysisRunner { +class API_EXPORT CDataFrameBoostedTreeRunner : public CDataFrameAnalysisRunner { public: //! This is not intended to be called directly: use CDataFrameBoostedTreeRunnerFactory. CDataFrameBoostedTreeRunner(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& jsonParameters); + const CDataFrameAnalysisConfigReader::CParameters& parameters); //! This is not intended to be called directly: use CDataFrameBoostedTreeRunnerFactory. CDataFrameBoostedTreeRunner(const CDataFrameAnalysisSpecification& spec); @@ -39,13 +40,20 @@ class API_EXPORT CDataFrameBoostedTreeRunner final : public CDataFrameAnalysisRu //! \return The number of columns this adds to the data frame. std::size_t numberExtraColumns() const override; - //! Write the prediction for \p row to \p writer. - void writeOneRow(const TStrVec& featureNames, - TRowRef row, - core::CRapidJsonConcurrentLineWriter& writer) const override; +protected: + using TBoostedTreeUPtr = std::unique_ptr; + +protected: + //! Parameter reader handling parameters that are shared by subclasses. + static CDataFrameAnalysisConfigReader getParameterReader(); + //! Name of dependent variable field. + const std::string& dependentVariableFieldName() const; + //! Name of prediction field. + const std::string& predictionFieldName() const; + //! Underlying boosted tree. + const maths::CBoostedTree& boostedTree() const; private: - using TBoostedTreeUPtr = std::unique_ptr; using TBoostedTreeFactoryUPtr = std::unique_ptr; using TDataSearcherUPtr = CDataFrameAnalysisSpecification::TDataSearcherUPtr; using TMemoryEstimator = std::function; @@ -71,20 +79,6 @@ class API_EXPORT CDataFrameBoostedTreeRunner final : public CDataFrameAnalysisRu TBoostedTreeUPtr m_BoostedTree; std::atomic m_Memory; }; - -//! \brief Makes a core::CDataFrame boosted tree regression runner. -class API_EXPORT CDataFrameBoostedTreeRunnerFactory final : public CDataFrameAnalysisRunnerFactory { -public: - const std::string& name() const override; - -private: - static const std::string NAME; - -private: - TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec) const override; - TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const override; -}; } } diff --git a/include/api/CDataFrameClassificationRunner.h b/include/api/CDataFrameClassificationRunner.h new file mode 100644 index 0000000000..f5974ef39c --- /dev/null +++ b/include/api/CDataFrameClassificationRunner.h @@ -0,0 +1,67 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_api_CDataFrameClassificationRunner_h +#define INCLUDED_ml_api_CDataFrameClassificationRunner_h + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +namespace ml { +namespace api { + +//! \brief Runs boosted tree classification on a core::CDataFrame. +class API_EXPORT CDataFrameClassificationRunner final : public CDataFrameBoostedTreeRunner { +public: + static const CDataFrameAnalysisConfigReader getParameterReader(); + + //! This is not intended to be called directly: use CDataFrameClassificationRunnerFactory. + CDataFrameClassificationRunner(const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters); + + //! This is not intended to be called directly: use CDataFrameClassificationRunnerFactory. + CDataFrameClassificationRunner(const CDataFrameAnalysisSpecification& spec); + + //! \return Indicator of columns for which empty value should be treated as missing. + TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const override; + + //! Write the prediction for \p row to \p writer. + void writeOneRow(const TStrVec& featureNames, + const TStrVecVec& categoricalFieldValues, + TRowRef row, + core::CRapidJsonConcurrentLineWriter& writer) const override; + +private: + std::size_t m_NumTopClasses; +}; + +//! \brief Makes a core::CDataFrame boosted tree classification runner. +class API_EXPORT CDataFrameClassificationRunnerFactory final + : public CDataFrameAnalysisRunnerFactory { +public: + const std::string& name() const override; + +private: + static const std::string NAME; + +private: + TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec) const override; + TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec, + const rapidjson::Value& jsonParameters) const override; +}; +} +} + +#endif // INCLUDED_ml_api_CDataFrameClassificationRunner_h diff --git a/include/api/CDataFrameOutliersRunner.h b/include/api/CDataFrameOutliersRunner.h index cda4bce5c6..86d40caf33 100644 --- a/include/api/CDataFrameOutliersRunner.h +++ b/include/api/CDataFrameOutliersRunner.h @@ -7,6 +7,7 @@ #ifndef INCLUDED_ml_api_CDataFrameOutliersRunner_h #define INCLUDED_ml_api_CDataFrameOutliersRunner_h +#include #include #include @@ -21,7 +22,7 @@ class API_EXPORT CDataFrameOutliersRunner final : public CDataFrameAnalysisRunne public: //! This is not intended to be called directly: use CDataFrameOutliersRunnerFactory. CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& jsonParameters); + const CDataFrameAnalysisConfigReader::CParameters& parameters); //! This is not intended to be called directly: use CDataFrameOutliersRunnerFactory. CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec); @@ -31,6 +32,7 @@ class API_EXPORT CDataFrameOutliersRunner final : public CDataFrameAnalysisRunne //! Write the extra columns of \p row added by outlier analysis to \p writer. void writeOneRow(const TStrVec& featureNames, + const TStrVecVec& categoricalFieldValues, TRowRef row, core::CRapidJsonConcurrentLineWriter& writer) const override; @@ -79,7 +81,7 @@ class API_EXPORT CDataFrameOutliersRunnerFactory final : public CDataFrameAnalys private: TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec) const override; TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const override; + const rapidjson::Value& jsonParameters) const override; }; } } diff --git a/include/api/CDataFrameRegressionRunner.h b/include/api/CDataFrameRegressionRunner.h new file mode 100644 index 0000000000..4c542987c8 --- /dev/null +++ b/include/api/CDataFrameRegressionRunner.h @@ -0,0 +1,59 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_api_CDataFrameRegressionRunner_h +#define INCLUDED_ml_api_CDataFrameRegressionRunner_h + +#include + +#include +#include +#include +#include + +#include + +#include + +namespace ml { +namespace api { + +//! \brief Runs boosted tree regression on a core::CDataFrame. +class API_EXPORT CDataFrameRegressionRunner final : public CDataFrameBoostedTreeRunner { +public: + static const CDataFrameAnalysisConfigReader getParameterReader(); + + //! This is not intended to be called directly: use CDataFrameRegressionRunnerFactory. + CDataFrameRegressionRunner(const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters); + + //! This is not intended to be called directly: use CDataFrameRegressionRunnerFactory. + CDataFrameRegressionRunner(const CDataFrameAnalysisSpecification& spec); + + //! Write the prediction for \p row to \p writer. + void writeOneRow(const TStrVec& featureNames, + const TStrVecVec& categoricalFieldValues, + TRowRef row, + core::CRapidJsonConcurrentLineWriter& writer) const override; +}; + +//! \brief Makes a core::CDataFrame boosted tree regression runner. +class API_EXPORT CDataFrameRegressionRunnerFactory final : public CDataFrameAnalysisRunnerFactory { +public: + const std::string& name() const override; + +private: + static const std::string NAME; + +private: + TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec) const override; + TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec, + const rapidjson::Value& jsonParameters) const override; +}; +} +} + +#endif // INCLUDED_ml_api_CDataFrameRegressionRunner_h diff --git a/lib/api/CDataFrameAnalysisRunner.cc b/lib/api/CDataFrameAnalysisRunner.cc index 2acf7264ab..b6c194b8ee 100644 --- a/lib/api/CDataFrameAnalysisRunner.cc +++ b/lib/api/CDataFrameAnalysisRunner.cc @@ -24,6 +24,8 @@ namespace ml { namespace api { namespace { +using TBoolVec = std::vector; + std::size_t maximumNumberPartitions(const CDataFrameAnalysisSpecification& spec) { // We limit the maximum number of partitions to rows^(1/2) because very // large numbers of partitions are going to be slow and it is better to tell @@ -43,6 +45,10 @@ CDataFrameAnalysisRunner::~CDataFrameAnalysisRunner() { this->waitToFinish(); } +TBoolVec CDataFrameAnalysisRunner::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const { + return TBoolVec(fieldNames.size(), false); +} + void CDataFrameAnalysisRunner::estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const { std::size_t numberRows{m_Spec.numberRows()}; std::size_t numberColumns{m_Spec.numberColumns() + this->numberExtraColumns()}; @@ -223,8 +229,8 @@ CDataFrameAnalysisRunnerFactory::make(const CDataFrameAnalysisSpecification& spe CDataFrameAnalysisRunnerFactory::TRunnerUPtr CDataFrameAnalysisRunnerFactory::make(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const { - auto result = this->makeImpl(spec, params); + const rapidjson::Value& jsonParameters) const { + auto result = this->makeImpl(spec, jsonParameters); result->computeAndSaveExecutionStrategy(); return result; } diff --git a/lib/api/CDataFrameAnalysisSpecification.cc b/lib/api/CDataFrameAnalysisSpecification.cc index 9dac982b77..28bdd70613 100644 --- a/lib/api/CDataFrameAnalysisSpecification.cc +++ b/lib/api/CDataFrameAnalysisSpecification.cc @@ -10,8 +10,9 @@ #include #include -#include +#include #include +#include #include #include @@ -41,11 +42,13 @@ const std::string CDataFrameAnalysisSpecification::NAME("name"); const std::string CDataFrameAnalysisSpecification::PARAMETERS("parameters"); namespace { +using TBoolVec = std::vector; using TRunnerFactoryUPtrVec = ml::api::CDataFrameAnalysisSpecification::TRunnerFactoryUPtrVec; TRunnerFactoryUPtrVec analysisFactories() { TRunnerFactoryUPtrVec factories; - factories.push_back(std::make_unique()); + factories.push_back(std::make_unique()); + factories.push_back(std::make_unique()); factories.push_back(std::make_unique()); // Add new analysis types here. return factories; @@ -206,6 +209,14 @@ void CDataFrameAnalysisSpecification::estimateMemoryUsage(CMemoryUsageEstimation m_Runner->estimateMemoryUsage(writer); } +TBoolVec CDataFrameAnalysisSpecification::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const { + if (m_Runner == nullptr) { + HANDLE_FATAL(<< "Internal error: no runner available. Please report this problem."); + return TBoolVec(fieldNames.size(), false); + } + return m_Runner->columnsForWhichEmptyIsMissing(fieldNames); +} + void CDataFrameAnalysisSpecification::initializeRunner(const rapidjson::Value& jsonAnalysis) { // We pass of the interpretation of the parameters object to the appropriate // analysis runner. diff --git a/lib/api/CDataFrameAnalyzer.cc b/lib/api/CDataFrameAnalyzer.cc index a61017ce62..43c28db4be 100644 --- a/lib/api/CDataFrameAnalyzer.cc +++ b/lib/api/CDataFrameAnalyzer.cc @@ -24,12 +24,36 @@ namespace ml { namespace api { namespace { using TStrVec = std::vector; +using TStrVecVec = std::vector; +using TStrSizeUMap = boost::unordered_map; +using TStrSizeUMapVec = std::vector; core::CFloatStorage truncateToFloatRange(double value) { double largest{static_cast(std::numeric_limits::max())}; return maths::CTools::truncate(value, -largest, largest); } +void mapToVector(const TStrSizeUMap& map, TStrVec& vector) { + assert(vector.empty()); + vector.resize(map.size()); + for (const auto& entry : map) { + std::size_t index = entry.second; + if (index >= vector.size()) { + HANDLE_FATAL(<< "Index out of bounds: " << index); + } else { + vector[index] = entry.first; + } + } +} + +void mapsToVectors(const TStrSizeUMapVec& maps, TStrVecVec& vectors) { + assert(vectors.empty()); + vectors.resize(maps.size()); + for (std::size_t i = 0; i < maps.size(); ++i) { + mapToVector(maps[i], vectors[i]); + } +} + const std::string SPECIAL_COLUMN_FIELD_NAME{"."}; // Control message types: @@ -278,6 +302,7 @@ void CDataFrameAnalyzer::captureFieldNames(const TStrVec& fieldNames) { if (m_FieldNames.empty()) { m_FieldNames.assign(fieldNames.begin() + m_BeginDataFieldValues, fieldNames.begin() + m_EndDataFieldValues); + m_EmptyAsMissing = m_AnalysisSpecification->columnsForWhichEmptyIsMissing(m_FieldNames); } } @@ -289,8 +314,11 @@ void CDataFrameAnalyzer::addRowToDataFrame(const TStrVec& fieldValues) { using TFloatVecItr = core::CDataFrame::TFloatVecItr; auto fieldToValue = [this](bool isCategorical, TStrSizeUMap& categoricalFields, - const std::string& fieldValue) { + bool emptyAsMissing, const std::string& fieldValue) { if (isCategorical) { + if (fieldValue.empty() && emptyAsMissing) { + return core::CFloatStorage{core::CDataFrame::valueOfMissing()}; + } // This encodes in a format suitable for efficient storage. The // actual encoding approach is chosen when the analysis runs. std::int64_t id; @@ -337,8 +365,8 @@ void CDataFrameAnalyzer::addRowToDataFrame(const TStrVec& fieldValues) { m_DataFrame->writeRow([&](TFloatVecItr columns, std::int32_t& docHash) { for (std::ptrdiff_t i = m_BeginDataFieldValues; i != m_EndDataFieldValues; ++i, ++columns) { - *columns = fieldToValue(isCategorical[i], - m_CategoricalFieldValues[i], fieldValues[i]); + *columns = fieldToValue(isCategorical[i], m_CategoricalFieldValues[i], + m_EmptyAsMissing[i], fieldValues[i]); } docHash = 0; if (m_DocHashFieldIndex != FIELD_MISSING && @@ -380,6 +408,11 @@ void CDataFrameAnalyzer::writeResultsOf(const CDataFrameAnalysisRunner& analysis // can join the extra columns with the original data frame. std::size_t numberThreads{1}; + // Change representation of categorical field values from map (category name -> index) + // to the vector of category names. + TStrVecVec categoricalFieldValues; + mapsToVectors(m_CategoricalFieldValues, categoricalFieldValues); + using TRowItr = core::CDataFrame::TRowItr; m_DataFrame->readRows(numberThreads, [&](TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { @@ -392,7 +425,7 @@ void CDataFrameAnalyzer::writeResultsOf(const CDataFrameAnalysisRunner& analysis writer.StartObject(); writer.Key(m_AnalysisSpecification->resultsField()); - analysis.writeOneRow(m_FieldNames, *row, writer); + analysis.writeOneRow(m_FieldNames, categoricalFieldValues, *row, writer); writer.EndObject(); writer.EndObject(); diff --git a/lib/api/CDataFrameBoostedTreeRunner.cc b/lib/api/CDataFrameBoostedTreeRunner.cc index 831d520754..e69858b65e 100644 --- a/lib/api/CDataFrameBoostedTreeRunner.cc +++ b/lib/api/CDataFrameBoostedTreeRunner.cc @@ -40,8 +40,9 @@ const std::string FEATURE_BAG_FRACTION{"feature_bag_fraction"}; const std::string NUMBER_FOLDS{"number_folds"}; const std::string NUMBER_ROUNDS_PER_HYPERPARAMETER{"number_rounds_per_hyperparameter"}; const std::string BAYESIAN_OPTIMISATION_RESTARTS{"bayesian_optimisation_restarts"}; +} -const CDataFrameAnalysisConfigReader PARAMETER_READER{[] { +CDataFrameAnalysisConfigReader CDataFrameBoostedTreeRunner::getParameterReader() { CDataFrameAnalysisConfigReader theReader; theReader.addParameter(DEPENDENT_VARIABLE_NAME, CDataFrameAnalysisConfigReader::E_RequiredParameter); @@ -65,18 +66,13 @@ const CDataFrameAnalysisConfigReader PARAMETER_READER{[] { theReader.addParameter(BAYESIAN_OPTIMISATION_RESTARTS, CDataFrameAnalysisConfigReader::E_OptionalParameter); return theReader; -}()}; - -// Output -const std::string IS_TRAINING_FIELD_NAME{"is_training"}; } -CDataFrameBoostedTreeRunner::CDataFrameBoostedTreeRunner(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& jsonParameters) +CDataFrameBoostedTreeRunner::CDataFrameBoostedTreeRunner( + const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters) : CDataFrameBoostedTreeRunner{spec} { - auto parameters = PARAMETER_READER.read(jsonParameters); - m_DependentVariableFieldName = parameters[DEPENDENT_VARIABLE_NAME].as(); m_PredictionFieldName = parameters[PREDICTION_FIELD_NAME].fallback( @@ -189,20 +185,19 @@ std::size_t CDataFrameBoostedTreeRunner::numberExtraColumns() const { return m_BoostedTreeFactory->numberExtraColumnsForTrain(); } -void CDataFrameBoostedTreeRunner::writeOneRow(const TStrVec&, - TRowRef row, - core::CRapidJsonConcurrentLineWriter& writer) const { +const std::string& CDataFrameBoostedTreeRunner::dependentVariableFieldName() const { + return m_DependentVariableFieldName; +} + +const std::string& CDataFrameBoostedTreeRunner::predictionFieldName() const { + return m_PredictionFieldName; +} + +const maths::CBoostedTree& CDataFrameBoostedTreeRunner::boostedTree() const { if (m_BoostedTree == nullptr) { HANDLE_FATAL(<< "Internal error: boosted tree object missing. Please report this error."); - } else { - writer.StartObject(); - writer.Key(m_PredictionFieldName); - writer.Double(row[m_BoostedTree->columnHoldingPrediction(row.numberColumns())]); - writer.Key(IS_TRAINING_FIELD_NAME); - writer.Bool(maths::CDataFrameUtils::isMissing( - row[m_BoostedTree->columnHoldingDependentVariable()]) == false); - writer.EndObject(); } + return *m_BoostedTree; } void CDataFrameBoostedTreeRunner::runImpl(const TStrVec& featureNames, @@ -284,22 +279,5 @@ std::size_t CDataFrameBoostedTreeRunner::estimateBookkeepingMemoryUsage( std::size_t numberColumns) const { return m_BoostedTreeFactory->estimateMemoryUsage(totalNumberRows, numberColumns); } - -const std::string& CDataFrameBoostedTreeRunnerFactory::name() const { - return NAME; -} - -CDataFrameBoostedTreeRunnerFactory::TRunnerUPtr -CDataFrameBoostedTreeRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec) const { - return std::make_unique(spec); -} - -CDataFrameBoostedTreeRunnerFactory::TRunnerUPtr -CDataFrameBoostedTreeRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const { - return std::make_unique(spec, params); -} - -const std::string CDataFrameBoostedTreeRunnerFactory::NAME{"regression"}; } } diff --git a/lib/api/CDataFrameClassificationRunner.cc b/lib/api/CDataFrameClassificationRunner.cc new file mode 100644 index 0000000000..7628c9e326 --- /dev/null +++ b/lib/api/CDataFrameClassificationRunner.cc @@ -0,0 +1,134 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace ml { +namespace api { +namespace { +using TBoolVec = std::vector; + +// Configuration +const std::string NUM_TOP_CLASSES{"num_top_classes"}; + +// Output +const std::string IS_TRAINING_FIELD_NAME{"is_training"}; +const std::string TOP_CLASSES_FIELD_NAME{"top_classes"}; +const std::string CLASS_NAME_FIELD_NAME{"class_name"}; +const std::string CLASS_PROBABILITY_FIELD_NAME{"class_probability"}; +} + +const CDataFrameAnalysisConfigReader CDataFrameClassificationRunner::getParameterReader() { + CDataFrameAnalysisConfigReader theReader{CDataFrameBoostedTreeRunner::getParameterReader()}; + theReader.addParameter(NUM_TOP_CLASSES, CDataFrameAnalysisConfigReader::E_OptionalParameter); + return theReader; +} + +CDataFrameClassificationRunner::CDataFrameClassificationRunner( + const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters) + : CDataFrameBoostedTreeRunner{spec, parameters} { + + m_NumTopClasses = parameters[NUM_TOP_CLASSES].fallback(std::size_t{0}); +} + +CDataFrameClassificationRunner::CDataFrameClassificationRunner(const CDataFrameAnalysisSpecification& spec) + : CDataFrameBoostedTreeRunner{spec} { +} + +// The only field for which empty value should be treated as missing is dependent variable +// which has empty value for non-training rows. +TBoolVec CDataFrameClassificationRunner::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const { + TBoolVec emptyAsMissing(fieldNames.size(), false); + auto pos = std::find(fieldNames.begin(), fieldNames.end(), + this->dependentVariableFieldName()); + if (pos != fieldNames.end()) { + emptyAsMissing[pos - fieldNames.begin()] = true; + } + return emptyAsMissing; +} + +void CDataFrameClassificationRunner::writeOneRow(const TStrVec&, + const TStrVecVec& categoricalFieldValues, + TRowRef row, + core::CRapidJsonConcurrentLineWriter& writer) const { + const auto& tree = this->boostedTree(); + const std::size_t columnHoldingDependentVariable{tree.columnHoldingDependentVariable()}; + const std::size_t columnHoldingPrediction{ + tree.columnHoldingPrediction(row.numberColumns())}; + const double dependentVariable{row[columnHoldingDependentVariable]}; + const std::uint64_t prediction{ + static_cast(std::lround(row[columnHoldingPrediction]))}; + if (prediction >= categoricalFieldValues[columnHoldingDependentVariable].size()) { + HANDLE_FATAL(<< "Index out of bounds: " << prediction); + } + const std::string& predictedClassName{ + categoricalFieldValues[columnHoldingDependentVariable][prediction]}; + + writer.StartObject(); + writer.Key(this->predictionFieldName()); + writer.String(predictedClassName); + writer.Key(IS_TRAINING_FIELD_NAME); + writer.Bool(maths::CDataFrameUtils::isMissing(dependentVariable) == false); + // TODO: Uncomment and adapt the code below once top classes feature is implemented + // See https://github.com/elastic/ml-cpp/issues/712 + /* + if (m_NumTopClasses > 0) { + writer.Key(TOP_CLASSES_FIELD_NAME); + writer.StartArray(); + for (std::size_t i = 0; i < m_NumTopClasses; i++) { + writer.StartObject(); + writer.Key(CLASS_NAME_FIELD_NAME); + writer.String(???); + writer.Key(CLASS_PROBABILITY_FIELD_NAME); + writer.Double(???); + writer.EndObject(); + } + writer.EndArray(); + } + */ + writer.EndObject(); +} + +const std::string& CDataFrameClassificationRunnerFactory::name() const { + return NAME; +} + +CDataFrameClassificationRunnerFactory::TRunnerUPtr +CDataFrameClassificationRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec) const { + return std::make_unique(spec); +} + +CDataFrameClassificationRunnerFactory::TRunnerUPtr +CDataFrameClassificationRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec, + const rapidjson::Value& jsonParameters) const { + CDataFrameAnalysisConfigReader parameterReader = + CDataFrameClassificationRunner::getParameterReader(); + auto parameters = parameterReader.read(jsonParameters); + return std::make_unique(spec, parameters); +} + +const std::string CDataFrameClassificationRunnerFactory::NAME{"classification"}; +} +} diff --git a/lib/api/CDataFrameOutliersRunner.cc b/lib/api/CDataFrameOutliersRunner.cc index 0f10a60792..f928ae255f 100644 --- a/lib/api/CDataFrameOutliersRunner.cc +++ b/lib/api/CDataFrameOutliersRunner.cc @@ -61,11 +61,11 @@ const std::string OUTLIER_SCORE_FIELD_NAME{"outlier_score"}; const std::string FEATURE_INFLUENCE_FIELD_NAME_PREFIX{"feature_influence."}; } -CDataFrameOutliersRunner::CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& jsonParameters) +CDataFrameOutliersRunner::CDataFrameOutliersRunner( + const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters) : CDataFrameOutliersRunner{spec} { - auto parameters = PARAMETER_READER.read(jsonParameters); m_StandardizeColumns = parameters[STANDARDIZE_COLUMNS].fallback(true); m_NumberNeighbours = parameters[N_NEIGHBORS].fallback(std::size_t{0}); m_Method = parameters[METHOD].fallback(maths::COutliers::E_Ensemble); @@ -84,6 +84,7 @@ std::size_t CDataFrameOutliersRunner::numberExtraColumns() const { } void CDataFrameOutliersRunner::writeOneRow(const TStrVec& featureNames, + const TStrVecVec&, TRowRef row, core::CRapidJsonConcurrentLineWriter& writer) const { std::size_t scoreColumn{row.numberColumns() - this->numberExtraColumns()}; @@ -157,8 +158,9 @@ CDataFrameOutliersRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& CDataFrameOutliersRunnerFactory::TRunnerUPtr CDataFrameOutliersRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const { - return std::make_unique(spec, params); + const rapidjson::Value& jsonParameters) const { + auto parameters = PARAMETER_READER.read(jsonParameters); + return std::make_unique(spec, parameters); } const std::string CDataFrameOutliersRunnerFactory::NAME{"outlier_detection"}; diff --git a/lib/api/CDataFrameRegressionRunner.cc b/lib/api/CDataFrameRegressionRunner.cc new file mode 100644 index 0000000000..c8d236e936 --- /dev/null +++ b/lib/api/CDataFrameRegressionRunner.cc @@ -0,0 +1,85 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace ml { +namespace api { +namespace { +// Output +const std::string IS_TRAINING_FIELD_NAME{"is_training"}; +} + +const CDataFrameAnalysisConfigReader CDataFrameRegressionRunner::getParameterReader() { + return CDataFrameBoostedTreeRunner::getParameterReader(); +} + +CDataFrameRegressionRunner::CDataFrameRegressionRunner( + const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters) + : CDataFrameBoostedTreeRunner{spec, parameters} { +} + +CDataFrameRegressionRunner::CDataFrameRegressionRunner(const CDataFrameAnalysisSpecification& spec) + : CDataFrameBoostedTreeRunner{spec} { +} + +void CDataFrameRegressionRunner::writeOneRow(const TStrVec&, + const TStrVecVec&, + TRowRef row, + core::CRapidJsonConcurrentLineWriter& writer) const { + const auto& tree = this->boostedTree(); + const std::size_t columnHoldingDependentVariable = tree.columnHoldingDependentVariable(); + const std::size_t columnHoldingPrediction = + tree.columnHoldingPrediction(row.numberColumns()); + + writer.StartObject(); + writer.Key(this->predictionFieldName()); + writer.Double(row[columnHoldingPrediction]); + writer.Key(IS_TRAINING_FIELD_NAME); + writer.Bool(maths::CDataFrameUtils::isMissing(row[columnHoldingDependentVariable]) == false); + writer.EndObject(); +} + +const std::string& CDataFrameRegressionRunnerFactory::name() const { + return NAME; +} + +CDataFrameRegressionRunnerFactory::TRunnerUPtr +CDataFrameRegressionRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec) const { + return std::make_unique(spec); +} + +CDataFrameRegressionRunnerFactory::TRunnerUPtr +CDataFrameRegressionRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec, + const rapidjson::Value& jsonParameters) const { + CDataFrameAnalysisConfigReader parameterReader = + CDataFrameRegressionRunner::getParameterReader(); + auto parameters = parameterReader.read(jsonParameters); + return std::make_unique(spec, parameters); +} + +const std::string CDataFrameRegressionRunnerFactory::NAME{"regression"}; +} +} diff --git a/lib/api/Makefile.first b/lib/api/Makefile.first index 093f0155ef..e00f4f0c8d 100644 --- a/lib/api/Makefile.first +++ b/lib/api/Makefile.first @@ -31,7 +31,9 @@ CDataFrameAnalysisSpecification.cc \ CDataFrameAnalysisSpecificationJsonWriter.cc \ CDataFrameAnalyzer.cc \ CDataFrameBoostedTreeRunner.cc \ +CDataFrameClassificationRunner.cc \ CDataFrameOutliersRunner.cc \ +CDataFrameRegressionRunner.cc \ CDataProcessor.cc \ CDataTyper.cc \ CDetectionRulesJsonParser.cc \ diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc index 5b6ab57a4c..ba8e814bcc 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc @@ -189,6 +189,36 @@ void CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000() { testEstimateMemoryUsage(1000, "450kB", "143kB", 0); } +void testColumnsForWhichEmptyIsMissing(const std::string& analysis, + bool expected_dependentVariableEmptyAsMissing) { + using TBoolVec = std::vector; + using TStrVec = std::vector; + + std::string parameters{"{\"dependent_variable\": \"label\"}"}; + std::string jsonSpec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( + "testJob", 10000, 5, 100000000, 1, {}, true, + test::CTestTmpDir::tmpDir(), "", analysis, parameters)}; + api::CDataFrameAnalysisSpecification spec{jsonSpec}; + + TStrVec fieldNames{"feature_1", "feature_2", "feature_3", "label"}; + TBoolVec emptyAsMissing{spec.columnsForWhichEmptyIsMissing(fieldNames)}; + + CPPUNIT_ASSERT_EQUAL(fieldNames.size(), emptyAsMissing.size()); + CPPUNIT_ASSERT_EQUAL(false, bool(emptyAsMissing[0])); + CPPUNIT_ASSERT_EQUAL(false, bool(emptyAsMissing[1])); + CPPUNIT_ASSERT_EQUAL(false, bool(emptyAsMissing[2])); + CPPUNIT_ASSERT_EQUAL(expected_dependentVariableEmptyAsMissing, + bool(emptyAsMissing[3])); +} + +void CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingClassification() { + testColumnsForWhichEmptyIsMissing("classification", true); +} + +void CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingRegression() { + testColumnsForWhichEmptyIsMissing("regression", false); +} + CppUnit::Test* CDataFrameAnalysisRunnerTest::suite() { CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CDataFrameAnalysisRunnerTest"); @@ -213,6 +243,12 @@ CppUnit::Test* CDataFrameAnalysisRunnerTest::suite() { suiteOfTests->addTest(new CppUnit::TestCaller( "CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000", &CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingClassification", + &CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingClassification)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingRegression", + &CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingRegression)); return suiteOfTests; } diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.h b/lib/api/unittest/CDataFrameAnalysisRunnerTest.h index 2b3b73d9f0..c6ba62cd17 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.h +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.h @@ -18,6 +18,8 @@ class CDataFrameAnalysisRunnerTest : public CppUnit::TestFixture { void testEstimateMemoryUsage_10(); void testEstimateMemoryUsage_100(); void testEstimateMemoryUsage_1000(); + void testColumnsForWhichEmptyIsMissingClassification(); + void testColumnsForWhichEmptyIsMissingRegression(); static CppUnit::Test* suite(); diff --git a/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc b/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc index c872da1f1f..93d706c92d 100644 --- a/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc @@ -13,8 +13,9 @@ #include #include #include -#include +#include #include +#include #include @@ -47,10 +48,13 @@ void CDataFrameAnalysisSpecificationTest::testCreate() { auto runnerFactories = []() { TRunnerFactoryUPtr outliers{std::make_unique()}; TRunnerFactoryUPtr regression{ - std::make_unique()}; + std::make_unique()}; + TRunnerFactoryUPtr classification{ + std::make_unique()}; TRunnerFactoryUPtrVec factories; factories.push_back(std::move(outliers)); factories.push_back(std::move(regression)); + factories.push_back(std::move(classification)); return factories; }; auto jsonSpec = [](const std::string& jobId, const std::string& rows, diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.cc b/lib/api/unittest/CDataFrameAnalyzerTest.cc index 8b11db920f..b87139108b 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTest.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,7 @@ using TDoubleVecVec = std::vector; using TSizeVec = std::vector; using TStrVec = std::vector; using TRowItr = core::CDataFrame::TRowItr; +using TRowRef = core::CDataFrame::TRowRef; using TPoint = maths::CDenseVector; using TPointVec = std::vector; using TDataFrameUPtr = std::unique_ptr; @@ -144,20 +146,21 @@ auto outlierSpec(std::size_t rows = 110, return std::make_unique(spec); } -auto regressionSpec(std::string dependentVariable, - std::size_t rows = 100, - std::size_t cols = 5, - std::size_t memoryLimit = 3000000, - std::size_t numberRoundsPerHyperparameter = 0, - std::size_t bayesianOptimisationRestarts = 0, - const TStrVec& categoricalFieldNames = TStrVec{}, - double lambda = -1.0, - double gamma = -1.0, - double eta = -1.0, - std::size_t maximumNumberTrees = 0, - double featureBagFraction = -1.0, - CDataFrameAnalyzerTest::TPersisterSupplier* persisterSupplier = nullptr, - CDataFrameAnalyzerTest::TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr) { +auto analysisSpec(std::string analysis, + std::string dependentVariable, + std::size_t rows = 100, + std::size_t cols = 5, + std::size_t memoryLimit = 3000000, + std::size_t numberRoundsPerHyperparameter = 0, + std::size_t bayesianOptimisationRestarts = 0, + const TStrVec& categoricalFieldNames = TStrVec{}, + double lambda = -1.0, + double gamma = -1.0, + double eta = -1.0, + std::size_t maximumNumberTrees = 0, + double featureBagFraction = -1.0, + CDataFrameAnalyzerTest::TPersisterSupplier* persisterSupplier = nullptr, + CDataFrameAnalyzerTest::TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr) { std::string parameters = "{\n\"dependent_variable\": \"" + dependentVariable + "\""; if (lambda >= 0.0) { @@ -189,7 +192,7 @@ auto regressionSpec(std::string dependentVariable, std::string spec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( "testJob", rows, cols, memoryLimit, 1, categoricalFieldNames, true, - test::CTestTmpDir::tmpDir(), "ml", "regression", parameters)}; + test::CTestTmpDir::tmpDir(), "ml", analysis, parameters)}; LOG_TRACE(<< "spec =\n" << spec); @@ -614,7 +617,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTraining() { TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; - api::CDataFrameAnalyzer analyzer{regressionSpec("c5"), outputWriterFactory}; + api::CDataFrameAnalyzer analyzer{analysisSpec("regression", "c5"), outputWriterFactory}; addRegressionTestData(fieldNames, fieldValues, analyzer, expectedPredictions); core::CStopWatch watch{true}; @@ -676,8 +679,8 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithParams() { }; api::CDataFrameAnalyzer analyzer{ - regressionSpec("c5", 100, 5, 3000000, 0, 0, {}, lambda, gamma, eta, - maximumNumberTrees, featureBagFraction), + analysisSpec("regression", "c5", 100, 5, 3000000, 0, 0, {}, lambda, + gamma, eta, maximumNumberTrees, featureBagFraction), outputWriterFactory}; TDoubleVec expectedPredictions; @@ -728,8 +731,8 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithRowsMissingTargetValu auto target = [](double feature) { return 10.0 * feature; }; - api::CDataFrameAnalyzer analyzer{regressionSpec("target", 50, 2, 2000000), - outputWriterFactory}; + api::CDataFrameAnalyzer analyzer{ + analysisSpec("regression", "target", 50, 2, 2000000), outputWriterFactory}; TDoubleVec feature; rng.generateUniformSamples(1.0, 3.0, 50, feature); @@ -946,7 +949,8 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { { api::CDataFrameAnalyzer analyzer{ - regressionSpec("x5", 1000, 5, 8000000, 0, 0, {"x1", "x2"}), outputWriterFactory}; + analysisSpec("regression", "x5", 1000, 5, 8000000, 0, 0, {"x1", "x2"}), + outputWriterFactory}; TStrVec x[]{{"x11", "x12", "x13", "x14", "x15"}, {"x21", "x22", "x23", "x24", "x25", "x26", "x27"}}; @@ -972,7 +976,7 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { passed &= (expected[1] == (*row)[1]); if (wasPassed && passed == false) { LOG_ERROR(<< "expected " << core::CContainerPrinter::print(expected) - << "got [" << (*row)[0] << ", " << (*row)[1] << "]"); + << ", got [" << (*row)[0] << ", " << (*row)[1] << "]"); } } }); @@ -984,9 +988,9 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { { std::size_t rows{api::CDataFrameAnalyzer::MAX_CATEGORICAL_CARDINALITY + 3}; - api::CDataFrameAnalyzer analyzer{ - regressionSpec("x5", rows, 5, 8000000000, 0, 0, {"x1"}, 0, 0, 0, 0, 0), - outputWriterFactory}; + api::CDataFrameAnalyzer analyzer{analysisSpec("regression", "x5", rows, 5, 8000000000, + 0, 0, {"x1"}, 0, 0, 0, 0, 0), + outputWriterFactory}; TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "", ""}; @@ -1009,7 +1013,7 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { bool wasPassed{passed}; passed &= (expected == (*row)[0]); if (wasPassed && passed == false) { - LOG_ERROR(<< "expected " << expected << " got " << (*row)[0]); + LOG_ERROR(<< "expected " << expected << ", got " << (*row)[0]); } } }); @@ -1018,6 +1022,70 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { } } +void CDataFrameAnalyzerTest::testCategoricalFieldsEmptyAsMissing() { + + auto eq = [](double expected) { + return [expected](double actual) { return expected == actual; }; + }; + + auto missing = []() { + return [](double actual) { + return maths::CDataFrameUtils::isMissing(actual); + }; + }; + + auto assertRow = [&](const std::size_t row_i, + const std::vector>& matchers, + const TRowRef& row) { + CPPUNIT_ASSERT_EQUAL_MESSAGE("row " + std::to_string(row_i), + matchers.size(), row.numberColumns()); + for (std::size_t i = 0; i < row.numberColumns(); ++i) { + CPPUNIT_ASSERT_MESSAGE("row " + std::to_string(row_i) + + ", column " + std::to_string(i), + matchers[i](row[i])); + } + }; + + std::stringstream output; + auto outputWriterFactory = [&output]() { + return std::make_unique(output); + }; + + api::CDataFrameAnalyzer analyzer{analysisSpec("classification", "x5", 1000, 5, + 8000000, 0, 0, {"x1", "x2", "x5"}), + outputWriterFactory}; + + TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; + analyzer.handleRecord(fieldNames, {"x11", "x21", "0", "0", "x51", "0", ""}); + analyzer.handleRecord(fieldNames, {"x12", "x22", "1", "1", "x52", "1", ""}); + analyzer.handleRecord(fieldNames, {"", "x23", "2", "2", "x51", "2", ""}); + analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", "", "3", ""}); + analyzer.handleRecord(fieldNames, {"x15", "x25", "4", "4", "x51", "4", ""}); + analyzer.handleRecord(fieldNames, {"x11", "x26", "5", "5", "x52", "5", ""}); + analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", "", "6", ""}); + analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", "", "7", ""}); + analyzer.handleRecord(fieldNames, {"x14", "x22", "8", "8", "x51", "8", ""}); + analyzer.handleRecord(fieldNames, {"", "x23", "9", "9", "x52", "9", ""}); + analyzer.receivedAllRows(); + + const core::CDataFrame& frame{analyzer.dataFrame()}; + frame.readRows(1, [&](TRowItr beginRows, TRowItr endRows) { + std::vector rows; + std::copy(beginRows, endRows, std::back_inserter(rows)); + CPPUNIT_ASSERT_EQUAL(std::size_t{10}, rows.size()); + assertRow(0, {eq(0.0), eq(0.0), eq(0.0), eq(0.0), eq(0.0)}, rows[0]); + assertRow(1, {eq(1.0), eq(1.0), eq(1.0), eq(1.0), eq(1.0)}, rows[1]); + assertRow(2, {eq(2.0), eq(2.0), eq(2.0), eq(2.0), eq(0.0)}, rows[2]); + assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), missing()}, rows[3]); + assertRow(4, {eq(4.0), eq(4.0), eq(4.0), eq(4.0), eq(0.0)}, rows[4]); + assertRow(5, {eq(0.0), eq(5.0), eq(5.0), eq(5.0), eq(1.0)}, rows[5]); + assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), missing()}, rows[6]); + assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), missing()}, rows[7]); + assertRow(8, {eq(3.0), eq(1.0), eq(8.0), eq(8.0), eq(0.0)}, rows[8]); + assertRow(9, {eq(2.0), eq(2.0), eq(9.0), eq(9.0), eq(1.0)}, rows[9]); + }); +} + CppUnit::Test* CDataFrameAnalyzerTest::suite() { CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CDataFrameAnalyzerTest"); @@ -1058,6 +1126,9 @@ CppUnit::Test* CDataFrameAnalyzerTest::suite() { suiteOfTests->addTest(new CppUnit::TestCaller( "CDataFrameAnalyzerTest::testCategoricalFields", &CDataFrameAnalyzerTest::testCategoricalFields)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CDataFrameAnalyzerTest::testCategoricalFieldsEmptyAsMissing", + &CDataFrameAnalyzerTest::testCategoricalFieldsEmptyAsMissing)); return suiteOfTests; } @@ -1137,9 +1208,9 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti // compute expected tree api::CDataFrameAnalyzer analyzer{ - regressionSpec("c5", numberExamples, 5, 15000000, - numberRoundsPerHyperparameter, 12, {}, lambda, gamma, eta, - maximumNumberTrees, featureBagFraction, &persisterSupplier), + analysisSpec("regression", "c5", numberExamples, 5, 15000000, + numberRoundsPerHyperparameter, 12, {}, lambda, gamma, eta, + maximumNumberTrees, featureBagFraction, &persisterSupplier), outputWriterFactory}; std::size_t dependentVariable( std::find(fieldNames.begin(), fieldNames.end(), "c5") - fieldNames.begin()); @@ -1161,9 +1232,9 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti }; api::CDataFrameAnalyzer analyzerToRestore{ - regressionSpec("c5", numberExamples, 5, 15000000, numberRoundsPerHyperparameter, - 12, {}, lambda, gamma, eta, maximumNumberTrees, featureBagFraction, - &persisterSupplier, &restoreSearcherSupplier), + analysisSpec("regression", "c5", numberExamples, 5, 15000000, numberRoundsPerHyperparameter, + 12, {}, lambda, gamma, eta, maximumNumberTrees, featureBagFraction, + &persisterSupplier, &restoreSearcherSupplier), outputWriterFactory}; passDataToAnalyzer(fieldNames, fieldValues, analyzerToRestore, weights, values); diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.h b/lib/api/unittest/CDataFrameAnalyzerTest.h index 7943653c08..bbf1ecaf49 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.h +++ b/lib/api/unittest/CDataFrameAnalyzerTest.h @@ -39,6 +39,7 @@ class CDataFrameAnalyzerTest : public CppUnit::TestFixture { void testErrors(); void testRoundTripDocHashes(); void testCategoricalFields(); + void testCategoricalFieldsEmptyAsMissing(); static CppUnit::Test* suite(); diff --git a/lib/api/unittest/CDataFrameMockAnalysisRunner.cc b/lib/api/unittest/CDataFrameMockAnalysisRunner.cc index acc0c4c0ce..895f2ffbac 100644 --- a/lib/api/unittest/CDataFrameMockAnalysisRunner.cc +++ b/lib/api/unittest/CDataFrameMockAnalysisRunner.cc @@ -18,6 +18,7 @@ std::size_t CDataFrameMockAnalysisRunner::numberExtraColumns() const { } void CDataFrameMockAnalysisRunner::writeOneRow(const TStrVec&, + const TStrVecVec&, TRowRef, ml::core::CRapidJsonConcurrentLineWriter&) const { } diff --git a/lib/api/unittest/CDataFrameMockAnalysisRunner.h b/lib/api/unittest/CDataFrameMockAnalysisRunner.h index 556f58099f..2be3859afc 100644 --- a/lib/api/unittest/CDataFrameMockAnalysisRunner.h +++ b/lib/api/unittest/CDataFrameMockAnalysisRunner.h @@ -20,6 +20,7 @@ class CDataFrameMockAnalysisRunner final : public ml::api::CDataFrameAnalysisRun std::size_t numberExtraColumns() const override; void writeOneRow(const TStrVec& featureNames, + const TStrVecVec& categoricalFieldValues, TRowRef, ml::core::CRapidJsonConcurrentLineWriter&) const override;