From 6ef21aa6676ef2a50fa469b42ea7f246450f7c4e Mon Sep 17 00:00:00 2001 From: Przemyslaw Witek Date: Thu, 26 Sep 2019 11:24:10 +0200 Subject: [PATCH 1/5] Make CDataFrameBoostedTreeRunner class abstract and derive two subclasses from it: CDataFrameRegressionRunner and CDataFrameClassificationRunner. --- include/api/CDataFrameAnalysisRunner.h | 9 +- include/api/CDataFrameAnalysisSpecification.h | 3 + include/api/CDataFrameAnalyzer.h | 1 + include/api/CDataFrameBoostedTreeRunner.h | 34 ++--- include/api/CDataFrameClassificationRunner.h | 67 ++++++++++ include/api/CDataFrameOutliersRunner.h | 6 +- include/api/CDataFrameRegressionRunner.h | 59 +++++++++ lib/api/CDataFrameAnalysisRunner.cc | 7 +- lib/api/CDataFrameAnalysisSpecification.cc | 14 +- lib/api/CDataFrameAnalyzer.cc | 44 +++++- lib/api/CDataFrameBoostedTreeRunner.cc | 53 +++----- lib/api/CDataFrameClassificationRunner.cc | 125 ++++++++++++++++++ lib/api/CDataFrameOutliersRunner.cc | 12 +- lib/api/CDataFrameRegressionRunner.cc | 84 ++++++++++++ lib/api/Makefile.first | 2 + .../CDataFrameAnalysisSpecificationTest.cc | 8 +- .../unittest/CDataFrameMockAnalysisRunner.cc | 1 + .../unittest/CDataFrameMockAnalysisRunner.h | 1 + 18 files changed, 453 insertions(+), 77 deletions(-) create mode 100644 include/api/CDataFrameClassificationRunner.h create mode 100644 include/api/CDataFrameRegressionRunner.h create mode 100644 lib/api/CDataFrameClassificationRunner.cc create mode 100644 lib/api/CDataFrameRegressionRunner.cc diff --git a/include/api/CDataFrameAnalysisRunner.h b/include/api/CDataFrameAnalysisRunner.h index 2a3214d41b..329d6e7543 100644 --- a/include/api/CDataFrameAnalysisRunner.h +++ b/include/api/CDataFrameAnalysisRunner.h @@ -60,6 +60,7 @@ class CMemoryUsageEstimationResultJsonWriter; class API_EXPORT CDataFrameAnalysisRunner { public: using TStrVec = std::vector; + using TStrVecVec = std::vector; using TRowRef = core::data_frame_detail::CRowRef; using TProgressRecorder = std::function; @@ -98,6 +99,9 @@ class API_EXPORT CDataFrameAnalysisRunner { //! \return The number of columns this analysis appends. virtual std::size_t numberExtraColumns() const = 0; + //! Fills in categorical field names for which empty value should be treated as missing. + virtual void columnsForWhichEmptyIsMissing(TStrVec& fieldNames) const; + //! Write the extra columns of \p row added by the analysis to \p writer. //! //! This should create a new object of the form: @@ -114,6 +118,7 @@ class API_EXPORT CDataFrameAnalysisRunner { //! \param[in] row The row to write the columns added by this analysis. //! \param[in,out] writer The stream to which to write the extra columns. virtual void writeOneRow(const TStrVec& featureNames, + const TStrVecVec& categoricalFieldValues, TRowRef row, core::CRapidJsonConcurrentLineWriter& writer) const = 0; @@ -189,12 +194,12 @@ class API_EXPORT CDataFrameAnalysisRunnerFactory { TRunnerUPtr make(const CDataFrameAnalysisSpecification& spec) const; TRunnerUPtr make(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const; + const rapidjson::Value& jsonParameters) const; private: virtual TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec) const = 0; virtual TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const = 0; + const rapidjson::Value& jsonParameters) const = 0; }; } } diff --git a/include/api/CDataFrameAnalysisSpecification.h b/include/api/CDataFrameAnalysisSpecification.h index 706b3ff9f7..55ade44b2a 100644 --- a/include/api/CDataFrameAnalysisSpecification.h +++ b/include/api/CDataFrameAnalysisSpecification.h @@ -173,6 +173,9 @@ class API_EXPORT CDataFrameAnalysisSpecification { //! 2. disk is used (only one partition needs to be loaded to main memory) void estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const; + //! Fills in categorical field names for which empty value should be treated as missing. + void columnsForWhichEmptyIsMissing(TStrVec& fieldNames) const; + //! \return shared pointer to the persistence stream. TDataAdderUPtr persister() const; diff --git a/include/api/CDataFrameAnalyzer.h b/include/api/CDataFrameAnalyzer.h index e76ddfde3d..a1606a9979 100644 --- a/include/api/CDataFrameAnalyzer.h +++ b/include/api/CDataFrameAnalyzer.h @@ -104,6 +104,7 @@ class API_EXPORT CDataFrameAnalyzer { TDataFrameAnalysisSpecificationUPtr m_AnalysisSpecification; TStrVec m_CategoricalFieldNames; TStrSizeUMapVec m_CategoricalFieldValues; + std::set m_EmptyAsMissing; TDataFrameUPtr m_DataFrame; TStrVec m_FieldNames; TTemporaryDirectoryPtr m_DataFrameDirectory; diff --git a/include/api/CDataFrameBoostedTreeRunner.h b/include/api/CDataFrameBoostedTreeRunner.h index 13b0a6863f..f657db117a 100644 --- a/include/api/CDataFrameBoostedTreeRunner.h +++ b/include/api/CDataFrameBoostedTreeRunner.h @@ -9,6 +9,7 @@ #include +#include #include #include #include @@ -25,11 +26,11 @@ class CBoostedTreeFactory; namespace api { //! \brief Runs boosted tree regression on a core::CDataFrame. -class API_EXPORT CDataFrameBoostedTreeRunner final : public CDataFrameAnalysisRunner { +class API_EXPORT CDataFrameBoostedTreeRunner : public CDataFrameAnalysisRunner { public: //! This is not intended to be called directly: use CDataFrameBoostedTreeRunnerFactory. CDataFrameBoostedTreeRunner(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& jsonParameters); + const CDataFrameAnalysisConfigReader::CParameters& parameters); //! This is not intended to be called directly: use CDataFrameBoostedTreeRunnerFactory. CDataFrameBoostedTreeRunner(const CDataFrameAnalysisSpecification& spec); @@ -39,17 +40,22 @@ class API_EXPORT CDataFrameBoostedTreeRunner final : public CDataFrameAnalysisRu //! \return The number of columns this adds to the data frame. std::size_t numberExtraColumns() const override; - //! Write the prediction for \p row to \p writer. - void writeOneRow(const TStrVec& featureNames, - TRowRef row, - core::CRapidJsonConcurrentLineWriter& writer) const override; - private: using TBoostedTreeUPtr = std::unique_ptr; using TBoostedTreeFactoryUPtr = std::unique_ptr; using TDataSearcherUPtr = CDataFrameAnalysisSpecification::TDataSearcherUPtr; using TMemoryEstimator = std::function; +protected: + //! Parameter reader handling parameters that are shared by subclasses. + static CDataFrameAnalysisConfigReader getParameterReader(); + //! Name of dependent variable field. + const std::string& dependentVariableFieldName() const; + //! Name of prediction field. + const std::string& predictionFieldName() const; + //! Underlying boosted tree. + const TBoostedTreeUPtr& boostedTree() const; + private: void runImpl(const TStrVec& featureNames, core::CDataFrame& frame) override; std::size_t estimateBookkeepingMemoryUsage(std::size_t numberPartitions, @@ -71,20 +77,6 @@ class API_EXPORT CDataFrameBoostedTreeRunner final : public CDataFrameAnalysisRu TBoostedTreeUPtr m_BoostedTree; std::atomic m_Memory; }; - -//! \brief Makes a core::CDataFrame boosted tree regression runner. -class API_EXPORT CDataFrameBoostedTreeRunnerFactory final : public CDataFrameAnalysisRunnerFactory { -public: - const std::string& name() const override; - -private: - static const std::string NAME; - -private: - TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec) const override; - TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const override; -}; } } diff --git a/include/api/CDataFrameClassificationRunner.h b/include/api/CDataFrameClassificationRunner.h new file mode 100644 index 0000000000..d15172d2d8 --- /dev/null +++ b/include/api/CDataFrameClassificationRunner.h @@ -0,0 +1,67 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_api_CDataFrameClassificationRunner_h +#define INCLUDED_ml_api_CDataFrameClassificationRunner_h + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +namespace ml { +namespace api { + +//! \brief Runs boosted tree classification on a core::CDataFrame. +class API_EXPORT CDataFrameClassificationRunner final : public CDataFrameBoostedTreeRunner { +public: + static const CDataFrameAnalysisConfigReader getParameterReader(); + + //! This is not intended to be called directly: use CDataFrameClassificationRunnerFactory. + CDataFrameClassificationRunner(const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters); + + //! This is not intended to be called directly: use CDataFrameClassificationRunnerFactory. + CDataFrameClassificationRunner(const CDataFrameAnalysisSpecification& spec); + + //! Fills in categorical field names for which empty value should be treated as missing. + void columnsForWhichEmptyIsMissing(TStrVec& fieldNames) const override; + + //! Write the prediction for \p row to \p writer. + void writeOneRow(const TStrVec& featureNames, + const TStrVecVec& categoricalFieldValues, + TRowRef row, + core::CRapidJsonConcurrentLineWriter& writer) const override; + +private: + std::size_t m_NumTopClasses; +}; + +//! \brief Makes a core::CDataFrame boosted tree classification runner. +class API_EXPORT CDataFrameClassificationRunnerFactory final + : public CDataFrameAnalysisRunnerFactory { +public: + const std::string& name() const override; + +private: + static const std::string NAME; + +private: + TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec) const override; + TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec, + const rapidjson::Value& jsonParameters) const override; +}; +} +} + +#endif // INCLUDED_ml_api_CDataFrameClassificationRunner_h diff --git a/include/api/CDataFrameOutliersRunner.h b/include/api/CDataFrameOutliersRunner.h index cda4bce5c6..86d40caf33 100644 --- a/include/api/CDataFrameOutliersRunner.h +++ b/include/api/CDataFrameOutliersRunner.h @@ -7,6 +7,7 @@ #ifndef INCLUDED_ml_api_CDataFrameOutliersRunner_h #define INCLUDED_ml_api_CDataFrameOutliersRunner_h +#include #include #include @@ -21,7 +22,7 @@ class API_EXPORT CDataFrameOutliersRunner final : public CDataFrameAnalysisRunne public: //! This is not intended to be called directly: use CDataFrameOutliersRunnerFactory. CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& jsonParameters); + const CDataFrameAnalysisConfigReader::CParameters& parameters); //! This is not intended to be called directly: use CDataFrameOutliersRunnerFactory. CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec); @@ -31,6 +32,7 @@ class API_EXPORT CDataFrameOutliersRunner final : public CDataFrameAnalysisRunne //! Write the extra columns of \p row added by outlier analysis to \p writer. void writeOneRow(const TStrVec& featureNames, + const TStrVecVec& categoricalFieldValues, TRowRef row, core::CRapidJsonConcurrentLineWriter& writer) const override; @@ -79,7 +81,7 @@ class API_EXPORT CDataFrameOutliersRunnerFactory final : public CDataFrameAnalys private: TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec) const override; TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const override; + const rapidjson::Value& jsonParameters) const override; }; } } diff --git a/include/api/CDataFrameRegressionRunner.h b/include/api/CDataFrameRegressionRunner.h new file mode 100644 index 0000000000..4c542987c8 --- /dev/null +++ b/include/api/CDataFrameRegressionRunner.h @@ -0,0 +1,59 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#ifndef INCLUDED_ml_api_CDataFrameRegressionRunner_h +#define INCLUDED_ml_api_CDataFrameRegressionRunner_h + +#include + +#include +#include +#include +#include + +#include + +#include + +namespace ml { +namespace api { + +//! \brief Runs boosted tree regression on a core::CDataFrame. +class API_EXPORT CDataFrameRegressionRunner final : public CDataFrameBoostedTreeRunner { +public: + static const CDataFrameAnalysisConfigReader getParameterReader(); + + //! This is not intended to be called directly: use CDataFrameRegressionRunnerFactory. + CDataFrameRegressionRunner(const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters); + + //! This is not intended to be called directly: use CDataFrameRegressionRunnerFactory. + CDataFrameRegressionRunner(const CDataFrameAnalysisSpecification& spec); + + //! Write the prediction for \p row to \p writer. + void writeOneRow(const TStrVec& featureNames, + const TStrVecVec& categoricalFieldValues, + TRowRef row, + core::CRapidJsonConcurrentLineWriter& writer) const override; +}; + +//! \brief Makes a core::CDataFrame boosted tree regression runner. +class API_EXPORT CDataFrameRegressionRunnerFactory final : public CDataFrameAnalysisRunnerFactory { +public: + const std::string& name() const override; + +private: + static const std::string NAME; + +private: + TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec) const override; + TRunnerUPtr makeImpl(const CDataFrameAnalysisSpecification& spec, + const rapidjson::Value& jsonParameters) const override; +}; +} +} + +#endif // INCLUDED_ml_api_CDataFrameRegressionRunner_h diff --git a/lib/api/CDataFrameAnalysisRunner.cc b/lib/api/CDataFrameAnalysisRunner.cc index 2acf7264ab..26440e937a 100644 --- a/lib/api/CDataFrameAnalysisRunner.cc +++ b/lib/api/CDataFrameAnalysisRunner.cc @@ -43,6 +43,9 @@ CDataFrameAnalysisRunner::~CDataFrameAnalysisRunner() { this->waitToFinish(); } +void CDataFrameAnalysisRunner::columnsForWhichEmptyIsMissing(TStrVec&) const { +} + void CDataFrameAnalysisRunner::estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const { std::size_t numberRows{m_Spec.numberRows()}; std::size_t numberColumns{m_Spec.numberColumns() + this->numberExtraColumns()}; @@ -223,8 +226,8 @@ CDataFrameAnalysisRunnerFactory::make(const CDataFrameAnalysisSpecification& spe CDataFrameAnalysisRunnerFactory::TRunnerUPtr CDataFrameAnalysisRunnerFactory::make(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const { - auto result = this->makeImpl(spec, params); + const rapidjson::Value& jsonParameters) const { + auto result = this->makeImpl(spec, jsonParameters); result->computeAndSaveExecutionStrategy(); return result; } diff --git a/lib/api/CDataFrameAnalysisSpecification.cc b/lib/api/CDataFrameAnalysisSpecification.cc index 9dac982b77..a09b4a67cd 100644 --- a/lib/api/CDataFrameAnalysisSpecification.cc +++ b/lib/api/CDataFrameAnalysisSpecification.cc @@ -10,8 +10,9 @@ #include #include -#include +#include #include +#include #include #include @@ -45,7 +46,8 @@ using TRunnerFactoryUPtrVec = ml::api::CDataFrameAnalysisSpecification::TRunnerF TRunnerFactoryUPtrVec analysisFactories() { TRunnerFactoryUPtrVec factories; - factories.push_back(std::make_unique()); + factories.push_back(std::make_unique()); + factories.push_back(std::make_unique()); factories.push_back(std::make_unique()); // Add new analysis types here. return factories; @@ -206,6 +208,14 @@ void CDataFrameAnalysisSpecification::estimateMemoryUsage(CMemoryUsageEstimation m_Runner->estimateMemoryUsage(writer); } +void CDataFrameAnalysisSpecification::columnsForWhichEmptyIsMissing(TStrVec& fieldNames) const { + if (m_Runner == nullptr) { + HANDLE_FATAL(<< "Internal error: no runner available. Please report this problem."); + return; + } + m_Runner->columnsForWhichEmptyIsMissing(fieldNames); +} + void CDataFrameAnalysisSpecification::initializeRunner(const rapidjson::Value& jsonAnalysis) { // We pass of the interpretation of the parameters object to the appropriate // analysis runner. diff --git a/lib/api/CDataFrameAnalyzer.cc b/lib/api/CDataFrameAnalyzer.cc index a61017ce62..d033cf81ce 100644 --- a/lib/api/CDataFrameAnalyzer.cc +++ b/lib/api/CDataFrameAnalyzer.cc @@ -24,6 +24,9 @@ namespace ml { namespace api { namespace { using TStrVec = std::vector; +using TStrVecVec = std::vector; +using TStrSizeUMap = boost::unordered_map; +using TStrSizeUMapVec = std::vector; core::CFloatStorage truncateToFloatRange(double value) { double largest{static_cast(std::numeric_limits::max())}; @@ -54,6 +57,10 @@ CDataFrameAnalyzer::CDataFrameAnalyzer(TDataFrameAnalysisSpecificationUPtr analy auto frameAndDirectory = m_AnalysisSpecification->makeDataFrame(); m_DataFrame = std::move(frameAndDirectory.first); m_DataFrameDirectory = frameAndDirectory.second; + TStrVec emptyAsMissing; + m_AnalysisSpecification->columnsForWhichEmptyIsMissing(emptyAsMissing); + m_EmptyAsMissing = + std::set(emptyAsMissing.begin(), emptyAsMissing.end()); } } @@ -289,8 +296,11 @@ void CDataFrameAnalyzer::addRowToDataFrame(const TStrVec& fieldValues) { using TFloatVecItr = core::CDataFrame::TFloatVecItr; auto fieldToValue = [this](bool isCategorical, TStrSizeUMap& categoricalFields, - const std::string& fieldValue) { + bool emptyAsMissing, const std::string& fieldValue) { if (isCategorical) { + if (fieldValue.empty() && emptyAsMissing) { + return core::CFloatStorage{core::CDataFrame::valueOfMissing()}; + } // This encodes in a format suitable for efficient storage. The // actual encoding approach is chosen when the analysis runs. std::int64_t id; @@ -337,8 +347,9 @@ void CDataFrameAnalyzer::addRowToDataFrame(const TStrVec& fieldValues) { m_DataFrame->writeRow([&](TFloatVecItr columns, std::int32_t& docHash) { for (std::ptrdiff_t i = m_BeginDataFieldValues; i != m_EndDataFieldValues; ++i, ++columns) { - *columns = fieldToValue(isCategorical[i], - m_CategoricalFieldValues[i], fieldValues[i]); + *columns = fieldToValue(isCategorical[i], m_CategoricalFieldValues[i], + m_EmptyAsMissing.count(m_FieldNames[i]) > 0, + fieldValues[i]); } docHash = 0; if (m_DocHashFieldIndex != FIELD_MISSING && @@ -373,6 +384,26 @@ void CDataFrameAnalyzer::writeProgress(int progress, writer.flush(); } +void mapToVector(const TStrSizeUMap& map, TStrVec& vector) { + assert(vector.empty()); + vector.resize(map.size()); + for (const auto& entry : map) { + std::size_t index = entry.second; + if (index >= vector.size()) { + HANDLE_FATAL(<< "Index out of bounds: " << index); + } + vector[index] = entry.first; + } +} + +void mapsToVectors(const TStrSizeUMapVec& maps, TStrVecVec& vectors) { + assert(vectors.empty()); + vectors.resize(maps.size()); + for (std::size_t i = 0; i < maps.size(); i++) { + mapToVector(maps[i], vectors[i]); + } +} + void CDataFrameAnalyzer::writeResultsOf(const CDataFrameAnalysisRunner& analysis, core::CRapidJsonConcurrentLineWriter& writer) const { // We write results single threaded because we need to write the rows to @@ -380,6 +411,11 @@ void CDataFrameAnalyzer::writeResultsOf(const CDataFrameAnalysisRunner& analysis // can join the extra columns with the original data frame. std::size_t numberThreads{1}; + // Change representation of categorical field values from map (category name -> index) + // to the vector of category names. + TStrVecVec categoricalFieldValues; + mapsToVectors(m_CategoricalFieldValues, categoricalFieldValues); + using TRowItr = core::CDataFrame::TRowItr; m_DataFrame->readRows(numberThreads, [&](TRowItr beginRows, TRowItr endRows) { for (auto row = beginRows; row != endRows; ++row) { @@ -392,7 +428,7 @@ void CDataFrameAnalyzer::writeResultsOf(const CDataFrameAnalysisRunner& analysis writer.StartObject(); writer.Key(m_AnalysisSpecification->resultsField()); - analysis.writeOneRow(m_FieldNames, *row, writer); + analysis.writeOneRow(m_FieldNames, categoricalFieldValues, *row, writer); writer.EndObject(); writer.EndObject(); diff --git a/lib/api/CDataFrameBoostedTreeRunner.cc b/lib/api/CDataFrameBoostedTreeRunner.cc index 831d520754..70074be3e1 100644 --- a/lib/api/CDataFrameBoostedTreeRunner.cc +++ b/lib/api/CDataFrameBoostedTreeRunner.cc @@ -40,8 +40,9 @@ const std::string FEATURE_BAG_FRACTION{"feature_bag_fraction"}; const std::string NUMBER_FOLDS{"number_folds"}; const std::string NUMBER_ROUNDS_PER_HYPERPARAMETER{"number_rounds_per_hyperparameter"}; const std::string BAYESIAN_OPTIMISATION_RESTARTS{"bayesian_optimisation_restarts"}; +} -const CDataFrameAnalysisConfigReader PARAMETER_READER{[] { +CDataFrameAnalysisConfigReader CDataFrameBoostedTreeRunner::getParameterReader() { CDataFrameAnalysisConfigReader theReader; theReader.addParameter(DEPENDENT_VARIABLE_NAME, CDataFrameAnalysisConfigReader::E_RequiredParameter); @@ -65,18 +66,13 @@ const CDataFrameAnalysisConfigReader PARAMETER_READER{[] { theReader.addParameter(BAYESIAN_OPTIMISATION_RESTARTS, CDataFrameAnalysisConfigReader::E_OptionalParameter); return theReader; -}()}; - -// Output -const std::string IS_TRAINING_FIELD_NAME{"is_training"}; } -CDataFrameBoostedTreeRunner::CDataFrameBoostedTreeRunner(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& jsonParameters) +CDataFrameBoostedTreeRunner::CDataFrameBoostedTreeRunner( + const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters) : CDataFrameBoostedTreeRunner{spec} { - auto parameters = PARAMETER_READER.read(jsonParameters); - m_DependentVariableFieldName = parameters[DEPENDENT_VARIABLE_NAME].as(); m_PredictionFieldName = parameters[PREDICTION_FIELD_NAME].fallback( @@ -189,20 +185,20 @@ std::size_t CDataFrameBoostedTreeRunner::numberExtraColumns() const { return m_BoostedTreeFactory->numberExtraColumnsForTrain(); } -void CDataFrameBoostedTreeRunner::writeOneRow(const TStrVec&, - TRowRef row, - core::CRapidJsonConcurrentLineWriter& writer) const { +const std::string& CDataFrameBoostedTreeRunner::dependentVariableFieldName() const { + return m_DependentVariableFieldName; +} + +const std::string& CDataFrameBoostedTreeRunner::predictionFieldName() const { + return m_PredictionFieldName; +} + +const CDataFrameBoostedTreeRunner::TBoostedTreeUPtr& +CDataFrameBoostedTreeRunner::boostedTree() const { if (m_BoostedTree == nullptr) { HANDLE_FATAL(<< "Internal error: boosted tree object missing. Please report this error."); - } else { - writer.StartObject(); - writer.Key(m_PredictionFieldName); - writer.Double(row[m_BoostedTree->columnHoldingPrediction(row.numberColumns())]); - writer.Key(IS_TRAINING_FIELD_NAME); - writer.Bool(maths::CDataFrameUtils::isMissing( - row[m_BoostedTree->columnHoldingDependentVariable()]) == false); - writer.EndObject(); } + return m_BoostedTree; } void CDataFrameBoostedTreeRunner::runImpl(const TStrVec& featureNames, @@ -284,22 +280,5 @@ std::size_t CDataFrameBoostedTreeRunner::estimateBookkeepingMemoryUsage( std::size_t numberColumns) const { return m_BoostedTreeFactory->estimateMemoryUsage(totalNumberRows, numberColumns); } - -const std::string& CDataFrameBoostedTreeRunnerFactory::name() const { - return NAME; -} - -CDataFrameBoostedTreeRunnerFactory::TRunnerUPtr -CDataFrameBoostedTreeRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec) const { - return std::make_unique(spec); -} - -CDataFrameBoostedTreeRunnerFactory::TRunnerUPtr -CDataFrameBoostedTreeRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const { - return std::make_unique(spec, params); -} - -const std::string CDataFrameBoostedTreeRunnerFactory::NAME{"regression"}; } } diff --git a/lib/api/CDataFrameClassificationRunner.cc b/lib/api/CDataFrameClassificationRunner.cc new file mode 100644 index 0000000000..324b057f59 --- /dev/null +++ b/lib/api/CDataFrameClassificationRunner.cc @@ -0,0 +1,125 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace ml { +namespace api { +namespace { +// Configuration +const std::string NUM_TOP_CLASSES{"num_top_classes"}; + +// Output +const std::string IS_TRAINING_FIELD_NAME{"is_training"}; +const std::string TOP_CLASSES_FIELD_NAME{"top_classes"}; +const std::string CLASS_NAME_FIELD_NAME{"class_name"}; +const std::string CLASS_PROBABILITY_FIELD_NAME{"class_probability"}; +} + +const CDataFrameAnalysisConfigReader CDataFrameClassificationRunner::getParameterReader() { + CDataFrameAnalysisConfigReader theReader = + CDataFrameBoostedTreeRunner::getParameterReader(); + theReader.addParameter(NUM_TOP_CLASSES, CDataFrameAnalysisConfigReader::E_OptionalParameter); + return theReader; +} + +CDataFrameClassificationRunner::CDataFrameClassificationRunner( + const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters) + : CDataFrameBoostedTreeRunner{spec, parameters} { + + m_NumTopClasses = parameters[NUM_TOP_CLASSES].fallback(std::size_t{0}); +} + +CDataFrameClassificationRunner::CDataFrameClassificationRunner(const CDataFrameAnalysisSpecification& spec) + : CDataFrameBoostedTreeRunner{spec} { +} + +// The only field for which empty value should be treated as missing is dependent variable +// which has empty value for non-training rows. +void CDataFrameClassificationRunner::columnsForWhichEmptyIsMissing(TStrVec& fieldNames) const { + fieldNames.push_back(dependentVariableFieldName()); +} + +void CDataFrameClassificationRunner::writeOneRow(const TStrVec&, + const TStrVecVec& categoricalFieldValues, + TRowRef row, + core::CRapidJsonConcurrentLineWriter& writer) const { + const auto& tree{boostedTree()}; + std::size_t columnHoldingDependentVariable = tree->columnHoldingDependentVariable(); + std::size_t columnHoldingPrediction = tree->columnHoldingPrediction(row.numberColumns()); + const double dependentVariable = row[columnHoldingDependentVariable]; + const std::uint64_t prediction = std::lround(row[columnHoldingPrediction]); + if (prediction >= categoricalFieldValues[columnHoldingDependentVariable].size()) { + HANDLE_FATAL(<< "Index out of bounds: " << prediction); + } + std::string predictedClassName = + categoricalFieldValues[columnHoldingDependentVariable][prediction]; + + writer.StartObject(); + writer.Key(predictionFieldName()); + writer.String(predictedClassName); + writer.Key(IS_TRAINING_FIELD_NAME); + writer.Bool(maths::CDataFrameUtils::isMissing(dependentVariable) == false); + // TODO: Uncomment and adapt the code below once top classes feature is implemented + // See https://github.com/elastic/ml-cpp/issues/712 + /* + if (m_NumTopClasses > 0) { + writer.Key(TOP_CLASSES_FIELD_NAME); + writer.StartArray(); + for (std::size_t i = 0; i < m_NumTopClasses; i++) { + writer.StartObject(); + writer.Key(CLASS_NAME_FIELD_NAME); + writer.String(???); + writer.Key(CLASS_PROBABILITY_FIELD_NAME); + writer.Double(???); + writer.EndObject(); + } + writer.EndArray(); + } + */ + writer.EndObject(); +} + +const std::string& CDataFrameClassificationRunnerFactory::name() const { + return NAME; +} + +CDataFrameClassificationRunnerFactory::TRunnerUPtr +CDataFrameClassificationRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec) const { + return std::make_unique(spec); +} + +CDataFrameClassificationRunnerFactory::TRunnerUPtr +CDataFrameClassificationRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec, + const rapidjson::Value& jsonParameters) const { + CDataFrameAnalysisConfigReader parameterReader = + CDataFrameClassificationRunner::getParameterReader(); + auto parameters = parameterReader.read(jsonParameters); + return std::make_unique(spec, parameters); +} + +const std::string CDataFrameClassificationRunnerFactory::NAME{"classification"}; +} +} diff --git a/lib/api/CDataFrameOutliersRunner.cc b/lib/api/CDataFrameOutliersRunner.cc index 0f10a60792..f928ae255f 100644 --- a/lib/api/CDataFrameOutliersRunner.cc +++ b/lib/api/CDataFrameOutliersRunner.cc @@ -61,11 +61,11 @@ const std::string OUTLIER_SCORE_FIELD_NAME{"outlier_score"}; const std::string FEATURE_INFLUENCE_FIELD_NAME_PREFIX{"feature_influence."}; } -CDataFrameOutliersRunner::CDataFrameOutliersRunner(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& jsonParameters) +CDataFrameOutliersRunner::CDataFrameOutliersRunner( + const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters) : CDataFrameOutliersRunner{spec} { - auto parameters = PARAMETER_READER.read(jsonParameters); m_StandardizeColumns = parameters[STANDARDIZE_COLUMNS].fallback(true); m_NumberNeighbours = parameters[N_NEIGHBORS].fallback(std::size_t{0}); m_Method = parameters[METHOD].fallback(maths::COutliers::E_Ensemble); @@ -84,6 +84,7 @@ std::size_t CDataFrameOutliersRunner::numberExtraColumns() const { } void CDataFrameOutliersRunner::writeOneRow(const TStrVec& featureNames, + const TStrVecVec&, TRowRef row, core::CRapidJsonConcurrentLineWriter& writer) const { std::size_t scoreColumn{row.numberColumns() - this->numberExtraColumns()}; @@ -157,8 +158,9 @@ CDataFrameOutliersRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& CDataFrameOutliersRunnerFactory::TRunnerUPtr CDataFrameOutliersRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec, - const rapidjson::Value& params) const { - return std::make_unique(spec, params); + const rapidjson::Value& jsonParameters) const { + auto parameters = PARAMETER_READER.read(jsonParameters); + return std::make_unique(spec, parameters); } const std::string CDataFrameOutliersRunnerFactory::NAME{"outlier_detection"}; diff --git a/lib/api/CDataFrameRegressionRunner.cc b/lib/api/CDataFrameRegressionRunner.cc new file mode 100644 index 0000000000..79974777de --- /dev/null +++ b/lib/api/CDataFrameRegressionRunner.cc @@ -0,0 +1,84 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include + +namespace ml { +namespace api { +namespace { +// Output +const std::string IS_TRAINING_FIELD_NAME{"is_training"}; +} + +const CDataFrameAnalysisConfigReader CDataFrameRegressionRunner::getParameterReader() { + return CDataFrameBoostedTreeRunner::getParameterReader(); +} + +CDataFrameRegressionRunner::CDataFrameRegressionRunner( + const CDataFrameAnalysisSpecification& spec, + const CDataFrameAnalysisConfigReader::CParameters& parameters) + : CDataFrameBoostedTreeRunner{spec, parameters} { +} + +CDataFrameRegressionRunner::CDataFrameRegressionRunner(const CDataFrameAnalysisSpecification& spec) + : CDataFrameBoostedTreeRunner{spec} { +} + +void CDataFrameRegressionRunner::writeOneRow(const TStrVec&, + const TStrVecVec&, + TRowRef row, + core::CRapidJsonConcurrentLineWriter& writer) const { + const auto& tree{boostedTree()}; + std::size_t columnHoldingDependentVariable = tree->columnHoldingDependentVariable(); + std::size_t columnHoldingPrediction = tree->columnHoldingPrediction(row.numberColumns()); + + writer.StartObject(); + writer.Key(predictionFieldName()); + writer.Double(row[columnHoldingPrediction]); + writer.Key(IS_TRAINING_FIELD_NAME); + writer.Bool(maths::CDataFrameUtils::isMissing(row[columnHoldingDependentVariable]) == false); + writer.EndObject(); +} + +const std::string& CDataFrameRegressionRunnerFactory::name() const { + return NAME; +} + +CDataFrameRegressionRunnerFactory::TRunnerUPtr +CDataFrameRegressionRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec) const { + return std::make_unique(spec); +} + +CDataFrameRegressionRunnerFactory::TRunnerUPtr +CDataFrameRegressionRunnerFactory::makeImpl(const CDataFrameAnalysisSpecification& spec, + const rapidjson::Value& jsonParameters) const { + CDataFrameAnalysisConfigReader parameterReader = + CDataFrameRegressionRunner::getParameterReader(); + auto parameters = parameterReader.read(jsonParameters); + return std::make_unique(spec, parameters); +} + +const std::string CDataFrameRegressionRunnerFactory::NAME{"regression"}; +} +} diff --git a/lib/api/Makefile.first b/lib/api/Makefile.first index 093f0155ef..e00f4f0c8d 100644 --- a/lib/api/Makefile.first +++ b/lib/api/Makefile.first @@ -31,7 +31,9 @@ CDataFrameAnalysisSpecification.cc \ CDataFrameAnalysisSpecificationJsonWriter.cc \ CDataFrameAnalyzer.cc \ CDataFrameBoostedTreeRunner.cc \ +CDataFrameClassificationRunner.cc \ CDataFrameOutliersRunner.cc \ +CDataFrameRegressionRunner.cc \ CDataProcessor.cc \ CDataTyper.cc \ CDetectionRulesJsonParser.cc \ diff --git a/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc b/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc index c872da1f1f..93d706c92d 100644 --- a/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisSpecificationTest.cc @@ -13,8 +13,9 @@ #include #include #include -#include +#include #include +#include #include @@ -47,10 +48,13 @@ void CDataFrameAnalysisSpecificationTest::testCreate() { auto runnerFactories = []() { TRunnerFactoryUPtr outliers{std::make_unique()}; TRunnerFactoryUPtr regression{ - std::make_unique()}; + std::make_unique()}; + TRunnerFactoryUPtr classification{ + std::make_unique()}; TRunnerFactoryUPtrVec factories; factories.push_back(std::move(outliers)); factories.push_back(std::move(regression)); + factories.push_back(std::move(classification)); return factories; }; auto jsonSpec = [](const std::string& jobId, const std::string& rows, diff --git a/lib/api/unittest/CDataFrameMockAnalysisRunner.cc b/lib/api/unittest/CDataFrameMockAnalysisRunner.cc index acc0c4c0ce..895f2ffbac 100644 --- a/lib/api/unittest/CDataFrameMockAnalysisRunner.cc +++ b/lib/api/unittest/CDataFrameMockAnalysisRunner.cc @@ -18,6 +18,7 @@ std::size_t CDataFrameMockAnalysisRunner::numberExtraColumns() const { } void CDataFrameMockAnalysisRunner::writeOneRow(const TStrVec&, + const TStrVecVec&, TRowRef, ml::core::CRapidJsonConcurrentLineWriter&) const { } diff --git a/lib/api/unittest/CDataFrameMockAnalysisRunner.h b/lib/api/unittest/CDataFrameMockAnalysisRunner.h index 556f58099f..2be3859afc 100644 --- a/lib/api/unittest/CDataFrameMockAnalysisRunner.h +++ b/lib/api/unittest/CDataFrameMockAnalysisRunner.h @@ -20,6 +20,7 @@ class CDataFrameMockAnalysisRunner final : public ml::api::CDataFrameAnalysisRun std::size_t numberExtraColumns() const override; void writeOneRow(const TStrVec& featureNames, + const TStrVecVec& categoricalFieldValues, TRowRef, ml::core::CRapidJsonConcurrentLineWriter&) const override; From 56248b1c1fe4f0be409b6e7c6a02aa49aac2d810 Mon Sep 17 00:00:00 2001 From: Przemyslaw Witek Date: Thu, 3 Oct 2019 09:56:05 +0200 Subject: [PATCH 2/5] Apply review comments --- include/api/CDataFrameAnalysisRunner.h | 5 +- include/api/CDataFrameAnalysisSpecification.h | 5 +- include/api/CDataFrameAnalyzer.h | 3 +- include/api/CDataFrameBoostedTreeRunner.h | 12 +++-- include/api/CDataFrameClassificationRunner.h | 4 +- lib/api/CDataFrameAnalysisRunner.cc | 5 +- lib/api/CDataFrameAnalysisSpecification.cc | 7 +-- lib/api/CDataFrameAnalyzer.cc | 49 +++++++++---------- lib/api/CDataFrameBoostedTreeRunner.cc | 5 +- lib/api/CDataFrameClassificationRunner.cc | 33 ++++++++----- lib/api/CDataFrameRegressionRunner.cc | 9 ++-- 11 files changed, 76 insertions(+), 61 deletions(-) diff --git a/include/api/CDataFrameAnalysisRunner.h b/include/api/CDataFrameAnalysisRunner.h index 329d6e7543..7e28048ee9 100644 --- a/include/api/CDataFrameAnalysisRunner.h +++ b/include/api/CDataFrameAnalysisRunner.h @@ -59,6 +59,7 @@ class CMemoryUsageEstimationResultJsonWriter; //! early to determine how to implement a good cooperative interrupt scheme. class API_EXPORT CDataFrameAnalysisRunner { public: + using TBoolVec = std::vector; using TStrVec = std::vector; using TStrVecVec = std::vector; using TRowRef = core::data_frame_detail::CRowRef; @@ -99,8 +100,8 @@ class API_EXPORT CDataFrameAnalysisRunner { //! \return The number of columns this analysis appends. virtual std::size_t numberExtraColumns() const = 0; - //! Fills in categorical field names for which empty value should be treated as missing. - virtual void columnsForWhichEmptyIsMissing(TStrVec& fieldNames) const; + //! \return Indicator of columns for which empty value should be treated as missing. + virtual TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const; //! Write the extra columns of \p row added by the analysis to \p writer. //! diff --git a/include/api/CDataFrameAnalysisSpecification.h b/include/api/CDataFrameAnalysisSpecification.h index 55ade44b2a..da8f211f59 100644 --- a/include/api/CDataFrameAnalysisSpecification.h +++ b/include/api/CDataFrameAnalysisSpecification.h @@ -43,6 +43,7 @@ namespace api { //! performance statistics. class API_EXPORT CDataFrameAnalysisSpecification { public: + using TBoolVec = std::vector; using TStrVec = std::vector; using TDataFrameUPtr = std::unique_ptr; using TTemporaryDirectoryPtr = std::shared_ptr; @@ -173,8 +174,8 @@ class API_EXPORT CDataFrameAnalysisSpecification { //! 2. disk is used (only one partition needs to be loaded to main memory) void estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const; - //! Fills in categorical field names for which empty value should be treated as missing. - void columnsForWhichEmptyIsMissing(TStrVec& fieldNames) const; + //! \return Indicator of columns for which empty value should be treated as missing. + TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const; //! \return shared pointer to the persistence stream. TDataAdderUPtr persister() const; diff --git a/include/api/CDataFrameAnalyzer.h b/include/api/CDataFrameAnalyzer.h index a1606a9979..2e3516fb01 100644 --- a/include/api/CDataFrameAnalyzer.h +++ b/include/api/CDataFrameAnalyzer.h @@ -34,6 +34,7 @@ class CDataFrameAnalysisSpecification; //! class API_EXPORT CDataFrameAnalyzer { public: + using TBoolVec = std::vector; using TStrVec = std::vector; using TJsonOutputStreamWrapperUPtr = std::unique_ptr; using TJsonOutputStreamWrapperUPtrSupplier = @@ -104,7 +105,7 @@ class API_EXPORT CDataFrameAnalyzer { TDataFrameAnalysisSpecificationUPtr m_AnalysisSpecification; TStrVec m_CategoricalFieldNames; TStrSizeUMapVec m_CategoricalFieldValues; - std::set m_EmptyAsMissing; + TBoolVec m_EmptyAsMissing; TDataFrameUPtr m_DataFrame; TStrVec m_FieldNames; TTemporaryDirectoryPtr m_DataFrameDirectory; diff --git a/include/api/CDataFrameBoostedTreeRunner.h b/include/api/CDataFrameBoostedTreeRunner.h index f657db117a..e11cccbdf2 100644 --- a/include/api/CDataFrameBoostedTreeRunner.h +++ b/include/api/CDataFrameBoostedTreeRunner.h @@ -40,11 +40,8 @@ class API_EXPORT CDataFrameBoostedTreeRunner : public CDataFrameAnalysisRunner { //! \return The number of columns this adds to the data frame. std::size_t numberExtraColumns() const override; -private: +protected: using TBoostedTreeUPtr = std::unique_ptr; - using TBoostedTreeFactoryUPtr = std::unique_ptr; - using TDataSearcherUPtr = CDataFrameAnalysisSpecification::TDataSearcherUPtr; - using TMemoryEstimator = std::function; protected: //! Parameter reader handling parameters that are shared by subclasses. @@ -54,7 +51,12 @@ class API_EXPORT CDataFrameBoostedTreeRunner : public CDataFrameAnalysisRunner { //! Name of prediction field. const std::string& predictionFieldName() const; //! Underlying boosted tree. - const TBoostedTreeUPtr& boostedTree() const; + const maths::CBoostedTree& boostedTree() const; + +private: + using TBoostedTreeFactoryUPtr = std::unique_ptr; + using TDataSearcherUPtr = CDataFrameAnalysisSpecification::TDataSearcherUPtr; + using TMemoryEstimator = std::function; private: void runImpl(const TStrVec& featureNames, core::CDataFrame& frame) override; diff --git a/include/api/CDataFrameClassificationRunner.h b/include/api/CDataFrameClassificationRunner.h index d15172d2d8..f5974ef39c 100644 --- a/include/api/CDataFrameClassificationRunner.h +++ b/include/api/CDataFrameClassificationRunner.h @@ -34,8 +34,8 @@ class API_EXPORT CDataFrameClassificationRunner final : public CDataFrameBoosted //! This is not intended to be called directly: use CDataFrameClassificationRunnerFactory. CDataFrameClassificationRunner(const CDataFrameAnalysisSpecification& spec); - //! Fills in categorical field names for which empty value should be treated as missing. - void columnsForWhichEmptyIsMissing(TStrVec& fieldNames) const override; + //! \return Indicator of columns for which empty value should be treated as missing. + TBoolVec columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const override; //! Write the prediction for \p row to \p writer. void writeOneRow(const TStrVec& featureNames, diff --git a/lib/api/CDataFrameAnalysisRunner.cc b/lib/api/CDataFrameAnalysisRunner.cc index 26440e937a..b6c194b8ee 100644 --- a/lib/api/CDataFrameAnalysisRunner.cc +++ b/lib/api/CDataFrameAnalysisRunner.cc @@ -24,6 +24,8 @@ namespace ml { namespace api { namespace { +using TBoolVec = std::vector; + std::size_t maximumNumberPartitions(const CDataFrameAnalysisSpecification& spec) { // We limit the maximum number of partitions to rows^(1/2) because very // large numbers of partitions are going to be slow and it is better to tell @@ -43,7 +45,8 @@ CDataFrameAnalysisRunner::~CDataFrameAnalysisRunner() { this->waitToFinish(); } -void CDataFrameAnalysisRunner::columnsForWhichEmptyIsMissing(TStrVec&) const { +TBoolVec CDataFrameAnalysisRunner::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const { + return TBoolVec(fieldNames.size(), false); } void CDataFrameAnalysisRunner::estimateMemoryUsage(CMemoryUsageEstimationResultJsonWriter& writer) const { diff --git a/lib/api/CDataFrameAnalysisSpecification.cc b/lib/api/CDataFrameAnalysisSpecification.cc index a09b4a67cd..28bdd70613 100644 --- a/lib/api/CDataFrameAnalysisSpecification.cc +++ b/lib/api/CDataFrameAnalysisSpecification.cc @@ -42,6 +42,7 @@ const std::string CDataFrameAnalysisSpecification::NAME("name"); const std::string CDataFrameAnalysisSpecification::PARAMETERS("parameters"); namespace { +using TBoolVec = std::vector; using TRunnerFactoryUPtrVec = ml::api::CDataFrameAnalysisSpecification::TRunnerFactoryUPtrVec; TRunnerFactoryUPtrVec analysisFactories() { @@ -208,12 +209,12 @@ void CDataFrameAnalysisSpecification::estimateMemoryUsage(CMemoryUsageEstimation m_Runner->estimateMemoryUsage(writer); } -void CDataFrameAnalysisSpecification::columnsForWhichEmptyIsMissing(TStrVec& fieldNames) const { +TBoolVec CDataFrameAnalysisSpecification::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const { if (m_Runner == nullptr) { HANDLE_FATAL(<< "Internal error: no runner available. Please report this problem."); - return; + return TBoolVec(fieldNames.size(), false); } - m_Runner->columnsForWhichEmptyIsMissing(fieldNames); + return m_Runner->columnsForWhichEmptyIsMissing(fieldNames); } void CDataFrameAnalysisSpecification::initializeRunner(const rapidjson::Value& jsonAnalysis) { diff --git a/lib/api/CDataFrameAnalyzer.cc b/lib/api/CDataFrameAnalyzer.cc index d033cf81ce..43c28db4be 100644 --- a/lib/api/CDataFrameAnalyzer.cc +++ b/lib/api/CDataFrameAnalyzer.cc @@ -33,6 +33,27 @@ core::CFloatStorage truncateToFloatRange(double value) { return maths::CTools::truncate(value, -largest, largest); } +void mapToVector(const TStrSizeUMap& map, TStrVec& vector) { + assert(vector.empty()); + vector.resize(map.size()); + for (const auto& entry : map) { + std::size_t index = entry.second; + if (index >= vector.size()) { + HANDLE_FATAL(<< "Index out of bounds: " << index); + } else { + vector[index] = entry.first; + } + } +} + +void mapsToVectors(const TStrSizeUMapVec& maps, TStrVecVec& vectors) { + assert(vectors.empty()); + vectors.resize(maps.size()); + for (std::size_t i = 0; i < maps.size(); ++i) { + mapToVector(maps[i], vectors[i]); + } +} + const std::string SPECIAL_COLUMN_FIELD_NAME{"."}; // Control message types: @@ -57,10 +78,6 @@ CDataFrameAnalyzer::CDataFrameAnalyzer(TDataFrameAnalysisSpecificationUPtr analy auto frameAndDirectory = m_AnalysisSpecification->makeDataFrame(); m_DataFrame = std::move(frameAndDirectory.first); m_DataFrameDirectory = frameAndDirectory.second; - TStrVec emptyAsMissing; - m_AnalysisSpecification->columnsForWhichEmptyIsMissing(emptyAsMissing); - m_EmptyAsMissing = - std::set(emptyAsMissing.begin(), emptyAsMissing.end()); } } @@ -285,6 +302,7 @@ void CDataFrameAnalyzer::captureFieldNames(const TStrVec& fieldNames) { if (m_FieldNames.empty()) { m_FieldNames.assign(fieldNames.begin() + m_BeginDataFieldValues, fieldNames.begin() + m_EndDataFieldValues); + m_EmptyAsMissing = m_AnalysisSpecification->columnsForWhichEmptyIsMissing(m_FieldNames); } } @@ -348,8 +366,7 @@ void CDataFrameAnalyzer::addRowToDataFrame(const TStrVec& fieldValues) { for (std::ptrdiff_t i = m_BeginDataFieldValues; i != m_EndDataFieldValues; ++i, ++columns) { *columns = fieldToValue(isCategorical[i], m_CategoricalFieldValues[i], - m_EmptyAsMissing.count(m_FieldNames[i]) > 0, - fieldValues[i]); + m_EmptyAsMissing[i], fieldValues[i]); } docHash = 0; if (m_DocHashFieldIndex != FIELD_MISSING && @@ -384,26 +401,6 @@ void CDataFrameAnalyzer::writeProgress(int progress, writer.flush(); } -void mapToVector(const TStrSizeUMap& map, TStrVec& vector) { - assert(vector.empty()); - vector.resize(map.size()); - for (const auto& entry : map) { - std::size_t index = entry.second; - if (index >= vector.size()) { - HANDLE_FATAL(<< "Index out of bounds: " << index); - } - vector[index] = entry.first; - } -} - -void mapsToVectors(const TStrSizeUMapVec& maps, TStrVecVec& vectors) { - assert(vectors.empty()); - vectors.resize(maps.size()); - for (std::size_t i = 0; i < maps.size(); i++) { - mapToVector(maps[i], vectors[i]); - } -} - void CDataFrameAnalyzer::writeResultsOf(const CDataFrameAnalysisRunner& analysis, core::CRapidJsonConcurrentLineWriter& writer) const { // We write results single threaded because we need to write the rows to diff --git a/lib/api/CDataFrameBoostedTreeRunner.cc b/lib/api/CDataFrameBoostedTreeRunner.cc index 70074be3e1..e69858b65e 100644 --- a/lib/api/CDataFrameBoostedTreeRunner.cc +++ b/lib/api/CDataFrameBoostedTreeRunner.cc @@ -193,12 +193,11 @@ const std::string& CDataFrameBoostedTreeRunner::predictionFieldName() const { return m_PredictionFieldName; } -const CDataFrameBoostedTreeRunner::TBoostedTreeUPtr& -CDataFrameBoostedTreeRunner::boostedTree() const { +const maths::CBoostedTree& CDataFrameBoostedTreeRunner::boostedTree() const { if (m_BoostedTree == nullptr) { HANDLE_FATAL(<< "Internal error: boosted tree object missing. Please report this error."); } - return m_BoostedTree; + return *m_BoostedTree; } void CDataFrameBoostedTreeRunner::runImpl(const TStrVec& featureNames, diff --git a/lib/api/CDataFrameClassificationRunner.cc b/lib/api/CDataFrameClassificationRunner.cc index 324b057f59..7628c9e326 100644 --- a/lib/api/CDataFrameClassificationRunner.cc +++ b/lib/api/CDataFrameClassificationRunner.cc @@ -27,6 +27,8 @@ namespace ml { namespace api { namespace { +using TBoolVec = std::vector; + // Configuration const std::string NUM_TOP_CLASSES{"num_top_classes"}; @@ -38,8 +40,7 @@ const std::string CLASS_PROBABILITY_FIELD_NAME{"class_probability"}; } const CDataFrameAnalysisConfigReader CDataFrameClassificationRunner::getParameterReader() { - CDataFrameAnalysisConfigReader theReader = - CDataFrameBoostedTreeRunner::getParameterReader(); + CDataFrameAnalysisConfigReader theReader{CDataFrameBoostedTreeRunner::getParameterReader()}; theReader.addParameter(NUM_TOP_CLASSES, CDataFrameAnalysisConfigReader::E_OptionalParameter); return theReader; } @@ -58,27 +59,35 @@ CDataFrameClassificationRunner::CDataFrameClassificationRunner(const CDataFrameA // The only field for which empty value should be treated as missing is dependent variable // which has empty value for non-training rows. -void CDataFrameClassificationRunner::columnsForWhichEmptyIsMissing(TStrVec& fieldNames) const { - fieldNames.push_back(dependentVariableFieldName()); +TBoolVec CDataFrameClassificationRunner::columnsForWhichEmptyIsMissing(const TStrVec& fieldNames) const { + TBoolVec emptyAsMissing(fieldNames.size(), false); + auto pos = std::find(fieldNames.begin(), fieldNames.end(), + this->dependentVariableFieldName()); + if (pos != fieldNames.end()) { + emptyAsMissing[pos - fieldNames.begin()] = true; + } + return emptyAsMissing; } void CDataFrameClassificationRunner::writeOneRow(const TStrVec&, const TStrVecVec& categoricalFieldValues, TRowRef row, core::CRapidJsonConcurrentLineWriter& writer) const { - const auto& tree{boostedTree()}; - std::size_t columnHoldingDependentVariable = tree->columnHoldingDependentVariable(); - std::size_t columnHoldingPrediction = tree->columnHoldingPrediction(row.numberColumns()); - const double dependentVariable = row[columnHoldingDependentVariable]; - const std::uint64_t prediction = std::lround(row[columnHoldingPrediction]); + const auto& tree = this->boostedTree(); + const std::size_t columnHoldingDependentVariable{tree.columnHoldingDependentVariable()}; + const std::size_t columnHoldingPrediction{ + tree.columnHoldingPrediction(row.numberColumns())}; + const double dependentVariable{row[columnHoldingDependentVariable]}; + const std::uint64_t prediction{ + static_cast(std::lround(row[columnHoldingPrediction]))}; if (prediction >= categoricalFieldValues[columnHoldingDependentVariable].size()) { HANDLE_FATAL(<< "Index out of bounds: " << prediction); } - std::string predictedClassName = - categoricalFieldValues[columnHoldingDependentVariable][prediction]; + const std::string& predictedClassName{ + categoricalFieldValues[columnHoldingDependentVariable][prediction]}; writer.StartObject(); - writer.Key(predictionFieldName()); + writer.Key(this->predictionFieldName()); writer.String(predictedClassName); writer.Key(IS_TRAINING_FIELD_NAME); writer.Bool(maths::CDataFrameUtils::isMissing(dependentVariable) == false); diff --git a/lib/api/CDataFrameRegressionRunner.cc b/lib/api/CDataFrameRegressionRunner.cc index 79974777de..c8d236e936 100644 --- a/lib/api/CDataFrameRegressionRunner.cc +++ b/lib/api/CDataFrameRegressionRunner.cc @@ -49,12 +49,13 @@ void CDataFrameRegressionRunner::writeOneRow(const TStrVec&, const TStrVecVec&, TRowRef row, core::CRapidJsonConcurrentLineWriter& writer) const { - const auto& tree{boostedTree()}; - std::size_t columnHoldingDependentVariable = tree->columnHoldingDependentVariable(); - std::size_t columnHoldingPrediction = tree->columnHoldingPrediction(row.numberColumns()); + const auto& tree = this->boostedTree(); + const std::size_t columnHoldingDependentVariable = tree.columnHoldingDependentVariable(); + const std::size_t columnHoldingPrediction = + tree.columnHoldingPrediction(row.numberColumns()); writer.StartObject(); - writer.Key(predictionFieldName()); + writer.Key(this->predictionFieldName()); writer.Double(row[columnHoldingPrediction]); writer.Key(IS_TRAINING_FIELD_NAME); writer.Bool(maths::CDataFrameUtils::isMissing(row[columnHoldingDependentVariable]) == false); From d3bec8647208b88ad9fcb07396a90f484ca2f80a Mon Sep 17 00:00:00 2001 From: Przemyslaw Witek Date: Thu, 3 Oct 2019 12:29:57 +0200 Subject: [PATCH 3/5] Add unit tests --- .../unittest/CDataFrameAnalysisRunnerTest.cc | 36 +++++ .../unittest/CDataFrameAnalysisRunnerTest.h | 2 + lib/api/unittest/CDataFrameAnalyzerTest.cc | 130 +++++++++++++----- lib/api/unittest/CDataFrameAnalyzerTest.h | 1 + 4 files changed, 137 insertions(+), 32 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc index 5b6ab57a4c..36446053e9 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc @@ -189,6 +189,36 @@ void CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000() { testEstimateMemoryUsage(1000, "450kB", "143kB", 0); } +void testColumnsForWhichEmptyIsMissing(const std::string& analysis, + bool expected_dependentVariableEmptyAsMissing) { + using TBoolVec = std::vector; + using TStrVec = std::vector; + + std::string parameters{"{\"dependent_variable\": \"label\"}"}; + std::string jsonSpec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( + "testJob", 10000, 5, 100000000, 1, {}, true, + test::CTestTmpDir::tmpDir(), "", analysis, parameters)}; + api::CDataFrameAnalysisSpecification spec{jsonSpec}; + + TStrVec fieldNames{"feature_1", "feature_2", "feature_3", "label"}; + TBoolVec emptyAsMissing{spec.columnsForWhichEmptyIsMissing(fieldNames)}; + + CPPUNIT_ASSERT_EQUAL(fieldNames.size(), emptyAsMissing.size()); + CPPUNIT_ASSERT_EQUAL(false, bool(emptyAsMissing[0])); + CPPUNIT_ASSERT_EQUAL(false, bool(emptyAsMissing[1])); + CPPUNIT_ASSERT_EQUAL(false, bool(emptyAsMissing[2])); + CPPUNIT_ASSERT_EQUAL(expected_dependentVariableEmptyAsMissing, + bool(emptyAsMissing[3])); +} + +void CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Classification() { + testColumnsForWhichEmptyIsMissing("classification", true); +} + +void CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Regression() { + testColumnsForWhichEmptyIsMissing("regression", false); +} + CppUnit::Test* CDataFrameAnalysisRunnerTest::suite() { CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CDataFrameAnalysisRunnerTest"); @@ -213,6 +243,12 @@ CppUnit::Test* CDataFrameAnalysisRunnerTest::suite() { suiteOfTests->addTest(new CppUnit::TestCaller( "CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000", &CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Classification", + &CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Classification)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Regression", + &CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Regression)); return suiteOfTests; } diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.h b/lib/api/unittest/CDataFrameAnalysisRunnerTest.h index 2b3b73d9f0..df8bcbc44d 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.h +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.h @@ -18,6 +18,8 @@ class CDataFrameAnalysisRunnerTest : public CppUnit::TestFixture { void testEstimateMemoryUsage_10(); void testEstimateMemoryUsage_100(); void testEstimateMemoryUsage_1000(); + void testColumnsForWhichEmptyIsMissing_Classification(); + void testColumnsForWhichEmptyIsMissing_Regression(); static CppUnit::Test* suite(); diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.cc b/lib/api/unittest/CDataFrameAnalyzerTest.cc index 8b11db920f..6a58dd022d 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTest.cc @@ -44,6 +44,7 @@ using TDoubleVecVec = std::vector; using TSizeVec = std::vector; using TStrVec = std::vector; using TRowItr = core::CDataFrame::TRowItr; +using TRowRef = core::CDataFrame::TRowRef; using TPoint = maths::CDenseVector; using TPointVec = std::vector; using TDataFrameUPtr = std::unique_ptr; @@ -144,20 +145,21 @@ auto outlierSpec(std::size_t rows = 110, return std::make_unique(spec); } -auto regressionSpec(std::string dependentVariable, - std::size_t rows = 100, - std::size_t cols = 5, - std::size_t memoryLimit = 3000000, - std::size_t numberRoundsPerHyperparameter = 0, - std::size_t bayesianOptimisationRestarts = 0, - const TStrVec& categoricalFieldNames = TStrVec{}, - double lambda = -1.0, - double gamma = -1.0, - double eta = -1.0, - std::size_t maximumNumberTrees = 0, - double featureBagFraction = -1.0, - CDataFrameAnalyzerTest::TPersisterSupplier* persisterSupplier = nullptr, - CDataFrameAnalyzerTest::TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr) { +auto analysisSpec(std::string analysis, + std::string dependentVariable, + std::size_t rows = 100, + std::size_t cols = 5, + std::size_t memoryLimit = 3000000, + std::size_t numberRoundsPerHyperparameter = 0, + std::size_t bayesianOptimisationRestarts = 0, + const TStrVec& categoricalFieldNames = TStrVec{}, + double lambda = -1.0, + double gamma = -1.0, + double eta = -1.0, + std::size_t maximumNumberTrees = 0, + double featureBagFraction = -1.0, + CDataFrameAnalyzerTest::TPersisterSupplier* persisterSupplier = nullptr, + CDataFrameAnalyzerTest::TRestoreSearcherSupplier* restoreSearcherSupplier = nullptr) { std::string parameters = "{\n\"dependent_variable\": \"" + dependentVariable + "\""; if (lambda >= 0.0) { @@ -189,7 +191,7 @@ auto regressionSpec(std::string dependentVariable, std::string spec{api::CDataFrameAnalysisSpecificationJsonWriter::jsonString( "testJob", rows, cols, memoryLimit, 1, categoricalFieldNames, true, - test::CTestTmpDir::tmpDir(), "ml", "regression", parameters)}; + test::CTestTmpDir::tmpDir(), "ml", analysis, parameters)}; LOG_TRACE(<< "spec =\n" << spec); @@ -614,7 +616,7 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTraining() { TStrVec fieldNames{"c1", "c2", "c3", "c4", "c5", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "0", ""}; - api::CDataFrameAnalyzer analyzer{regressionSpec("c5"), outputWriterFactory}; + api::CDataFrameAnalyzer analyzer{analysisSpec("regression", "c5"), outputWriterFactory}; addRegressionTestData(fieldNames, fieldValues, analyzer, expectedPredictions); core::CStopWatch watch{true}; @@ -676,8 +678,8 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithParams() { }; api::CDataFrameAnalyzer analyzer{ - regressionSpec("c5", 100, 5, 3000000, 0, 0, {}, lambda, gamma, eta, - maximumNumberTrees, featureBagFraction), + analysisSpec("regression", "c5", 100, 5, 3000000, 0, 0, {}, lambda, + gamma, eta, maximumNumberTrees, featureBagFraction), outputWriterFactory}; TDoubleVec expectedPredictions; @@ -728,8 +730,8 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithRowsMissingTargetValu auto target = [](double feature) { return 10.0 * feature; }; - api::CDataFrameAnalyzer analyzer{regressionSpec("target", 50, 2, 2000000), - outputWriterFactory}; + api::CDataFrameAnalyzer analyzer{ + analysisSpec("regression", "target", 50, 2, 2000000), outputWriterFactory}; TDoubleVec feature; rng.generateUniformSamples(1.0, 3.0, 50, feature); @@ -946,7 +948,8 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { { api::CDataFrameAnalyzer analyzer{ - regressionSpec("x5", 1000, 5, 8000000, 0, 0, {"x1", "x2"}), outputWriterFactory}; + analysisSpec("regression", "x5", 1000, 5, 8000000, 0, 0, {"x1", "x2"}), + outputWriterFactory}; TStrVec x[]{{"x11", "x12", "x13", "x14", "x15"}, {"x21", "x22", "x23", "x24", "x25", "x26", "x27"}}; @@ -972,7 +975,7 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { passed &= (expected[1] == (*row)[1]); if (wasPassed && passed == false) { LOG_ERROR(<< "expected " << core::CContainerPrinter::print(expected) - << "got [" << (*row)[0] << ", " << (*row)[1] << "]"); + << ", got [" << (*row)[0] << ", " << (*row)[1] << "]"); } } }); @@ -984,9 +987,9 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { { std::size_t rows{api::CDataFrameAnalyzer::MAX_CATEGORICAL_CARDINALITY + 3}; - api::CDataFrameAnalyzer analyzer{ - regressionSpec("x5", rows, 5, 8000000000, 0, 0, {"x1"}, 0, 0, 0, 0, 0), - outputWriterFactory}; + api::CDataFrameAnalyzer analyzer{analysisSpec("regression", "x5", rows, 5, 8000000000, + 0, 0, {"x1"}, 0, 0, 0, 0, 0), + outputWriterFactory}; TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; TStrVec fieldValues{"", "", "", "", "", "", ""}; @@ -1009,7 +1012,7 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { bool wasPassed{passed}; passed &= (expected == (*row)[0]); if (wasPassed && passed == false) { - LOG_ERROR(<< "expected " << expected << " got " << (*row)[0]); + LOG_ERROR(<< "expected " << expected << ", got " << (*row)[0]); } } }); @@ -1018,6 +1021,66 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { } } +void CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing() { + + auto eq = [](double expected) { + return [expected](double actual) { return expected == actual; }; + }; + + auto nan = []() { return [](double actual) { return isnan(actual); }; }; + + auto assertRow = [&](const std::size_t row_i, + const std::vector>& matchers, + const TRowRef& row) { + CPPUNIT_ASSERT_EQUAL_MESSAGE("row " + std::to_string(row_i), + matchers.size(), row.numberColumns()); + for (std::size_t i = 0; i < row.numberColumns(); ++i) { + CPPUNIT_ASSERT_MESSAGE("row " + std::to_string(row_i) + + ", column " + std::to_string(i), + matchers[i](row[i])); + } + }; + + std::stringstream output; + auto outputWriterFactory = [&output]() { + return std::make_unique(output); + }; + + api::CDataFrameAnalyzer analyzer{analysisSpec("classification", "x5", 1000, 5, + 8000000, 0, 0, {"x1", "x2", "x5"}), + outputWriterFactory}; + + TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; + analyzer.handleRecord(fieldNames, {"x11", "x21", "0", "0", "x51", "0", ""}); + analyzer.handleRecord(fieldNames, {"x12", "x22", "1", "1", "x52", "1", ""}); + analyzer.handleRecord(fieldNames, {"", "x23", "2", "2", "x51", "2", ""}); + analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", "", "3", ""}); + analyzer.handleRecord(fieldNames, {"x15", "x25", "4", "4", "x51", "4", ""}); + analyzer.handleRecord(fieldNames, {"x11", "x26", "5", "5", "x52", "5", ""}); + analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", "", "6", ""}); + analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", "", "7", ""}); + analyzer.handleRecord(fieldNames, {"x14", "x22", "8", "8", "x51", "8", ""}); + analyzer.handleRecord(fieldNames, {"", "x23", "9", "9", "x52", "9", ""}); + analyzer.receivedAllRows(); + + const core::CDataFrame& frame{analyzer.dataFrame()}; + frame.readRows(1, [&](TRowItr beginRows, TRowItr endRows) { + std::vector rows; + std::copy(beginRows, endRows, std::back_inserter(rows)); + CPPUNIT_ASSERT_EQUAL(10UL, rows.size()); + assertRow(0, {eq(0.0), eq(0.0), eq(0.0), eq(0.0), eq(0.0)}, rows[0]); + assertRow(1, {eq(1.0), eq(1.0), eq(1.0), eq(1.0), eq(1.0)}, rows[1]); + assertRow(2, {eq(2.0), eq(2.0), eq(2.0), eq(2.0), eq(0.0)}, rows[2]); + assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), nan() }, rows[3]); + assertRow(4, {eq(4.0), eq(4.0), eq(4.0), eq(4.0), eq(0.0)}, rows[4]); + assertRow(5, {eq(0.0), eq(5.0), eq(5.0), eq(5.0), eq(1.0)}, rows[5]); + assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), nan() }, rows[6]); + assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), nan() }, rows[7]); + assertRow(8, {eq(3.0), eq(1.0), eq(8.0), eq(8.0), eq(0.0)}, rows[8]); + assertRow(9, {eq(2.0), eq(2.0), eq(9.0), eq(9.0), eq(1.0)}, rows[9]); + }); +} + CppUnit::Test* CDataFrameAnalyzerTest::suite() { CppUnit::TestSuite* suiteOfTests = new CppUnit::TestSuite("CDataFrameAnalyzerTest"); @@ -1058,6 +1121,9 @@ CppUnit::Test* CDataFrameAnalyzerTest::suite() { suiteOfTests->addTest(new CppUnit::TestCaller( "CDataFrameAnalyzerTest::testCategoricalFields", &CDataFrameAnalyzerTest::testCategoricalFields)); + suiteOfTests->addTest(new CppUnit::TestCaller( + "CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing", + &CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing)); return suiteOfTests; } @@ -1137,9 +1203,9 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti // compute expected tree api::CDataFrameAnalyzer analyzer{ - regressionSpec("c5", numberExamples, 5, 15000000, - numberRoundsPerHyperparameter, 12, {}, lambda, gamma, eta, - maximumNumberTrees, featureBagFraction, &persisterSupplier), + analysisSpec("regression", "c5", numberExamples, 5, 15000000, + numberRoundsPerHyperparameter, 12, {}, lambda, gamma, eta, + maximumNumberTrees, featureBagFraction, &persisterSupplier), outputWriterFactory}; std::size_t dependentVariable( std::find(fieldNames.begin(), fieldNames.end(), "c5") - fieldNames.begin()); @@ -1161,9 +1227,9 @@ void CDataFrameAnalyzerTest::testRunBoostedTreeTrainingWithStateRecoverySubrouti }; api::CDataFrameAnalyzer analyzerToRestore{ - regressionSpec("c5", numberExamples, 5, 15000000, numberRoundsPerHyperparameter, - 12, {}, lambda, gamma, eta, maximumNumberTrees, featureBagFraction, - &persisterSupplier, &restoreSearcherSupplier), + analysisSpec("regression", "c5", numberExamples, 5, 15000000, numberRoundsPerHyperparameter, + 12, {}, lambda, gamma, eta, maximumNumberTrees, featureBagFraction, + &persisterSupplier, &restoreSearcherSupplier), outputWriterFactory}; passDataToAnalyzer(fieldNames, fieldValues, analyzerToRestore, weights, values); diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.h b/lib/api/unittest/CDataFrameAnalyzerTest.h index 7943653c08..54100f2858 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.h +++ b/lib/api/unittest/CDataFrameAnalyzerTest.h @@ -39,6 +39,7 @@ class CDataFrameAnalyzerTest : public CppUnit::TestFixture { void testErrors(); void testRoundTripDocHashes(); void testCategoricalFields(); + void testCategoricalFields_EmptyAsMissing(); static CppUnit::Test* suite(); From 97ae2aebdc654a6674236fce253785349a4c7e1f Mon Sep 17 00:00:00 2001 From: Przemyslaw Witek Date: Thu, 3 Oct 2019 16:16:56 +0200 Subject: [PATCH 4/5] Apply review comments --- .../unittest/CDataFrameAnalysisRunnerTest.cc | 12 +++---- .../unittest/CDataFrameAnalysisRunnerTest.h | 4 +-- lib/api/unittest/CDataFrameAnalyzerTest.cc | 33 +++++++++++-------- lib/api/unittest/CDataFrameAnalyzerTest.h | 2 +- 4 files changed, 28 insertions(+), 23 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc index 36446053e9..ba8e814bcc 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.cc @@ -211,11 +211,11 @@ void testColumnsForWhichEmptyIsMissing(const std::string& analysis, bool(emptyAsMissing[3])); } -void CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Classification() { +void CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingClassification() { testColumnsForWhichEmptyIsMissing("classification", true); } -void CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Regression() { +void CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingRegression() { testColumnsForWhichEmptyIsMissing("regression", false); } @@ -244,11 +244,11 @@ CppUnit::Test* CDataFrameAnalysisRunnerTest::suite() { "CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000", &CDataFrameAnalysisRunnerTest::testEstimateMemoryUsage_1000)); suiteOfTests->addTest(new CppUnit::TestCaller( - "CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Classification", - &CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Classification)); + "CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingClassification", + &CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingClassification)); suiteOfTests->addTest(new CppUnit::TestCaller( - "CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Regression", - &CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissing_Regression)); + "CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingRegression", + &CDataFrameAnalysisRunnerTest::testColumnsForWhichEmptyIsMissingRegression)); return suiteOfTests; } diff --git a/lib/api/unittest/CDataFrameAnalysisRunnerTest.h b/lib/api/unittest/CDataFrameAnalysisRunnerTest.h index df8bcbc44d..c6ba62cd17 100644 --- a/lib/api/unittest/CDataFrameAnalysisRunnerTest.h +++ b/lib/api/unittest/CDataFrameAnalysisRunnerTest.h @@ -18,8 +18,8 @@ class CDataFrameAnalysisRunnerTest : public CppUnit::TestFixture { void testEstimateMemoryUsage_10(); void testEstimateMemoryUsage_100(); void testEstimateMemoryUsage_1000(); - void testColumnsForWhichEmptyIsMissing_Classification(); - void testColumnsForWhichEmptyIsMissing_Regression(); + void testColumnsForWhichEmptyIsMissingClassification(); + void testColumnsForWhichEmptyIsMissingRegression(); static CppUnit::Test* suite(); diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.cc b/lib/api/unittest/CDataFrameAnalyzerTest.cc index 6a58dd022d..668b71bf51 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTest.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -1021,13 +1022,17 @@ void CDataFrameAnalyzerTest::testCategoricalFields() { } } -void CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing() { +void CDataFrameAnalyzerTest::testCategoricalFieldsEmptyAsMissing() { auto eq = [](double expected) { return [expected](double actual) { return expected == actual; }; }; - auto nan = []() { return [](double actual) { return isnan(actual); }; }; + auto missing = []() { + return [](double actual) { + return maths::CDataFrameUtils::isMissing(actual); + }; + }; auto assertRow = [&](const std::size_t row_i, const std::vector>& matchers, @@ -1068,16 +1073,16 @@ void CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing() { std::vector rows; std::copy(beginRows, endRows, std::back_inserter(rows)); CPPUNIT_ASSERT_EQUAL(10UL, rows.size()); - assertRow(0, {eq(0.0), eq(0.0), eq(0.0), eq(0.0), eq(0.0)}, rows[0]); - assertRow(1, {eq(1.0), eq(1.0), eq(1.0), eq(1.0), eq(1.0)}, rows[1]); - assertRow(2, {eq(2.0), eq(2.0), eq(2.0), eq(2.0), eq(0.0)}, rows[2]); - assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), nan() }, rows[3]); - assertRow(4, {eq(4.0), eq(4.0), eq(4.0), eq(4.0), eq(0.0)}, rows[4]); - assertRow(5, {eq(0.0), eq(5.0), eq(5.0), eq(5.0), eq(1.0)}, rows[5]); - assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), nan() }, rows[6]); - assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), nan() }, rows[7]); - assertRow(8, {eq(3.0), eq(1.0), eq(8.0), eq(8.0), eq(0.0)}, rows[8]); - assertRow(9, {eq(2.0), eq(2.0), eq(9.0), eq(9.0), eq(1.0)}, rows[9]); + assertRow(0, {eq(0.0), eq(0.0), eq(0.0), eq(0.0), eq(0.0)}, rows[0]); + assertRow(1, {eq(1.0), eq(1.0), eq(1.0), eq(1.0), eq(1.0)}, rows[1]); + assertRow(2, {eq(2.0), eq(2.0), eq(2.0), eq(2.0), eq(0.0)}, rows[2]); + assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), missing()}, rows[3]); + assertRow(4, {eq(4.0), eq(4.0), eq(4.0), eq(4.0), eq(0.0)}, rows[4]); + assertRow(5, {eq(0.0), eq(5.0), eq(5.0), eq(5.0), eq(1.0)}, rows[5]); + assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), missing()}, rows[6]); + assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), missing()}, rows[7]); + assertRow(8, {eq(3.0), eq(1.0), eq(8.0), eq(8.0), eq(0.0)}, rows[8]); + assertRow(9, {eq(2.0), eq(2.0), eq(9.0), eq(9.0), eq(1.0)}, rows[9]); }); } @@ -1122,8 +1127,8 @@ CppUnit::Test* CDataFrameAnalyzerTest::suite() { "CDataFrameAnalyzerTest::testCategoricalFields", &CDataFrameAnalyzerTest::testCategoricalFields)); suiteOfTests->addTest(new CppUnit::TestCaller( - "CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing", - &CDataFrameAnalyzerTest::testCategoricalFields_EmptyAsMissing)); + "CDataFrameAnalyzerTest::testCategoricalFieldsEmptyAsMissing", + &CDataFrameAnalyzerTest::testCategoricalFieldsEmptyAsMissing)); return suiteOfTests; } diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.h b/lib/api/unittest/CDataFrameAnalyzerTest.h index 54100f2858..bbf1ecaf49 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.h +++ b/lib/api/unittest/CDataFrameAnalyzerTest.h @@ -39,7 +39,7 @@ class CDataFrameAnalyzerTest : public CppUnit::TestFixture { void testErrors(); void testRoundTripDocHashes(); void testCategoricalFields(); - void testCategoricalFields_EmptyAsMissing(); + void testCategoricalFieldsEmptyAsMissing(); static CppUnit::Test* suite(); From be765526e47955158d4ace590f42b375f00e503d Mon Sep 17 00:00:00 2001 From: Przemyslaw Witek Date: Fri, 4 Oct 2019 06:16:51 +0200 Subject: [PATCH 5/5] Fix compile error and apply clang-format --- lib/api/unittest/CDataFrameAnalyzerTest.cc | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/api/unittest/CDataFrameAnalyzerTest.cc b/lib/api/unittest/CDataFrameAnalyzerTest.cc index 668b71bf51..b87139108b 100644 --- a/lib/api/unittest/CDataFrameAnalyzerTest.cc +++ b/lib/api/unittest/CDataFrameAnalyzerTest.cc @@ -1058,31 +1058,31 @@ void CDataFrameAnalyzerTest::testCategoricalFieldsEmptyAsMissing() { TStrVec fieldNames{"x1", "x2", "x3", "x4", "x5", ".", "."}; analyzer.handleRecord(fieldNames, {"x11", "x21", "0", "0", "x51", "0", ""}); analyzer.handleRecord(fieldNames, {"x12", "x22", "1", "1", "x52", "1", ""}); - analyzer.handleRecord(fieldNames, {"", "x23", "2", "2", "x51", "2", ""}); - analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", "", "3", ""}); + analyzer.handleRecord(fieldNames, {"", "x23", "2", "2", "x51", "2", ""}); + analyzer.handleRecord(fieldNames, {"x14", "x24", "3", "3", "", "3", ""}); analyzer.handleRecord(fieldNames, {"x15", "x25", "4", "4", "x51", "4", ""}); analyzer.handleRecord(fieldNames, {"x11", "x26", "5", "5", "x52", "5", ""}); - analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", "", "6", ""}); - analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", "", "7", ""}); + analyzer.handleRecord(fieldNames, {"x12", "", "6", "6", "", "6", ""}); + analyzer.handleRecord(fieldNames, {"x13", "x21", "7", "7", "", "7", ""}); analyzer.handleRecord(fieldNames, {"x14", "x22", "8", "8", "x51", "8", ""}); - analyzer.handleRecord(fieldNames, {"", "x23", "9", "9", "x52", "9", ""}); + analyzer.handleRecord(fieldNames, {"", "x23", "9", "9", "x52", "9", ""}); analyzer.receivedAllRows(); const core::CDataFrame& frame{analyzer.dataFrame()}; frame.readRows(1, [&](TRowItr beginRows, TRowItr endRows) { std::vector rows; std::copy(beginRows, endRows, std::back_inserter(rows)); - CPPUNIT_ASSERT_EQUAL(10UL, rows.size()); - assertRow(0, {eq(0.0), eq(0.0), eq(0.0), eq(0.0), eq(0.0)}, rows[0]); - assertRow(1, {eq(1.0), eq(1.0), eq(1.0), eq(1.0), eq(1.0)}, rows[1]); - assertRow(2, {eq(2.0), eq(2.0), eq(2.0), eq(2.0), eq(0.0)}, rows[2]); + CPPUNIT_ASSERT_EQUAL(std::size_t{10}, rows.size()); + assertRow(0, {eq(0.0), eq(0.0), eq(0.0), eq(0.0), eq(0.0)}, rows[0]); + assertRow(1, {eq(1.0), eq(1.0), eq(1.0), eq(1.0), eq(1.0)}, rows[1]); + assertRow(2, {eq(2.0), eq(2.0), eq(2.0), eq(2.0), eq(0.0)}, rows[2]); assertRow(3, {eq(3.0), eq(3.0), eq(3.0), eq(3.0), missing()}, rows[3]); - assertRow(4, {eq(4.0), eq(4.0), eq(4.0), eq(4.0), eq(0.0)}, rows[4]); - assertRow(5, {eq(0.0), eq(5.0), eq(5.0), eq(5.0), eq(1.0)}, rows[5]); + assertRow(4, {eq(4.0), eq(4.0), eq(4.0), eq(4.0), eq(0.0)}, rows[4]); + assertRow(5, {eq(0.0), eq(5.0), eq(5.0), eq(5.0), eq(1.0)}, rows[5]); assertRow(6, {eq(1.0), eq(6.0), eq(6.0), eq(6.0), missing()}, rows[6]); assertRow(7, {eq(5.0), eq(0.0), eq(7.0), eq(7.0), missing()}, rows[7]); - assertRow(8, {eq(3.0), eq(1.0), eq(8.0), eq(8.0), eq(0.0)}, rows[8]); - assertRow(9, {eq(2.0), eq(2.0), eq(9.0), eq(9.0), eq(1.0)}, rows[9]); + assertRow(8, {eq(3.0), eq(1.0), eq(8.0), eq(8.0), eq(0.0)}, rows[8]); + assertRow(9, {eq(2.0), eq(2.0), eq(9.0), eq(9.0), eq(1.0)}, rows[9]); }); }