From 377250c870f19802399e4b9ece69f1d8276cf7b8 Mon Sep 17 00:00:00 2001 From: Andreas Huber Date: Wed, 5 Apr 2023 08:32:05 -0700 Subject: [PATCH] feat: quantiles and average binning for RF histograms / maxBins=0 to disable binning --- .../decision_forest_training_parameter.h | 14 + .../dtrees/dtrees_feature_type_helper.h | 18 +- .../dtrees/dtrees_feature_type_helper.i | 265 +++++++++++++----- ..._classification_train_dense_default_impl.i | 2 +- ...df_classification_train_hist_oneapi_impl.i | 2 +- .../dtrees/forest/df_training_parameter.cpp | 2 +- .../df_regression_train_dense_default_impl.i | 2 +- .../df_regression_train_hist_oneapi_impl.i | 2 +- 8 files changed, 222 insertions(+), 85 deletions(-) diff --git a/cpp/daal/include/algorithms/decision_forest/decision_forest_training_parameter.h b/cpp/daal/include/algorithms/decision_forest/decision_forest_training_parameter.h index 2d46a87e6ca..8636c75154f 100644 --- a/cpp/daal/include/algorithms/decision_forest/decision_forest_training_parameter.h +++ b/cpp/daal/include/algorithms/decision_forest/decision_forest_training_parameter.h @@ -94,6 +94,17 @@ enum splitterMode { best, /* Calculates best split from aggregate best feature splits for every node. */ random /* Calculates best split from aggregate random feature splits for every node. */ +} + +/** + * + * * \brief Available strategies to compute data bins in 'hist' method * / +*/ +enum BinningStrategy { + /* Frequency quantiles -> same number of data points per bin */ + quantiles, + /* Same feature value range per bin */ + averages }; /** @@ -145,6 +156,9 @@ class DAAL_EXPORT Parameter size_t minBinSize; /*!< Used with 'hist' split finding method only. Minimal number of observations in a bin. Default is 5 */ splitterMode splitter; /*!< Sets node splitting method. Default is best */ + BinningStrategy binningStrategy; /*!< Used with 'hist' split finding method only. + Selects the strategy to group data points into bins. 
+ Allowed values are 'quantiles' (default), 'averages' */ }; /* [Parameter source code] */ } // namespace interface2 diff --git a/cpp/daal/src/algorithms/dtrees/dtrees_feature_type_helper.h b/cpp/daal/src/algorithms/dtrees/dtrees_feature_type_helper.h index e71d0ec7bc9..3f5f91a716e 100644 --- a/cpp/daal/src/algorithms/dtrees/dtrees_feature_type_helper.h +++ b/cpp/daal/src/algorithms/dtrees/dtrees_feature_type_helper.h @@ -24,6 +24,7 @@ #ifndef __DTREES_FEATURE_TYPE_HELPER_H__ #define __DTREES_FEATURE_TYPE_HELPER_H__ +#include "include/algorithms/decision_forest/decision_forest_training_parameter.h" #include "src/externals/service_memory.h" #include "src/data_management/service_numeric_table.h" @@ -69,12 +70,20 @@ class FeatureTypes int _lastUnordered = -1; }; +using daal::algorithms::decision_forest::training::BinningStrategy; + struct BinParams { - BinParams(size_t _maxBins, size_t _minBinSize) : maxBins(_maxBins), minBinSize(_minBinSize) {} - BinParams(const BinParams & o) : maxBins(o.maxBins), minBinSize(o.minBinSize) {} - - size_t maxBins = 256; + BinParams(size_t _maxBins, size_t _minBinSize, BinningStrategy _binningStrategy = BinningStrategy::quantiles) + : maxBins(_maxBins), minBinSize(_minBinSize), binningStrategy(_binningStrategy) + {} + BinParams(const BinParams & o) : maxBins(o.maxBins), minBinSize(o.minBinSize), binningStrategy(o.binningStrategy) {} + + /* Strategy to create bins for feature values. Default: quantiles */ + BinningStrategy binningStrategy = BinningStrategy::quantiles; + /* Maximum number of bins for indexed data; 0 disables binning. Default: 256 */ + size_t maxBins = 256; + /* Minimum bin size (number of data points per bin). Default: 5 */ size_t minBinSize = 5; }; @@ -103,6 +112,7 @@ class IndexedFeatures IndexedFeatures() : _data(nullptr), _entries(nullptr), _sizeOfIndex(sizeof(IndexType)), _nCols(0), _nRows(0), _capacity(0), _maxNumIndices(0) {} ~IndexedFeatures(); + //initialize the feature indices, i.e. 
bins template services::Status init(const NumericTable & nt, const FeatureTypes * featureTypes = nullptr, const BinParams * pBimPrm = nullptr); diff --git a/cpp/daal/src/algorithms/dtrees/dtrees_feature_type_helper.i b/cpp/daal/src/algorithms/dtrees/dtrees_feature_type_helper.i index a6adbd99535..d7d1b2e2475 100644 --- a/cpp/daal/src/algorithms/dtrees/dtrees_feature_type_helper.i +++ b/cpp/daal/src/algorithms/dtrees/dtrees_feature_type_helper.i @@ -20,6 +20,8 @@ // Cpu-dependent initialization of service data structure //-- */ + +#include "include/services/error_indexes.h" #include "src/algorithms/dtrees/dtrees_feature_type_helper.h" #include "src/threading/threading.h" #include "src/algorithms/service_error_handling.h" @@ -151,8 +153,22 @@ struct ColIndexTaskBins : public ColIndexTask { typedef ColIndexTask super; ColIndexTaskBins(size_t nRows, const BinParams & prm) : super(nRows), _prm(prm), _bins(_prm.maxBins) {} + + /* + * Transform features based on the BinParams _prm. + * - Unordered features and features with nRows <= _prm.maxBins are + * indexed by makeIndexDefault() without binning + * - Otherwise bins are created with the strategy selected by + * BinParams::binningStrategy + */ virtual services::Status makeIndex(NumericTable & nt, IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t iCol, size_t nRows, bool bUnorderedFeature) DAAL_C11_OVERRIDE; + /* Function to create feature indices for BinningStrategy::quantiles */ + services::Status makeIndexQuantiles(NumericTable & nt, IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t iCol, size_t nRows); + /* Function to create feature indices for BinningStrategy::averages */ + services::Status makeIndexAverages(NumericTable & nt, IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t iCol, size_t nRows); + /* Helper to treat constant-valued features */ + services::Status makeIndexConstant(IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t nRows); private: services::Status 
assignIndexAccordingToBins(IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t nBins, size_t nRows); @@ -226,25 +242,37 @@ template services::Status ColIndexTaskBins::makeIndex(NumericTable & nt, IndexedFeatures::FeatureEntry & entry, IndexType * aRes, size_t iCol, size_t nRows, bool bUnorderedFeature) { + /* feature is unordered or has no more data points than bins -> no binning needed, use default indexing */ if (bUnorderedFeature || nRows <= _prm.maxBins) return this->template makeIndexDefault(nt, entry, aRes, iCol, nRows, bUnorderedFeature); + /* sort feature values */ Status s = this->getSorted(nt, iCol, nRows); if (!s) return s; + /* special case: all values are the same -> constant-valued feature */ const typename super::FeatureIdx * index = this->_index.get(); if (index[0].key == index[nRows - 1].key) { - _bins[0] = nRows; - services::internal::service_memset_seq(aRes, 0, nRows); + return makeIndexConstant(entry, aRes, nRows); + } - entry.numIndices = 1; - s |= entry.allocBorders(); - DAAL_CHECK(s, s); - entry.binBorders[0] = index[nRows - 1].key; - return s; + /* Create bins of sorted data according to strategy selected in _prm */ + switch (_prm.binningStrategy) + { + case dtrees::internal::BinningStrategy::quantiles: return makeIndexQuantiles(nt, entry, aRes, iCol, nRows); + case dtrees::internal::BinningStrategy::averages: return makeIndexAverages(nt, entry, aRes, iCol, nRows); + default: return Status(ErrorID::ErrorMethodNotSupported); } +} - size_t nBins = 0; +template +services::Status ColIndexTaskBins::makeIndexQuantiles(NumericTable & nt, IndexedFeatures::FeatureEntry & entry, + IndexType * aRes, size_t iCol, size_t nRows) +{ + const typename super::FeatureIdx * index = this->_index.get(); + + size_t nBins = 0; + DAAL_ASSERT(_prm.maxBins > 0); const size_t binSize = nRows / _prm.maxBins; int64_t remainder = nRows % _prm.maxBins; //allow for negative values size_t dx = 2 * _prm.maxBins; @@ -268,44 +296,59 @@ services::Status ColIndexTaskBins::makeIndex(Nu } 
size_t iRight = i + newBinSize - 1; //intersperse remainder amongst bins const typename super::FeatureIdx & ri = index[iRight]; - if (ri.key == index[iRight + 1].key) + + if (ri.key != index[iRight + 1].key) { - //right border can't be placed at iRight because it has to be between different feature values - //try moving the border to the right, find the first value bigger than the value at iRight - ++iRight; - size_t r = iRight + binSize; - //at first, roughly locate the value bigger than iRight, jumping by binSize to the right - for (; (r < nRows) && (index[r].key == ri.key); r += binSize) - {} - if (r > nRows) r = nRows; - //then locate a new border as the upper_bound between this rough value and iRight - iRight = upper_bound(index + iRight + 1, index + r, ri) - index; - //this is the size of the bin - newBinSize = iRight - i; - //if the value it is too big (number of feature values equal to ri.key is bigger than binSize) - //then perhaps left border of the bin can be moved to the right - if (newBinSize >= 2 * binSize) + // value changed from one bin to the next, append and continue + append(_bins, nBins, newBinSize); + i += newBinSize; + continue; + } + + /* when arriving here, the feature value has not changed and + * we have to move iRight to the right until we find a new value + * r will be located at the first value that is different from ri.key + */ + ++iRight; + size_t r = iRight + binSize; + while (r < nRows && index[r].key == ri.key) + { + r += binSize; + } + if (r > nRows) + { + r = nRows; + } + // upper_bound() returns the index of the first value change between + // index + iRight + 1 and index + r + iRight = upper_bound(index + iRight + 1, index + r, ri) - index; + newBinSize = iRight - i; + + if (newBinSize >= 2 * binSize) + { + // the new bin is too wide, try insert an additional bin to the left + size_t iClosestSmallerValue = i + binSize - 1; + while (iClosestSmallerValue > i && index[iClosestSmallerValue].key == ri.key) { - size_t 
iClosestSmallerValue = i + binSize - 1; - for (; (iClosestSmallerValue > i) && (index[iClosestSmallerValue].key == ri.key); --iClosestSmallerValue) - ; - size_t dist = iClosestSmallerValue - i; - if (dist > _prm.minBinSize) - { - //add an extra bin at the left - const size_t newLeftBinSize = dist + 1; - append(_bins, nBins, newLeftBinSize); - i += newLeftBinSize; - newBinSize -= newLeftBinSize; - } - else if ((nBins > 0) && dist) - { - //if it is small and not the first bin, then extend previous bin by the value - const size_t nAddToPrevBin = dist + 1; - _bins[nBins - 1] += nAddToPrevBin; - i += nAddToPrevBin; - newBinSize -= nAddToPrevBin; - } + --iClosestSmallerValue; + } + size_t dist = iClosestSmallerValue - i; + if (dist > _prm.minBinSize) + { + // add an extra bin at the left + const size_t newLeftBinSize = dist + 1; + append(_bins, nBins, newLeftBinSize); + i += newLeftBinSize; + newBinSize -= newLeftBinSize; + } + else if ((nBins > 0) && dist > 0) + { + // no room for an extra bin to the left, extend the previous + // one if possible + const size_t nAddToPrevBin = dist + 1; + _bins[nBins - 1] += nAddToPrevBin; + i += nAddToPrevBin; + newBinSize -= nAddToPrevBin; } if (remainder > 0) { //reset bresenhams line due to unexpected change in remainder @@ -315,9 +358,13 @@ services::Status ColIndexTaskBins::makeIndex(Nu D = dy - _prm.maxBins + nBins + 1; } } + + // append the bin and continue append(_bins, nBins, newBinSize); i += newBinSize; } + + // collect the remaining data rows in the final bin if (i < nRows) { size_t newBinSize = nRows - i; @@ -330,6 +377,7 @@ services::Status ColIndexTaskBins::makeIndex(Nu _bins[nBins - 1] += newBinSize; } } + #if _DEBUG #if 0 //run-time check for bins correctness @@ -347,6 +395,61 @@ services::Status ColIndexTaskBins::makeIndex(Nu return assignIndexAccordingToBins(entry, aRes, nBins, nRows); } +template +services::Status ColIndexTaskBins::makeIndexAverages(NumericTable & nt, IndexedFeatures::FeatureEntry & entry, + 
IndexType * aRes, size_t iCol, size_t nRows) +{ + const typename super::FeatureIdx * index = this->_index.get(); + + size_t nBins = 0; + size_t i = 0; + DAAL_ASSERT(_prm.maxBins > 0); + algorithmFPType binSize = (index[nRows - 1].key - index[0].key) / _prm.maxBins; + algorithmFPType value = index[0].key; + + while (i < nRows) + { + // next bin border to the right of current index + size_t iRight = i + 1; + + while ((iRight < nRows) && (index[iRight].key < (value + binSize))) + { + ++iRight; + } + + // found a new binEdge + // append the bin and continue + size_t newBinSize = iRight - i; + + append(_bins, nBins, newBinSize); + + i = iRight; + value = index[i].key; + } + + // assert we picked up all data records + DAAL_ASSERT(i == nRows); + DAAL_ASSERT(nBins <= _prm.maxBins); + + return assignIndexAccordingToBins(entry, aRes, nBins, nRows); +} + +template +services::Status ColIndexTaskBins::makeIndexConstant(IndexedFeatures::FeatureEntry & entry, IndexType * aRes, + size_t nRows) +{ + const typename super::FeatureIdx * index = this->_index.get(); + + _bins[0] = nRows; + services::internal::service_memset_seq(aRes, 0, nRows); + + entry.numIndices = 1; + Status s = entry.allocBorders(); + DAAL_CHECK(s, s); + entry.binBorders[0] = index[nRows - 1].key; + return s; +} + template services::Status IndexedFeatures::init(const NumericTable & nt, const FeatureTypes * featureTypes, const BinParams * pBimPrm) { @@ -357,40 +460,50 @@ services::Status IndexedFeatures::init(const NumericTable & nt, const FeatureTyp featureTypes = &autoFT; } - _maxNumIndices = 0; - services::Status s = alloc(nt.getNumberOfColumns(), nt.getNumberOfRows()); - if (!s) return s; - - const size_t nC = nt.getNumberOfColumns(); - typedef ColIndexTask TlsTask; - typedef ColIndexTask DefaultTask; - typedef ColIndexTaskBins BinningTask; - - daal::tls tlsData([=, &nt]() -> TlsTask * { - const size_t nRows = nt.getNumberOfRows(); - TlsTask * res = (pBimPrm ? 
new BinningTask(nRows, *pBimPrm) : new DefaultTask(nRows)); - if (res && !res->isValid()) + template + services::Status IndexedFeatures::init(const NumericTable & nt, const FeatureTypes * featureTypes, const BinParams * pBimPrm) + { + dtrees::internal::FeatureTypes autoFT; + if (!featureTypes) { - delete res; - res = nullptr; + DAAL_CHECK_MALLOC(autoFT.init(nt)); + featureTypes = &autoFT; } - return res; - }); - - SafeStatus safeStat; - daal::threader_for(nC, nC, [&](size_t iCol) { - //in case of single thread no need to allocate - TlsTask * task = tlsData.local(); - DAAL_CHECK_THR(task, services::ErrorMemoryAllocationFailed); - safeStat |= - task->makeIndex(const_cast(nt), _entries[iCol], _data + iCol * nRows(), iCol, nRows(), featureTypes->isUnordered(iCol)); - }); - tlsData.reduce([&](TlsTask * task) -> void { - if (_maxNumIndices < task->maxNumDiffValues) _maxNumIndices = task->maxNumDiffValues; - delete task; - }); - return safeStat.detach(); -} + + _maxNumIndices = 0; + services::Status s = alloc(nt.getNumberOfColumns(), nt.getNumberOfRows()); + if (!s) return s; + + const size_t nC = nt.getNumberOfColumns(); + typedef ColIndexTask TlsTask; + typedef ColIndexTask DefaultTask; + typedef ColIndexTaskBins BinningTask; + + daal::tls tlsData([=, &nt]() -> TlsTask * { + const size_t nRows = nt.getNumberOfRows(); + TlsTask * res = (!pBimPrm || (pBimPrm->maxBins == 0)) ? 
new DefaultTask(nRows) : new BinningTask(nRows, *pBimPrm); + if (res && !res->isValid()) + { + delete res; + res = nullptr; + } + return res; + }); + + SafeStatus safeStat; + daal::threader_for(nC, nC, [&](size_t iCol) { + //in case of single thread no need to allocate + TlsTask * task = tlsData.local(); + DAAL_CHECK_THR(task, services::ErrorMemoryAllocationFailed); + safeStat |= task->makeIndex(const_cast(nt), _entries[iCol], _data + iCol * nRows(), iCol, nRows(), + featureTypes->isUnordered(iCol)); + }); + tlsData.reduce([&](TlsTask * task) -> void { + if (_maxNumIndices < task->maxNumDiffValues) _maxNumIndices = task->maxNumDiffValues; + delete task; + }); + return safeStat.detach(); + } } /* namespace internal */ } /* namespace dtrees */ diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i index bc3952e4da2..4431599debc 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_dense_default_impl.i @@ -1700,7 +1700,7 @@ services::Status computeForSpecificHelper(HostAppIface * pHostApp, const Numeric { if (!memSave) { - BinParams prm(par.maxBins, par.minBinSize); + BinParams prm(par.maxBins, par.minBinSize, par.binningStrategy); s = indexedFeatures.init(*x, &featTypes, &prm); DAAL_CHECK_STATUS_VAR(s); diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_oneapi_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_oneapi_impl.i index 1f8b829d0e9..09797cfe353 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_oneapi_impl.i +++ b/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_oneapi_impl.i @@ 
-827,7 +827,7 @@ services::Status ClassificationTrainBatchKernelOneAPI::co kernelReducePartialHistograms = kernel_factory.getKernel("reducePartialHistograms", status); DAAL_CHECK_STATUS_VAR(status); - dtrees::internal::BinParams prm(par.maxBins, par.minBinSize); + dtrees::internal::BinParams prm(par.maxBins, par.minBinSize, par.binningStrategy); decision_forest::internal::IndexedFeaturesOneAPI indexedFeatures; dtrees::internal::FeatureTypes featTypes; diff --git a/cpp/daal/src/algorithms/dtrees/forest/df_training_parameter.cpp b/cpp/daal/src/algorithms/dtrees/forest/df_training_parameter.cpp index 91e91a38515..0ca6e46e2f2 100755 --- a/cpp/daal/src/algorithms/dtrees/forest/df_training_parameter.cpp +++ b/cpp/daal/src/algorithms/dtrees/forest/df_training_parameter.cpp @@ -78,7 +78,7 @@ Status checkImpl(const decision_forest::training::interface2::Parameter & prm) if (prm.varImportance == MDA_Raw || prm.varImportance == MDA_Scaled) s.add(Error::create(ErrorDFBootstrapVarImportanceIncompatible)); if (prm.resultsToCompute & computeOutOfBagError) s.add(Error::create(ErrorDFBootstrapOOBIncompatible)); } - DAAL_CHECK_EX((prm.maxBins >= 2), ErrorIncorrectParameter, ParameterName, maxBinsStr()); + DAAL_CHECK_EX((prm.maxBins >= 0), ErrorIncorrectParameter, ParameterName, maxBinsStr()); DAAL_CHECK_EX((prm.minBinSize >= 1), ErrorIncorrectParameter, ParameterName, minBinSizeStr()); return s; } diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i index 743a99ae31c..22b6031f65b 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_impl.i @@ -1321,7 +1321,7 @@ services::Status computeForSpecificHelper(HostAppIface * pHostApp, const Numeric { if (!memSave) { - BinParams prm(par.maxBins, par.minBinSize); + 
BinParams prm(par.maxBins, par.minBinSize, par.binningStrategy); s = indexedFeatures.init(*x, &featTypes, &prm); DAAL_CHECK_STATUS_VAR(s); diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_oneapi_impl.i b/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_oneapi_impl.i index 1ca92d53286..c2fa995c585 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_oneapi_impl.i +++ b/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_oneapi_impl.i @@ -808,7 +808,7 @@ services::Status RegressionTrainBatchKernelOneAPI::comput kernelReducePartialHistograms = kernel_factory.getKernel("reducePartialHistograms", status); DAAL_CHECK_STATUS_VAR(status); - dtrees::internal::BinParams prm(par.maxBins, par.minBinSize); + dtrees::internal::BinParams prm(par.maxBins, par.minBinSize, par.binningStrategy); decision_forest::internal::IndexedFeaturesOneAPI indexedFeatures; dtrees::internal::FeatureTypes featTypes;