From af225ad15bfbf6e12411856def90192104bf9296 Mon Sep 17 00:00:00 2001 From: Andreas Huber <9201869+ahuber21@users.noreply.github.com> Date: Fri, 24 Mar 2023 20:20:06 +0100 Subject: [PATCH] fix(RF): performance of feature sampling for node splits (#2292) * disable bottlenecks caused by memorySavingMode=false * feat: add drawSample function with O(k) runtime * Use drawKFromBufferWithoutReplacement in df training * sample directly in findBestSplit function to make it O(_maxFeatures) * rename service_memset_sequential -> service_memset_incrementing * feat: status checks for node splitting algorithms --- .../forest/df_train_dense_default_impl.i | 187 ++++++++++++------ .../dtrees/forest/df_training_parameter.cpp | 4 +- cpp/daal/src/externals/service_memory.h | 13 ++ cpp/daal/src/externals/service_rng.h | 28 +++ 4 files changed, 169 insertions(+), 63 deletions(-) diff --git a/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i b/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i index 65e94dfc992..b28a8a628a0 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i +++ b/cpp/daal/src/algorithms/dtrees/forest/df_train_dense_default_impl.i @@ -115,6 +115,15 @@ private: WorkItem * _data; // array of heap elements, max element is on the left }; +////////////////////////////////////////////////////////////////////////////////////////// +// Service structure, node split error & splitting status +////////////////////////////////////////////////////////////////////////////////////////// +struct NodeSplitResult +{ + services::Status status; + bool bSplitSucceeded; +}; + ////////////////////////////////////////////////////////////////////////////////////////// // Service structure, contains numeric tables to be calculated as result ////////////////////////////////////////////////////////////////////////////////////////// @@ -595,14 +604,14 @@ protected: algorithmFPType imp); typename DataHelper::NodeType::Leaf * makeLeaf(const IndexType * idx, size_t n, typename DataHelper::ImpurityData & imp, size_t makeLeaf); - bool findBestSplit(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature, - typename DataHelper::TSplitData & split, algorithmFPType totalWeights); - bool findBestSplitSerial(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature, - typename DataHelper::TSplitData & split, algorithmFPType totalWeights); - bool findBestSplitThreaded(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature, - typename DataHelper::TSplitData & split, algorithmFPType totalWeights); - bool simpleSplit(size_t iStart, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest, - typename DataHelper::TSplitData & split); + NodeSplitResult findBestSplit(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, + IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights); + NodeSplitResult findBestSplitSerial(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, + IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights); + NodeSplitResult findBestSplitThreaded(size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, + IndexType & iBestFeature, typename DataHelper::TSplitData & split, algorithmFPType totalWeights); + NodeSplitResult simpleSplit(size_t iStart, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest, + typename DataHelper::TSplitData & split); void addImpurityDecrease(IndexType iFeature, size_t n, const typename DataHelper::ImpurityData & curImpurity, const typename DataHelper::TSplitData & split); @@ -619,7 +628,7 @@ protected: const size_t nGen = (!_par.memorySavingMode && !_maxLeafNodes && !_useConstFeatures) ? n : _nFeaturesPerNode; *_numElems += n; RNGs rng; - rng.uniformWithoutReplacement(nGen, _aFeatureIdx.get(), _aFeatureIdx.get() + nGen, _engineImpl->getState(), 0, n); + rng.drawKFromBufferWithoutReplacement(nGen, _aFeatureIdx.get(), _aFeatureIdx.get() + nGen, _engineImpl->getState(), n); } services::Status computeResults(const dtrees::internal::Tree & t); @@ -683,16 +692,18 @@ services::Status TrainBatchTaskBase(_aConstFeatureIdx.get(), IndexType(0), maxFeatures * 2); - } - else - _aFeatureIdx.reset(_nFeaturesPerNode * 2); // _nFeaturesPerNode elements are used by algorithm, others are used internally by generator + /* first maxFeatures entries serve as a buffer of drawn samples for node splitting */ + /* second maxFeatures entries contains [0, ..., maxFeatures - 1] and is used to randomly draw indices */ + _aFeatureIdx.reset(maxFeatures * 2); + _aConstFeatureIdx.reset(maxFeatures * 2); + + DAAL_CHECK_MALLOC(_aConstFeatureIdx.get()); + services::internal::service_memset_seq(_aConstFeatureIdx.get(), IndexType(0), 2 * maxFeatures); + // in order to use drawKFromBufferWithoutReplacement we need to initialize + // the buffer to contain all indices from [0, 1, ..., maxFeatures - 1] + DAAL_CHECK_MALLOC(_aFeatureIdx.get()); + services::internal::service_memset_seq(_aFeatureIdx.get(), IndexType(0), maxFeatures); + services::internal::service_memset_incrementing(_aFeatureIdx.get() + maxFeatures, IndexType(0), maxFeatures); DAAL_CHECK_MALLOC(_aSample.get() && _helper.reset(_nSamples) && _helper.resetWeights(_nSamples) && _aFeatureBuf.get() && _aFeatureIndexBuf.get() && _aFeatureIdx.get()); @@ -798,7 +809,10 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBasecount); return res; } + return makeLeaf(_aSample.get() + iStart, n, curImpurity, nClasses); } @@ -859,7 +874,10 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase @@ -1032,11 +1048,12 @@ typename DataHelper::NodeType::Base * TrainBatchTaskBase -bool TrainBatchTaskBase::simpleSplit(size_t iStart, - const typename DataHelper::ImpurityData & curImpurity, - IndexType & iFeatureBest, - typename DataHelper::TSplitData & split) +NodeSplitResult TrainBatchTaskBase::simpleSplit(size_t iStart, + const typename DataHelper::ImpurityData & curImpurity, + IndexType & iFeatureBest, + typename DataHelper::TSplitData & split) { + services::Status st; RNGs rng; algorithmFPType featBuf[2]; IndexType * aIdx = _aSample.get() + iStart; @@ -1044,7 +1061,11 @@ bool TrainBatchTaskBase::simpleS { IndexType iFeature; *_numElems += 1; - rng.uniform(1, &iFeature, _engineImpl->getState(), 0, _data->getNumberOfColumns()); + int errorcode = rng.uniform(1, &iFeature, _engineImpl->getState(), 0, _data->getNumberOfColumns()); + if (errorcode) + { + st = services::Status(services::ErrorNullResult); + } featureValuesToBuf(iFeature, featBuf, aIdx, 2); if (featBuf[1] - featBuf[0] <= _accuracy) //all values of the feature are the same continue; @@ -1052,17 +1073,15 @@ bool TrainBatchTaskBase::simpleS split.featureUnordered = _featHelper.isUnordered(iFeature); split.impurityDecrease = curImpurity.var; iFeatureBest = iFeature; - return true; + return { st, true }; } - return false; + return { st, false }; } template -bool TrainBatchTaskBase::findBestSplit(size_t level, size_t iStart, size_t n, - const typename DataHelper::ImpurityData & curImpurity, - IndexType & iFeatureBest, - typename DataHelper::TSplitData & split, - algorithmFPType totalWeights) +NodeSplitResult TrainBatchTaskBase::findBestSplit( + size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest, + typename DataHelper::TSplitData & split, algorithmFPType totalWeights) { if (n == 2) { @@ -1078,26 +1097,67 @@ bool TrainBatchTaskBase::findBes //find best split and put it to featureIndexBuf template -bool TrainBatchTaskBase::findBestSplitSerial(size_t level, size_t iStart, size_t n, - const typename DataHelper::ImpurityData & curImpurity, - IndexType & iBestFeature, - typename DataHelper::TSplitData & bestSplit, - algorithmFPType totalWeights) +NodeSplitResult TrainBatchTaskBase::findBestSplitSerial( + size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iBestFeature, + typename DataHelper::TSplitData & bestSplit, algorithmFPType totalWeights) { - chooseFeatures(); - size_t nVisitedFeature = 0; - const size_t maxFeatures = nFeatures(); - const float qMax = 0.02; //min fracture of observations to be handled as indexed feature values - IndexType * bestSplitIdx = featureIndexBuf(0) + iStart; - IndexType * aIdx = _aSample.get() + iStart; - int iBestSplit = -1; - int idxFeatureValueBestSplit = -1; //when sorted feature is used + services::Status st; + + /* counter of the number of visited features, we visit _nFeaturesPerNode + * depending on _useConstFeatures, constant features can be skipped + */ + size_t nVisitedFeature = 0; + /* total number of features */ + const size_t maxFeatures = nFeatures(); + /* minimum fraction of all samples per bin */ + const algorithmFPType qMax = 0.02; + /* index of the best split, initialized to first index we investigate */ + IndexType * bestSplitIdx = featureIndexBuf(0) + iStart; + /* sample index */ + IndexType * aIdx = _aSample.get() + iStart; + /* zero-based index of best split */ + int64_t iBestSplit = -1; + int64_t idxFeatureValueBestSplit = -1; typename DataHelper::TSplitData split; - const float fact = float(n); + /* RNG for sample drawing */ + RNGs rng; + /* index for swapping samples in Fisher-Yates sampling */ + IndexType swapIdx; + for (size_t i = 0; i < maxFeatures && nVisitedFeature < _nFeaturesPerNode; ++i) { - const auto iFeature = _aFeatureIdx[i]; - const bool bUseIndexedFeatures = (!_par.memorySavingMode) && (fact > qMax * float(_helper.indexedFeatures().numIndices(iFeature))); + /* draw a random sample without replacement */ + // based on Fisher Yates sampling + // _aFeatureIdx has length of 2 * _maxFeatures + // first maxFeatures contain the currently selected features + // at iteration i, we have drawn i features and written them to + // _aFeatureIdx[0, 1, ..., i-1] + // + // the second half of the buffer contains all numbers from + // [0, 1, ..., maxFeatures-1] and we randomly select one without + // replacement based on Fisher Yates sampling + // drawing uniformly from [0, maxFeatures-i] and swapping the indices + // assures uniform probability of all drawn numbers + + /* draw the i-th index of the sample */ + int errorcode = rng.uniform(1, &swapIdx, _engineImpl->getState(), 0, maxFeatures - i); + if (errorcode) + { + st = services::Status(services::ErrorNullResult); + } + + /* account for buffer offset from 0 */ + swapIdx += maxFeatures; + /* _aFeatureIdx[swapIdx] was drawn */ + _aFeatureIdx[i] = _aFeatureIdx[swapIdx]; + /* swap in number at [2 * maxFeatures - 1 - i] for next draw */ + _aFeatureIdx[swapIdx] = _aFeatureIdx[2 * maxFeatures - 1 - i]; + /* store drawn number at end of number buffer so that no number is lost */ + _aFeatureIdx[2 * maxFeatures - 1 - i] = _aFeatureIdx[i]; + + const auto iFeature = _aFeatureIdx[i]; + const bool bUseIndexedFeatures = + (!_par.memorySavingMode) && (algorithmFPType(n) > qMax * algorithmFPType(_helper.indexedFeatures().numIndices(iFeature))); if (!_maxLeafNodes && !_useConstFeatures && !_par.memorySavingMode) { @@ -1154,7 +1214,14 @@ bool TrainBatchTaskBase::findBes #endif } } - if (iBestSplit < 0) return false; //not found + + if (!st.ok() || iBestSplit < 0) + { + // either: + // error during splitting -> failure + // or no split found -> not a failure but still have to return + return { st, false }; + } iBestFeature = _aFeatureIdx[iBestSplit]; bool bCopyToIdx = true; @@ -1193,20 +1260,18 @@ bool TrainBatchTaskBase::findBes bCopyToIdx = (iBestSplit + 1 < _nFeaturesPerNode); //if iBestSplit is the last considered feature //then aIdx already contains the best split, no need to copy if (bCopyToIdx) services::internal::tmemcpy(aIdx, bestSplitIdx, n); - return true; + return { st, true }; } template -bool TrainBatchTaskBase::findBestSplitThreaded(size_t level, size_t iStart, size_t n, - const typename DataHelper::ImpurityData & curImpurity, - IndexType & iFeatureBest, - typename DataHelper::TSplitData & split, - algorithmFPType totalWeights) +NodeSplitResult TrainBatchTaskBase::findBestSplitThreaded( + size_t level, size_t iStart, size_t n, const typename DataHelper::ImpurityData & curImpurity, IndexType & iFeatureBest, + typename DataHelper::TSplitData & split, algorithmFPType totalWeights) { chooseFeatures(); TArray aFeatureSplit(_nFeaturesPerNode); //TODO, if parallel for features - return false; + return { services::Status(services::ErrorMethodNotSupported), false }; } template diff --git a/cpp/daal/src/algorithms/dtrees/forest/df_training_parameter.cpp b/cpp/daal/src/algorithms/dtrees/forest/df_training_parameter.cpp index b8266376683..a2a83d82954 100755 --- a/cpp/daal/src/algorithms/dtrees/forest/df_training_parameter.cpp +++ b/cpp/daal/src/algorithms/dtrees/forest/df_training_parameter.cpp @@ -56,8 +56,8 @@ Parameter::Parameter() minWeightFractionInLeafNode(0.), minImpurityDecreaseInSplitNode(0.), maxLeafNodes(0), - minBinSize(5), - maxBins(256) + maxBins(256), + minBinSize(5) {} } // namespace interface2 Status checkImpl(const decision_forest::training::interface2::Parameter & prm) diff --git a/cpp/daal/src/externals/service_memory.h b/cpp/daal/src/externals/service_memory.h index 06d654b4880..ab29d6cc554 100755 --- a/cpp/daal/src/externals/service_memory.h +++ b/cpp/daal/src/externals/service_memory.h @@ -127,6 +127,7 @@ T * service_memset(T * const ptr, const T value, const size_t num) return ptr; } +/* Initialize block of memory of length num value */ template void service_memset_seq(T * const ptr, const T value, const size_t num) { @@ -138,6 +139,18 @@ void service_memset_seq(T * const ptr, const T value, const size_t num) } } +/* Initialize block of memory of length num with entries [startValue, ..., startValue + num -1]*/ +template +void service_memset_incrementing(T * const ptr, const T startValue, const size_t num) +{ + PRAGMA_IVDEP + PRAGMA_VECTOR_ALWAYS + for (size_t i = 0; i < num; i++) + { + ptr[i] = startValue + i; + } +} + } // namespace internal } // namespace services } // namespace daal diff --git a/cpp/daal/src/externals/service_rng.h b/cpp/daal/src/externals/service_rng.h index 002b285f529..453889a9244 100644 --- a/cpp/daal/src/externals/service_rng.h +++ b/cpp/daal/src/externals/service_rng.h @@ -141,6 +141,34 @@ class RNGs return errorcode; } + /* Draw a random sample of length k from the numbers that are provided in buffer of length n + * \param[in] k The length of the sample to be drawn + * \param[out] r A pointer to the result buffer + * \param[in] buffer A pointer to the buffer containing the numbers + * \param[in] n Length of the buffer + * \param[in] method Method handed to the uniform random number generator + * + * This method is based on the Fisher Yates sampling technique, but since we are re-using the provided buffer, there + * is no need to initialize it to [0, 1, 2, ..., n-1] first, providing us with a speed-up from O(n) -> O(k) runtime + */ + template + int drawKFromBufferWithoutReplacement(const SizeType k, DstType * r, Type * buffer, void * state, const Type n, + const int method = __DAAL_RNG_METHOD_UNIFORM_STD) + { + int errorcode = 0; + Type swapIdx; + + for (SizeType i = 0; i < k; ++i) + { + errorcode = uniform(1, &swapIdx, state, 0, n - i, method); + r[i] = (DstType)buffer[swapIdx]; + buffer[swapIdx] = buffer[n - 1 - i]; + buffer[n - 1 - i] = (Type)r[i]; + } + + return errorcode; + } + template int uniformWithoutReplacement(const SizeType n, Type * r, BaseRNGs & brng, const Type a, const Type b, const int method = __DAAL_RNG_METHOD_UNIFORM_STD)