RF/DT cleanup (#4005)
* Prunes the RF/DT C++ layers by purging legacy code and wrapper classes
* Unifies regression and classification under a single class each in the DecisionTree and RandomForest codebases (see the namespace sketch after the task list below)
* Some bug fixes
* Effort toward issues #3999 and #3089

---
EDIT:
Task list:
- [x] Unify and eliminate code duplication in `DecisionTreeClassifier`, `DecisionTreeRegressor`, and `DecisionTreeBase`
- [x] Unify and eliminate code duplication in `rf`, `rfClassifier`, and `rfRegressor`
- [x] File-naming rearrangements (get rid of `*_impl.cuh` files)
- [x] Remove the exposed Decision Tree C++ `fit`/`predict` API, as it is currently unused
- [x] Tune and clean up metric/timing calculation in RF, and remove unused variables
- [x] Cython-layer refactoring for checks and warnings pertaining to keyword arguments
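
The most visible downstream change is the namespace rename: everything that previously lived under `ML::DecisionTree` now lives under `ML::DT`. A minimal before/after sketch, using the two types that appear in the hunks below:

```cpp
// Before this PR: decision-tree types lived in the ML::DecisionTree namespace.
ML::DecisionTree::DecisionTreeParams old_params;
ML::DecisionTree::TreeMetaDataNode<float, int>* old_trees;

// After this PR: the namespace is shortened to ML::DT; the types are unchanged.
ML::DT::DecisionTreeParams params;
ML::DT::TreeMetaDataNode<float, int>* trees;
```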

Authors:
  - Venkat (https://github.com/venkywonka)
  - Rory Mitchell (https://github.com/RAMitchell)

Approvers:
  - Rory Mitchell (https://github.com/RAMitchell)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: #4005
venkywonka authored Jul 1, 2021
1 parent 73d946d commit 033a21f
Showing 28 changed files with 958 additions and 1,678 deletions.
4 changes: 2 additions & 2 deletions cpp/include/cuml/ensemble/randomforest.hpp

```diff
@@ -84,7 +84,7 @@ struct RF_params {
  * N streams need N times RF workspace.
  */
   int n_streams;
-  DecisionTree::DecisionTreeParams tree_params;
+  DT::DecisionTreeParams tree_params;
 };
 
 void validity_check(const RF_params rf_params);
@@ -104,7 +104,7 @@ void postprocess_labels(int n_rows, std::vector<int>& labels,
 
 template <class T, class L>
 struct RandomForestMetaData {
-  DecisionTree::TreeMetaDataNode<T, L>* trees;
+  DT::TreeMetaDataNode<T, L>* trees;
   RF_params rf_params;
   //TODO can add prepare, train time, if needed
```
142 changes: 2 additions & 140 deletions cpp/include/cuml/tree/decisiontree.hpp

```diff
@@ -25,7 +25,7 @@ class handle_t;
 
 namespace ML {
 
-namespace DecisionTree {
+namespace DT {
 
 struct DecisionTreeParams {
   /**
@@ -146,148 +146,10 @@ std::string get_tree_text(const TreeMetaDataNode<T, L> *tree);
 template <class T, class L>
 std::string get_tree_json(const TreeMetaDataNode<T, L> *tree);
 
-// ----------------------------- Classification ----------------------------------- //
-
-typedef TreeMetaDataNode<float, int> TreeClassifierF;
-typedef TreeMetaDataNode<double, int> TreeClassifierD;
-
-/**
- * @defgroup DecisionTreeClassifierFit Fit functions
- * @brief Build (i.e., fit, train) Decision Tree classifier for input data.
- * @param[in] handle: raft::handle_t
- * @param[in, out] tree: CPU pointer to TreeMetaDataNode. User allocated.
- * @param[in] data: train data (nrows samples, ncols features) in column major format,
- *   excluding labels. Device pointer.
- * @param[in] ncols: number of features (i.e., columns) excluding target feature.
- * @param[in] nrows: number of training data samples of the whole unsampled dataset.
- * @param[in] labels: 1D array of target features (int only). One label per training
- *   sample. Device pointer.
- *   Assumption: labels need to be preprocessed to map to ascending numbers from 0;
- *   needed for current gini impl. in decision tree.
- * @param[in,out] rowids: array of n_sampled_rows integers in [0, nrows) range.
- *   Device pointer. The same array is then rearranged when splits are made,
- *   allowing us to construct trees without rearranging the actual dataset.
- * @param[in] n_sampled_rows: number of training samples, after sampling.
- *   If using decision tree directly over the whole dataset: n_sampled_rows = nrows
- * @param[in] n_unique_labels: number of unique label values. Number of
- *   categories of classification.
- * @param[in] tree_params: Decision Tree training hyper parameter struct.
- * @param[in] seed: Controls the randomness in tree fitting/growing algorithm.
- * @{
- */
-void decisionTreeClassifierFit(const raft::handle_t &handle,
-                               TreeClassifierF *&tree, float *data,
-                               const int ncols, const int nrows, int *labels,
-                               unsigned int *rowids, const int n_sampled_rows,
-                               int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params,
-                               uint64_t seed);
-void decisionTreeClassifierFit(const raft::handle_t &handle,
-                               TreeClassifierD *&tree, double *data,
-                               const int ncols, const int nrows, int *labels,
-                               unsigned int *rowids, const int n_sampled_rows,
-                               int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params,
-                               uint64_t seed);
-/** @} */
-
-/**
- * @defgroup DecisionTreeClassifierPredict Predict functions
- * @brief Predict target feature for input data; n-ary classification for
- *   single feature supported. Inference of trees is CPU only for now.
- * @param[in] handle: raft::handle_t (currently unused; API placeholder)
- * @param[in] tree: CPU pointer to TreeMetaDataNode.
- * @param[in] rows: test data (n_rows samples, n_cols features) in row major format.
- *   Current impl. expects a CPU pointer. TODO future API change.
- * @param[in] n_rows: number of data samples.
- * @param[in] n_cols: number of features (excluding target feature).
- * @param[in,out] predictions: n_rows predicted labels. Current impl. expects a
- *   CPU pointer, user allocated. TODO future API change.
- * @param[in] verbosity: verbosity level for logging messages during execution.
- *   A negative value means to not perform an explicit `setLevel()` call, but to
- *   continue with the level that the caller itself might have set.
- * @{
- */
-void decisionTreeClassifierPredict(const raft::handle_t &handle,
-                                   const TreeClassifierF *tree,
-                                   const float *rows, const int n_rows,
-                                   const int n_cols, int *predictions,
-                                   int verbosity = -1);
-void decisionTreeClassifierPredict(const raft::handle_t &handle,
-                                   const TreeClassifierD *tree,
-                                   const double *rows, const int n_rows,
-                                   const int n_cols, int *predictions,
-                                   int verbosity = -1);
-/** @} */
-
-// ----------------------------- Regression ----------------------------------- //
-
-typedef TreeMetaDataNode<float, float> TreeRegressorF;
-typedef TreeMetaDataNode<double, double> TreeRegressorD;
-
-/**
- * @defgroup DecisionTreeRegressorFit Fit functions
- * @brief Build (i.e., fit, train) Decision Tree regressor for input data.
- * @param[in] handle: raft::handle_t
- * @param[in, out] tree: CPU pointer to TreeMetaDataNode. User allocated.
- * @param[in] data: train data (nrows samples, ncols features) in column major format,
- *   excluding labels. Device pointer.
- * @param[in] ncols: number of features (i.e., columns) excluding target feature.
- * @param[in] nrows: number of training data samples of the whole unsampled dataset.
- * @param[in] labels: 1D array of target features (float or double). One label per
- *   training sample. Device pointer.
- * @param[in,out] rowids: array of n_sampled_rows integers in [0, nrows) range.
- *   Device pointer. The same array is then rearranged when splits are made,
- *   allowing us to construct trees without rearranging the actual dataset.
- * @param[in] n_sampled_rows: number of training samples, after sampling. If using decision
- *   tree directly over the whole dataset: n_sampled_rows = nrows
- * @param[in] tree_params: Decision Tree training hyper parameter struct.
- * @param[in] seed: Controls the randomness in tree fitting/growing algorithm.
- * @{
- */
-void decisionTreeRegressorFit(const raft::handle_t &handle,
-                              TreeRegressorF *&tree, float *data,
-                              const int ncols, const int nrows, float *labels,
-                              unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params,
-                              uint64_t seed);
-void decisionTreeRegressorFit(const raft::handle_t &handle,
-                              TreeRegressorD *&tree, double *data,
-                              const int ncols, const int nrows, double *labels,
-                              unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params,
-                              uint64_t seed);
-/** @} */
-
-/**
- * @defgroup DecisionTreeRegressorPredict Predict functions
- * @brief Predict target feature for input data; regression for single feature supported.
- *   Inference of trees is CPU only for now.
- * @param[in] handle: raft::handle_t (currently unused; API placeholder)
- * @param[in] tree: CPU pointer to TreeMetaDataNode.
- * @param[in] rows: test data (n_rows samples, n_cols features) in row major format.
- *   Current impl. expects a CPU pointer. TODO future API change.
- * @param[in] n_rows: number of data samples.
- * @param[in] n_cols: number of features (excluding target feature).
- * @param[in,out] predictions: n_rows predicted labels. Current impl. expects a CPU
- *   pointer, user allocated. TODO future API change.
- * @param[in] verbosity: verbosity level for logging messages during execution.
- *   A negative value means to not perform an explicit `setLevel()` call, but to
- *   continue with the level that the caller itself might have set.
- * @{
- */
-void decisionTreeRegressorPredict(const raft::handle_t &handle,
-                                  const TreeRegressorF *tree, const float *rows,
-                                  const int n_rows, const int n_cols,
-                                  float *predictions, int verbosity = -1);
-void decisionTreeRegressorPredict(const raft::handle_t &handle,
-                                  const TreeRegressorD *tree,
-                                  const double *rows, const int n_rows,
-                                  const int n_cols, double *predictions,
-                                  int verbosity = -1);
-/** @} */
 
-} // End namespace DecisionTree
+} // End namespace DT
 } //End namespace ML
```
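
For reference, a minimal sketch of how the now-removed classifier entry points were called, reconstructed purely from the signatures deleted above; the includes, buffer setup, and hyperparameter defaults are assumptions, not part of this diff. Per the task list, nothing in cuML was calling this surface, which is what made the removal safe.

```cpp
#include <cuml/tree/decisiontree.hpp>  // pre-#4005 header; include paths assumed
#include <raft/handle.hpp>
#include <cuda_runtime.h>
#include <vector>

int main() {
  raft::handle_t handle;
  const int nrows = 1000, ncols = 10, n_unique_labels = 2;

  // Device buffers; filling them with real training data is omitted here.
  float* d_data;           // [nrows x ncols], column-major
  int* d_labels;           // one label per row, mapped to 0..n_unique_labels-1
  unsigned int* d_rowids;  // row indices; rearranged in place as splits are made
  cudaMalloc(&d_data, sizeof(float) * nrows * ncols);
  cudaMalloc(&d_labels, sizeof(int) * nrows);
  cudaMalloc(&d_rowids, sizeof(unsigned int) * nrows);

  auto* tree = new ML::DecisionTree::TreeClassifierF();  // user-allocated, per the docs
  ML::DecisionTree::DecisionTreeParams params;           // defaults assumed

  ML::DecisionTree::decisionTreeClassifierFit(
      handle, tree, d_data, ncols, nrows, d_labels, d_rowids,
      /*n_sampled_rows=*/nrows, n_unique_labels, params, /*seed=*/42ULL);

  // Inference was CPU-side: rows and predictions are host pointers.
  std::vector<float> h_rows(nrows * ncols);  // row-major test data
  std::vector<int> h_preds(nrows);
  ML::DecisionTree::decisionTreeClassifierPredict(
      handle, tree, h_rows.data(), nrows, ncols, h_preds.data());
  return 0;
}
```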
6 changes: 3 additions & 3 deletions cpp/src/decisiontree/batched-levelalgo/builder.cuh

```diff
@@ -27,7 +27,7 @@
 #include <common/nvtx.hpp>
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 template <typename ObjectiveT, typename DataT = typename ObjectiveT::DataT,
           typename LabelT = typename ObjectiveT::LabelT,
@@ -57,7 +57,7 @@ void grow_tree(std::shared_ptr<raft::mr::device::allocator> d_allocator,
                const DecisionTreeParams& params, cudaStream_t stream,
                std::vector<SparseTreeNode<DataT, LabelT>>& sparsetree,
                IdxT& num_leaves, IdxT& depth) {
-  ML::PUSH_RANGE("DecisionTree::grow_tree in batched-levelalgo @builder.cuh");
+  ML::PUSH_RANGE("DT::grow_tree in batched-levelalgo @builder.cuh");
   Builder<ObjectiveT> builder;
   size_t d_wsize, h_wsize;
   builder.workspaceSize(d_wsize, h_wsize, treeid, seed, params, data, labels,
@@ -138,5 +138,5 @@ void grow_tree(std::shared_ptr<raft::mr::device::allocator> d_allocator,
     ASSERT(false, "Unknown split criterion.");
   }
 }
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
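
The second hunk also renames the NVTX range label inside `grow_tree`; these ranges make tree construction show up as a named span in profilers such as Nsight Systems. A sketch of the pairing, where the matching `ML::POP_RANGE()` is an assumption (it would come from `common/nvtx.hpp` and sits outside the hunks shown):

```cpp
// Only the PUSH_RANGE call is visible in the diff above; the POP_RANGE
// pairing at the end of grow_tree is assumed, not shown in this hunk.
ML::PUSH_RANGE("DT::grow_tree in batched-levelalgo @builder.cuh");
// ... size workspaces, run the batched level-by-level builder ...
ML::POP_RANGE();
```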
4 changes: 2 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/builder_base.cuh

```diff
@@ -29,7 +29,7 @@
 #include <common/nvtx.hpp>
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 /**
  * Internal struct used to do all the heavy-lifting required for tree building
@@ -430,5 +430,5 @@ struct Builder {
   }
 };  // end Builder
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
4 changes: 2 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/input.cuh

```diff
@@ -17,7 +17,7 @@
 #pragma once
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 template <typename DataT, typename LabelT, typename IdxT>
 struct Input {
@@ -41,5 +41,5 @@ struct Input {
   const DataT* quantiles;
 };
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
4 changes: 2 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/kernels.cuh

```diff
@@ -27,7 +27,7 @@
 #include "split.cuh"
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 /**
  * This struct has information about workload of a single threadblock of
@@ -419,5 +419,5 @@ __global__ void computeSplitKernel(
   sp.evalBestSplit(smem, splits + nid, mutex + nid);
 }
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
4 changes: 2 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/metrics.cuh

```diff
@@ -25,7 +25,7 @@
 #include "split.cuh"
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 struct IntBin {
   int x;
@@ -273,5 +273,5 @@ class MSEObjectiveFunction {
   }
 };
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
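
For orientation, `IntBin` accumulates per-class histogram counts for classification, while `MSEObjectiveFunction` scores regression splits. The standard criteria such objective functions evaluate are below; the exact normalization used in this file is not visible in the hunks above:

```latex
% Gini impurity of a node with class probabilities p_k; MSE of a regression
% node with mean \bar{y}; a candidate split is scored by impurity reduction.
\[
G = 1 - \sum_{k} p_k^2, \qquad
\mathrm{MSE} = \frac{1}{n} \sum_{i=1}^{n} \bigl(y_i - \bar{y}\bigr)^2, \qquad
\Delta = I_{\mathrm{parent}} - \frac{n_L}{n}\, I_L - \frac{n_R}{n}\, I_R
\]
```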
4 changes: 2 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/node.cuh

```diff
@@ -20,7 +20,7 @@
 #include "split.cuh"
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 /**
  * @brief All info pertaining to a node in the decision tree.
@@ -134,5 +134,5 @@ void printNodes(Node<DataT, LabelT, IdxT>* nodes, IdxT len, cudaStream_t s) {
   CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
6 changes: 3 additions & 3 deletions cpp/src/decisiontree/batched-levelalgo/split.cuh

```diff
@@ -20,7 +20,7 @@
 #include <raft/linalg/unary_op.cuh>
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 /**
  * @brief All info pertaining to splitting a node
@@ -62,7 +62,7 @@ struct Split {
    * @brief Assignment operator overload
    *
    * @param[in] other source object from where to copy
-   * 
+   *
    * @return the reference to the copied object (typically useful for chaining)
    */
   DI volatile SplitT& operator=(const SplitT& other) volatile {
@@ -171,5 +171,5 @@ void printSplits(Split<DataT, IdxT>* splits, IdxT len, cudaStream_t s) {
   CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
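
A note on the `volatile`-qualified `operator=` shown in the second hunk: the best-split slot is shared across threadblocks (see the `mutex` passed to `evalBestSplit` in kernels.cuh above), so it is accessed through a `volatile` reference, and only a `volatile`-qualified overload is callable on such an object. A hedged sketch of the call site this enables; the helper below is illustrative, not code from this diff:

```cpp
// When shared_best refers to a mutex-guarded global slot, plain operator=
// would not be callable; the volatile overload forces real (uncached) stores.
template <typename DataT, typename IdxT>
__device__ void publish(volatile ML::DT::Split<DataT, IdxT>& shared_best,
                        const ML::DT::Split<DataT, IdxT>& mine) {
  shared_best = mine;  // resolves to: volatile SplitT& operator=(const SplitT&) volatile
}
```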
(Diffs for the remaining 19 changed files are not shown.)