RF/DT cleanup (#4005)
* Prunes the RF/DT C++ layers by purging legacy code and wrapper classes
* Unifies regression and classification under a single class each in the DecisionTree and RandomForest codebases (see the namespace sketch after the task list below)
* Some bug fixes
* Effort toward issues #3999 and #3089

---
EDIT:
Task list:
- [x] Unify and eliminate code duplication in `DecisionTreeClassifier`, `DecisionTreeRegressor`, and `DecisionTreeBase`
- [x] Unify and eliminate code duplication in `rf`, `rfClassifier`, and `rfRegressor`
- [x] File-naming rearrangements (get rid of `*_impl.cuh` files)
- [x] Remove the exposed Decision Tree C++ `fit`/`predict` API, as it is currently unused
- [x] Tune and clean up metric/timing calculation in RF, and remove unused variables
- [x] Cython-layer refactoring for checks and warnings pertaining to keyword arguments
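
The most visible downstream change is the namespace rename: everything that previously lived under `ML::DecisionTree` now lives under `ML::DT`. A minimal before/after sketch, using the two types that appear in the hunks below:

```cpp
// Before this PR: decision-tree types lived in the ML::DecisionTree namespace.
ML::DecisionTree::DecisionTreeParams old_params;
ML::DecisionTree::TreeMetaDataNode<float, int>* old_trees;

// After this PR: the namespace is shortened to ML::DT; the types are unchanged.
ML::DT::DecisionTreeParams params;
ML::DT::TreeMetaDataNode<float, int>* trees;
```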

Authors:
  - Venkat (https://github.com/venkywonka)
  - Rory Mitchell (https://github.com/RAMitchell)

Approvers:
  - Rory Mitchell (https://github.com/RAMitchell)
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: #4005
venkywonka authored Jul 1, 2021
1 parent 73d946d commit 033a21f
Showing 28 changed files with 958 additions and 1,678 deletions.
4 changes: 2 additions & 2 deletions cpp/include/cuml/ensemble/randomforest.hpp

```diff
@@ -84,7 +84,7 @@ struct RF_params {
  * N streams need N times RF workspace.
  */
   int n_streams;
-  DecisionTree::DecisionTreeParams tree_params;
+  DT::DecisionTreeParams tree_params;
 };
 
 void validity_check(const RF_params rf_params);
@@ -104,7 +104,7 @@ void postprocess_labels(int n_rows, std::vector<int>& labels,
 
 template <class T, class L>
 struct RandomForestMetaData {
-  DecisionTree::TreeMetaDataNode<T, L>* trees;
+  DT::TreeMetaDataNode<T, L>* trees;
   RF_params rf_params;
   //TODO can add prepare, train time, if needed
```
142 changes: 2 additions & 140 deletions cpp/include/cuml/tree/decisiontree.hpp

```diff
@@ -25,7 +25,7 @@ class handle_t;
 
 namespace ML {
 
-namespace DecisionTree {
+namespace DT {
 
 struct DecisionTreeParams {
   /**
@@ -146,148 +146,10 @@ std::string get_tree_text(const TreeMetaDataNode<T, L> *tree);
 template <class T, class L>
 std::string get_tree_json(const TreeMetaDataNode<T, L> *tree);
 
-// ----------------------------- Classification ----------------------------------- //
-
-typedef TreeMetaDataNode<float, int> TreeClassifierF;
-typedef TreeMetaDataNode<double, int> TreeClassifierD;
-
-/**
- * @defgroup DecisionTreeClassifierFit Fit functions
- * @brief Build (i.e., fit, train) Decision Tree classifier for input data.
- * @param[in] handle: raft::handle_t
- * @param[in, out] tree: CPU pointer to TreeMetaDataNode. User allocated.
- * @param[in] data: train data (nrows samples, ncols features) in column major format,
- *   excluding labels. Device pointer.
- * @param[in] ncols: number of features (i.e., columns) excluding target feature.
- * @param[in] nrows: number of training data samples of the whole unsampled dataset.
- * @param[in] labels: 1D array of target features (int only). One label per training
- *   sample. Device pointer.
- *   Assumption: labels need to be preprocessed to map to ascending numbers from 0;
- *   needed for current gini impl. in decision tree.
- * @param[in,out] rowids: array of n_sampled_rows integers in [0, nrows) range.
- *   Device pointer. The same array is then rearranged when splits are made,
- *   allowing us to construct trees without rearranging the actual dataset.
- * @param[in] n_sampled_rows: number of training samples, after sampling.
- *   If using decision tree directly over the whole dataset: n_sampled_rows = nrows
- * @param[in] n_unique_labels: number of unique label values. Number of
- *   categories of classification.
- * @param[in] tree_params: Decision Tree training hyper parameter struct.
- * @param[in] seed: Controls the randomness in tree fitting/growing algorithm.
- * @{
- */
-void decisionTreeClassifierFit(const raft::handle_t &handle,
-                               TreeClassifierF *&tree, float *data,
-                               const int ncols, const int nrows, int *labels,
-                               unsigned int *rowids, const int n_sampled_rows,
-                               int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params,
-                               uint64_t seed);
-void decisionTreeClassifierFit(const raft::handle_t &handle,
-                               TreeClassifierD *&tree, double *data,
-                               const int ncols, const int nrows, int *labels,
-                               unsigned int *rowids, const int n_sampled_rows,
-                               int unique_labels,
-                               DecisionTree::DecisionTreeParams tree_params,
-                               uint64_t seed);
-/** @} */
-
-/**
- * @defgroup DecisionTreeClassifierPredict Predict functions
- * @brief Predict target feature for input data; n-ary classification for
- *   single feature supported. Inference of trees is CPU only for now.
- * @param[in] handle: raft::handle_t (currently unused; API placeholder)
- * @param[in] tree: CPU pointer to TreeMetaDataNode.
- * @param[in] rows: test data (n_rows samples, n_cols features) in row major format.
- *   Current impl. expects a CPU pointer. TODO future API change.
- * @param[in] n_rows: number of data samples.
- * @param[in] n_cols: number of features (excluding target feature).
- * @param[in,out] predictions: n_rows predicted labels. Current impl. expects a
- *   CPU pointer, user allocated. TODO future API change.
- * @param[in] verbosity: verbosity level for logging messages during execution.
- *   A negative value means to not perform an explicit `setLevel()` call, but to
- *   continue with the level that the caller itself might have set.
- * @{
- */
-void decisionTreeClassifierPredict(const raft::handle_t &handle,
-                                   const TreeClassifierF *tree,
-                                   const float *rows, const int n_rows,
-                                   const int n_cols, int *predictions,
-                                   int verbosity = -1);
-void decisionTreeClassifierPredict(const raft::handle_t &handle,
-                                   const TreeClassifierD *tree,
-                                   const double *rows, const int n_rows,
-                                   const int n_cols, int *predictions,
-                                   int verbosity = -1);
-/** @} */
-
-// ----------------------------- Regression ----------------------------------- //
-
-typedef TreeMetaDataNode<float, float> TreeRegressorF;
-typedef TreeMetaDataNode<double, double> TreeRegressorD;
-
-/**
- * @defgroup DecisionTreeRegressorFit Fit functions
- * @brief Build (i.e., fit, train) Decision Tree regressor for input data.
- * @param[in] handle: raft::handle_t
- * @param[in, out] tree: CPU pointer to TreeMetaDataNode. User allocated.
- * @param[in] data: train data (nrows samples, ncols features) in column major format,
- *   excluding labels. Device pointer.
- * @param[in] ncols: number of features (i.e., columns) excluding target feature.
- * @param[in] nrows: number of training data samples of the whole unsampled dataset.
- * @param[in] labels: 1D array of target features (float or double). One label per
- *   training sample. Device pointer.
- * @param[in,out] rowids: array of n_sampled_rows integers in [0, nrows) range.
- *   Device pointer. The same array is then rearranged when splits are made,
- *   allowing us to construct trees without rearranging the actual dataset.
- * @param[in] n_sampled_rows: number of training samples, after sampling. If using decision
- *   tree directly over the whole dataset: n_sampled_rows = nrows
- * @param[in] tree_params: Decision Tree training hyper parameter struct.
- * @param[in] seed: Controls the randomness in tree fitting/growing algorithm.
- * @{
- */
-void decisionTreeRegressorFit(const raft::handle_t &handle,
-                              TreeRegressorF *&tree, float *data,
-                              const int ncols, const int nrows, float *labels,
-                              unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params,
-                              uint64_t seed);
-void decisionTreeRegressorFit(const raft::handle_t &handle,
-                              TreeRegressorD *&tree, double *data,
-                              const int ncols, const int nrows, double *labels,
-                              unsigned int *rowids, const int n_sampled_rows,
-                              DecisionTree::DecisionTreeParams tree_params,
-                              uint64_t seed);
-/** @} */
-
-/**
- * @defgroup DecisionTreeRegressorPredict Predict functions
- * @brief Predict target feature for input data; regression for single feature supported.
- *   Inference of trees is CPU only for now.
- * @param[in] handle: raft::handle_t (currently unused; API placeholder)
- * @param[in] tree: CPU pointer to TreeMetaDataNode.
- * @param[in] rows: test data (n_rows samples, n_cols features) in row major format.
- *   Current impl. expects a CPU pointer. TODO future API change.
- * @param[in] n_rows: number of data samples.
- * @param[in] n_cols: number of features (excluding target feature).
- * @param[in,out] predictions: n_rows predicted labels. Current impl. expects a CPU
- *   pointer, user allocated. TODO future API change.
- * @param[in] verbosity: verbosity level for logging messages during execution.
- *   A negative value means to not perform an explicit `setLevel()` call, but to
- *   continue with the level that the caller itself might have set.
- * @{
- */
-void decisionTreeRegressorPredict(const raft::handle_t &handle,
-                                  const TreeRegressorF *tree, const float *rows,
-                                  const int n_rows, const int n_cols,
-                                  float *predictions, int verbosity = -1);
-void decisionTreeRegressorPredict(const raft::handle_t &handle,
-                                  const TreeRegressorD *tree,
-                                  const double *rows, const int n_rows,
-                                  const int n_cols, double *predictions,
-                                  int verbosity = -1);
-/** @} */
 
-} // End namespace DecisionTree
+} // End namespace DT
 } //End namespace ML
```
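
For reference, a minimal sketch of how the now-removed classifier entry points were called, reconstructed purely from the signatures deleted above; the includes, buffer setup, and hyperparameter defaults are assumptions, not part of this diff. Per the task list, nothing in cuML was calling this surface, which is what made the removal safe.

```cpp
#include <cuml/tree/decisiontree.hpp>  // pre-#4005 header; include paths assumed
#include <raft/handle.hpp>
#include <cuda_runtime.h>
#include <vector>

int main() {
  raft::handle_t handle;
  const int nrows = 1000, ncols = 10, n_unique_labels = 2;

  // Device buffers; filling them with real training data is omitted here.
  float* d_data;           // [nrows x ncols], column-major
  int* d_labels;           // one label per row, mapped to 0..n_unique_labels-1
  unsigned int* d_rowids;  // row indices; rearranged in place as splits are made
  cudaMalloc(&d_data, sizeof(float) * nrows * ncols);
  cudaMalloc(&d_labels, sizeof(int) * nrows);
  cudaMalloc(&d_rowids, sizeof(unsigned int) * nrows);

  auto* tree = new ML::DecisionTree::TreeClassifierF();  // user-allocated, per the docs
  ML::DecisionTree::DecisionTreeParams params;           // defaults assumed

  ML::DecisionTree::decisionTreeClassifierFit(
      handle, tree, d_data, ncols, nrows, d_labels, d_rowids,
      /*n_sampled_rows=*/nrows, n_unique_labels, params, /*seed=*/42ULL);

  // Inference was CPU-side: rows and predictions are host pointers.
  std::vector<float> h_rows(nrows * ncols);  // row-major test data
  std::vector<int> h_preds(nrows);
  ML::DecisionTree::decisionTreeClassifierPredict(
      handle, tree, h_rows.data(), nrows, ncols, h_preds.data());
  return 0;
}
```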
6 changes: 3 additions & 3 deletions cpp/src/decisiontree/batched-levelalgo/builder.cuh

```diff
@@ -27,7 +27,7 @@
 #include <common/nvtx.hpp>
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 template <typename ObjectiveT, typename DataT = typename ObjectiveT::DataT,
           typename LabelT = typename ObjectiveT::LabelT,
@@ -57,7 +57,7 @@ void grow_tree(std::shared_ptr<raft::mr::device::allocator> d_allocator,
                const DecisionTreeParams& params, cudaStream_t stream,
                std::vector<SparseTreeNode<DataT, LabelT>>& sparsetree,
                IdxT& num_leaves, IdxT& depth) {
-  ML::PUSH_RANGE("DecisionTree::grow_tree in batched-levelalgo @builder.cuh");
+  ML::PUSH_RANGE("DT::grow_tree in batched-levelalgo @builder.cuh");
   Builder<ObjectiveT> builder;
   size_t d_wsize, h_wsize;
   builder.workspaceSize(d_wsize, h_wsize, treeid, seed, params, data, labels,
@@ -138,5 +138,5 @@ void grow_tree(std::shared_ptr<raft::mr::device::allocator> d_allocator,
     ASSERT(false, "Unknown split criterion.");
   }
 }
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
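
The second hunk also renames the NVTX range label inside `grow_tree`; these ranges make tree construction show up as a named span in profilers such as Nsight Systems. A sketch of the pairing, where the matching `ML::POP_RANGE()` is an assumption (it would come from `common/nvtx.hpp` and sits outside the hunks shown):

```cpp
// Only the PUSH_RANGE call is visible in the diff above; the POP_RANGE
// pairing at the end of grow_tree is assumed, not shown in this hunk.
ML::PUSH_RANGE("DT::grow_tree in batched-levelalgo @builder.cuh");
// ... size workspaces, run the batched level-by-level builder ...
ML::POP_RANGE();
```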
4 changes: 2 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/builder_base.cuh

```diff
@@ -29,7 +29,7 @@
 #include <common/nvtx.hpp>
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 /**
  * Internal struct used to do all the heavy-lifting required for tree building
@@ -430,5 +430,5 @@ struct Builder {
   }
 };  // end Builder
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
4 changes: 2 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/input.cuh

```diff
@@ -17,7 +17,7 @@
 #pragma once
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 template <typename DataT, typename LabelT, typename IdxT>
 struct Input {
@@ -41,5 +41,5 @@ struct Input {
   const DataT* quantiles;
 };
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
4 changes: 2 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/kernels.cuh

```diff
@@ -27,7 +27,7 @@
 #include "split.cuh"
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 /**
  * This struct has information about workload of a single threadblock of
@@ -419,5 +419,5 @@ __global__ void computeSplitKernel(
   sp.evalBestSplit(smem, splits + nid, mutex + nid);
 }
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
4 changes: 2 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/metrics.cuh

```diff
@@ -25,7 +25,7 @@
 #include "split.cuh"
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 struct IntBin {
   int x;
@@ -273,5 +273,5 @@ class MSEObjectiveFunction {
   }
 };
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
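
For orientation, `IntBin` accumulates per-class histogram counts for classification, while `MSEObjectiveFunction` scores regression splits. The standard criteria such objective functions evaluate are below; the exact normalization used in this file is not visible in the hunks above:

```latex
% Gini impurity of a node with class probabilities p_k; MSE of a regression
% node with mean \bar{y}; a candidate split is scored by impurity reduction.
\[
G = 1 - \sum_{k} p_k^2, \qquad
\mathrm{MSE} = \frac{1}{n} \sum_{i=1}^{n} \bigl(y_i - \bar{y}\bigr)^2, \qquad
\Delta = I_{\mathrm{parent}} - \frac{n_L}{n}\, I_L - \frac{n_R}{n}\, I_R
\]
```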
4 changes: 2 additions & 2 deletions cpp/src/decisiontree/batched-levelalgo/node.cuh

```diff
@@ -20,7 +20,7 @@
 #include "split.cuh"
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 /**
  * @brief All info pertaining to a node in the decision tree.
@@ -134,5 +134,5 @@ void printNodes(Node<DataT, LabelT, IdxT>* nodes, IdxT len, cudaStream_t s) {
   CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
6 changes: 3 additions & 3 deletions cpp/src/decisiontree/batched-levelalgo/split.cuh

```diff
@@ -20,7 +20,7 @@
 #include <raft/linalg/unary_op.cuh>
 
 namespace ML {
-namespace DecisionTree {
+namespace DT {
 
 /**
  * @brief All info pertaining to splitting a node
@@ -62,7 +62,7 @@ struct Split {
    * @brief Assignment operator overload
    *
    * @param[in] other source object from where to copy
-   * 
+   *
    * @return the reference to the copied object (typically useful for chaining)
    */
   DI volatile SplitT& operator=(const SplitT& other) volatile {
@@ -171,5 +171,5 @@ void printSplits(Split<DataT, IdxT>* splits, IdxT len, cudaStream_t s) {
   CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-}  // namespace DecisionTree
+}  // namespace DT
 }  // namespace ML
```
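
A note on the `volatile`-qualified `operator=` shown in the second hunk: the best-split slot is shared across threadblocks (see the `mutex` passed to `evalBestSplit` in kernels.cuh above), so it is accessed through a `volatile` reference, and only a `volatile`-qualified overload is callable on such an object. A hedged sketch of the call site this enables; the helper below is illustrative, not code from this diff:

```cpp
// When shared_best refers to a mutex-guarded global slot, plain operator=
// would not be callable; the volatile overload forces real (uncached) stores.
template <typename DataT, typename IdxT>
__device__ void publish(volatile ML::DT::Split<DataT, IdxT>& shared_best,
                        const ML::DT::Split<DataT, IdxT>& mine) {
  shared_best = mine;  // resolves to: volatile SplitT& operator=(const SplitT&) volatile
}
```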
(Diffs for the remaining 19 changed files are not shown.)