diff --git a/doc/parameter.rst b/doc/parameter.rst
index 3ad297b5a3c0..d977a35e50a2 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -181,12 +181,17 @@ Parameters for Tree Booster
   - Maximum number of discrete bins to bucket continuous features.
   - Increasing this number improves the optimality of splits at the cost of higher computation time.
 
-* ``predictor``, [default=``cpu_predictor``]
+* ``predictor``, [default=``auto``]
 
   - The type of predictor algorithm to use. Provides the same results but allows the use of GPU or CPU.
 
+  - ``auto``: Configure predictor based on heuristics.
   - ``cpu_predictor``: Multicore CPU prediction algorithm.
-  - ``gpu_predictor``: Prediction using GPU. Default when ``tree_method`` is ``gpu_hist``.
+  - ``gpu_predictor``: Prediction using GPU. Used when ``tree_method`` is ``gpu_hist``.
+    When ``predictor`` is set to the default value ``auto``, the ``gpu_hist`` tree method is
+    able to provide GPU-based prediction without copying the training data to GPU memory.
+    If ``gpu_predictor`` is explicitly specified, all data is copied into GPU memory;
+    this is recommended only for performing prediction tasks.
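+
+    A minimal usage sketch of this heuristic (Python API, assuming a CUDA-enabled build):
+
+    .. code-block:: python
+
+        import numpy as np
+        import xgboost as xgb
+
+        X = np.random.randn(100, 10)
+        y = np.random.randn(100)
+        dtrain = xgb.DMatrix(X, label=y)
+
+        # `predictor` is left as `auto`: `gpu_hist` serves predictions for the
+        # training data from its own cache, without copying the data to GPU memory.
+        bst = xgb.train({'tree_method': 'gpu_hist'}, dtrain, num_boost_round=4)
+
+        # Explicitly requesting `gpu_predictor` copies all data onto the device;
+        # recommended only when the booster is used purely for prediction.
+        bst.set_param({'predictor': 'gpu_predictor'})
+        predt = bst.predict(dtrain)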
 
 * ``num_parallel_tree``, [default=1]
   - Number of parallel trees constructed during each iteration. This option is used to support boosted random forest.
diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
index b7ba637cf75a..9779c76d9850 100644
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -28,7 +28,7 @@ namespace xgboost {
  */
 class GradientBooster {
  protected:
-  GenericParameter const* learner_param_;
+  GenericParameter const* generic_param_;
 
  public:
   /*! \brief virtual destructor */
diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index b250c020c30d..17b4225b1133 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -12,6 +12,9 @@ namespace xgboost {
 struct GenericParameter : public XGBoostParameter<GenericParameter> {
+  // Constant representing the device ID of CPU.
+  static int constexpr kCpuId = -1;
+
   // stored random seed
   int seed;
   // whether seed the PRNG each iteration
@@ -24,6 +27,8 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
   // gpu page size in external memory mode, 0 means using the default.
   size_t gpu_page_size;
 
+  void ConfigureGpuId(bool require_gpu);
+
   void CheckDeprecated() {
     if (this->n_gpus != 0) {
       LOG(WARNING)
diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h
index 8960a1acf284..b49a4899e33f 100644
--- a/include/xgboost/predictor.h
+++ b/include/xgboost/predictor.h
@@ -26,6 +26,15 @@ struct GBTreeModel;
 }  // namespace gbm
 
 namespace xgboost {
+/**
+ * \struct PredictionCacheEntry
+ *
+ * \brief Contains pointer to input matrix and associated cached predictions.
+ */
+struct PredictionCacheEntry {
+  std::shared_ptr<DMatrix> data;
+  HostDeviceVector<bst_float> predictions;
+};
 
 /**
  * \class Predictor
@@ -41,23 +50,37 @@ namespace xgboost {
 class Predictor {
  protected:
-  GenericParameter const* learner_param_;
+  /*
+   * \brief Runtime parameters.
+   */
+  GenericParameter const* generic_param_;
+  /**
+   * \brief Map of matrices and associated cached predictions to facilitate
+   *        storing and looking up predictions.
+   */
+  std::shared_ptr<std::unordered_map<DMatrix*, PredictionCacheEntry>> cache_;
+
+  std::unordered_map<DMatrix*, PredictionCacheEntry>::iterator FindCache(DMatrix const* dmat) {
+    auto cache_entry = std::find_if(
+        cache_->begin(), cache_->end(),
+        [dmat](std::pair<DMatrix *const, PredictionCacheEntry> const &kv) {
+          return kv.second.data.get() == dmat;
+        });
+    return cache_entry;
+  }
 
  public:
+  Predictor(GenericParameter const* generic_param,
+            std::shared_ptr<std::unordered_map<DMatrix*, PredictionCacheEntry>> cache) :
+      generic_param_{generic_param}, cache_{cache} {}
   virtual ~Predictor() = default;
 
   /**
-   * \fn virtual void Predictor::Init(const std::vector<std::pair<std::string, std::string>> &cfg, const std::vector<std::shared_ptr<DMatrix>> &cache);
-   *
    * \brief Configure and register input matrices in prediction cache.
    *
    * \param cfg   The configuration.
-   * \param cache Vector of DMatrix's to be used in prediction.
    */
-
-  virtual void Configure(const std::vector<std::pair<std::string, std::string>>& cfg,
-                         const std::vector<std::shared_ptr<DMatrix>>& cache);
+  virtual void Configure(const std::vector<std::pair<std::string, std::string>>& cfg);
 
   /**
    * \brief Generate batch predictions for a given feature matrix. May use
@@ -162,45 +185,33 @@ class Predictor {
                                        unsigned condition_feature = 0) = 0;
 
   virtual void PredictInteractionContributions(DMatrix* dmat,
-                                               std::vector<bst_float>* out_contribs,
-                                               const gbm::GBTreeModel& model,
-                                               unsigned ntree_limit = 0,
-                                               std::vector<bst_float>* tree_weights = nullptr,
-                                               bool approximate = false) = 0;
-
-  /**
-   * \fn static Predictor* Predictor::Create(std::string name);
-   *
-   * \brief Creates a new Predictor*.
-   *
-   */
+                                               std::vector<bst_float>* out_contribs,
+                                               const gbm::GBTreeModel& model,
+                                               unsigned ntree_limit = 0,
+                                               std::vector<bst_float>* tree_weights = nullptr,
+                                               bool approximate = false) = 0;
 
-  static Predictor* Create(std::string const& name, GenericParameter const*);
-
- protected:
   /**
-   * \struct PredictionCacheEntry
+   * \brief Creates a new Predictor*.
    *
-   * \brief Contains pointer to input matrix and associated cached predictions.
-   */
-  struct PredictionCacheEntry {
-    std::shared_ptr<DMatrix> data;
-    HostDeviceVector<bst_float> predictions;
-  };
-
-  /**
-   * \brief Map of matrices and associated cached predictions to facilitate
-   * storing and looking up predictions.
+   * \param name          Name of the predictor.
+   * \param generic_param Pointer to runtime parameters.
+   * \param cache         Pointer to prediction cache.
    */
-  std::unordered_map<DMatrix*, PredictionCacheEntry> cache_;
+  static Predictor* Create(
+      std::string const& name, GenericParameter const* generic_param,
+      std::shared_ptr<std::unordered_map<DMatrix*, PredictionCacheEntry>> cache);
 };
 
 /*!
  * \brief Registry entry for predictor.
  */
 struct PredictorReg
-    : public dmlc::FunctionRegEntryBase<PredictorReg, std::function<Predictor*()>> {};
+    : public dmlc::FunctionRegEntryBase<
+          PredictorReg, std::function<Predictor*(GenericParameter const*, std::shared_ptr<std::unordered_map<DMatrix*, PredictionCacheEntry>>)>> {};
 
 #define XGBOOST_REGISTER_PREDICTOR(UniqueId, Name)      \
   static DMLC_ATTRIBUTE_UNUSED ::xgboost::PredictorReg& \
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index f1d2b3d94148..f5735202a517 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -65,7 +65,7 @@ class GBLinear : public GradientBooster {
       model_.param.InitAllowUnknown(cfg);
     }
     param_.UpdateAllowUnknown(cfg);
-    updater_.reset(LinearUpdater::Create(param_.updater, learner_param_));
+    updater_.reset(LinearUpdater::Create(param_.updater, generic_param_));
     updater_->Configure(cfg);
     monitor_.Init("GBLinear");
     if (param_.updater == "gpu_coord_descent") {
diff --git a/src/gbm/gbm.cc b/src/gbm/gbm.cc
index e0bc355f8241..644888b88ab4 100644
--- a/src/gbm/gbm.cc
+++ b/src/gbm/gbm.cc
@@ -13,7 +13,7 @@ DMLC_REGISTRY_ENABLE(::xgboost::GradientBoosterReg);
 namespace xgboost {
 GradientBooster* GradientBooster::Create(
     const std::string& name,
-    GenericParameter const* learner_param,
+    GenericParameter const* generic_param,
     const std::vector<std::shared_ptr<DMatrix> >& cache_mats,
     bst_float base_margin) {
   auto *e = ::dmlc::Registry< ::xgboost::GradientBoosterReg>::Get()->Find(name);
@@ -21,7 +21,7 @@ GradientBooster* GradientBooster::Create(
     LOG(FATAL) << "Unknown gbm type " << name;
   }
   auto p_bst = (e->body)(cache_mats, base_margin);
-  p_bst->learner_param_ = learner_param;
+  p_bst->generic_param_ = generic_param;
   return p_bst;
 }
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 694d8cc2f3ed..ec21cdd014ca 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -46,42 +46,44 @@ void GBTree::Configure(const Args& cfg) {
   // configure predictors
   if (!cpu_predictor_) {
     cpu_predictor_ = std::unique_ptr<Predictor>(
-        Predictor::Create("cpu_predictor", this->learner_param_));
-    cpu_predictor_->Configure(cfg, cache_);
+        Predictor::Create("cpu_predictor", this->generic_param_, cache_));
   }
+  cpu_predictor_->Configure(cfg);
 #if defined(XGBOOST_USE_CUDA)
-  if (!gpu_predictor_) {
+  auto n_gpus = common::AllVisibleGPUs();
+  if (!gpu_predictor_ && n_gpus != 0) {
     gpu_predictor_ = std::unique_ptr<Predictor>(
-        Predictor::Create("gpu_predictor", this->learner_param_));
-    gpu_predictor_->Configure(cfg, cache_);
+        Predictor::Create("gpu_predictor", this->generic_param_, cache_));
+  }
+  if (n_gpus != 0) {
+    gpu_predictor_->Configure(cfg);
   }
 #endif  // defined(XGBOOST_USE_CUDA)
 
   monitor_.Init("GBTree");
 
-  specified_predictor_ = std::any_of(cfg.cbegin(), cfg.cend(),
-                                     [](std::pair<std::string, std::string> const& arg) {
-                                       return arg.first == "predictor";
-                                     });
-  if (!specified_predictor_ && tparam_.tree_method == TreeMethod::kGPUHist) {
-    tparam_.predictor = "gpu_predictor";
-  }
-
   specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(),
                                    [](std::pair<std::string, std::string> const& arg) {
                                      return arg.first == "updater";
                                    });
-  if (specified_updater_) {
+
+  if (specified_updater_ && !showed_updater_warning_) {
     LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` "
         "parameter. The `tree_method` parameter will be ignored. "
         "Incorrect sequence of updaters will produce undefined "
         "behavior. For common uses, we recommend using "
         "`tree_method` parameter instead.";
+    // Don't drive users to silent XGBoost.
+    showed_updater_warning_ = true;
   } else {
     this->ConfigureUpdaters();
     LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq;
   }
 
+  for (auto& up : updaters_) {
+    up->Configure(cfg);
+  }
+
   configured_ = true;
 }
 
@@ -162,9 +164,6 @@ void GBTree::ConfigureUpdaters() {
     case TreeMethod::kGPUHist:
       this->AssertGPUSupport();
      tparam_.updater_seq = "grow_gpu_hist";
-      if (!specified_predictor_) {
-        tparam_.predictor = "gpu_predictor";
-      }
       break;
     default:
       LOG(FATAL) << "Unknown tree_method ("
@@ -239,7 +238,7 @@ void GBTree::InitUpdater(Args const& cfg) {
   }
 
   for (const std::string& pstr : ups) {
-    std::unique_ptr<TreeUpdater> up(TreeUpdater::Create(pstr.c_str(), learner_param_));
+    std::unique_ptr<TreeUpdater> up(TreeUpdater::Create(pstr.c_str(), generic_param_));
     up->Configure(cfg);
     updaters_.push_back(std::move(up));
   }
diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index 5c9373cfd848..cc10e2c91f92 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -10,21 +10,22 @@
 #include <dmlc/omp.h>
 #include <dmlc/parameter.h>
 
-#include <xgboost/logging.h>
-#include <xgboost/gbm.h>
-#include <xgboost/predictor.h>
-#include <xgboost/tree_updater.h>
-#include <xgboost/parameter.h>
-
 #include <vector>
 #include <memory>
 #include <utility>
 #include <string>
 #include <unordered_map>
-
-#include "gbtree_model.h"
+#include <algorithm>
+
+#include "xgboost/logging.h"
+#include "xgboost/gbm.h"
+#include "xgboost/predictor.h"
+#include "xgboost/tree_updater.h"
+#include "xgboost/parameter.h"
+#include "xgboost/json.h"
 #include "xgboost/host_device_vector.h"
+#include "gbtree_model.h"
 #include "../common/common.h"
 #include "../common/timer.h"
@@ -39,10 +40,17 @@ enum class TreeProcessType : int {
   kDefault = 0,
   kUpdate = 1
 };
+
+enum class PredictorType : int {
+  kAuto = 0,
+  kCPUPredictor,
+  kGPUPredictor
+};
 }  // namespace xgboost
 
 DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod);
 DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType);
+DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType);
 
 namespace xgboost {
 namespace gbm {
@@ -58,8 +66,8 @@ struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
   std::string updater_seq;
   /*! \brief type of boosting process to run */
   TreeProcessType process_type;
-  // predictor name
-  std::string predictor;
+  // predictor type
+  PredictorType predictor;
   // tree construction method
   TreeMethod tree_method;
   // declare parameters
@@ -81,8 +89,11 @@ struct GBTreeTrainParam : public XGBoostParameter<GBTreeTrainParam> {
     // add alias
     DMLC_DECLARE_ALIAS(updater_seq, updater);
     DMLC_DECLARE_FIELD(predictor)
-        .set_default("cpu_predictor")
-        .describe("Predictor algorithm type");
+        .set_default(PredictorType::kAuto)
+        .add_enum("auto", PredictorType::kAuto)
+        .add_enum("cpu_predictor", PredictorType::kCPUPredictor)
+        .add_enum("gpu_predictor", PredictorType::kGPUPredictor)
+        .describe("Predictor algorithm type");
     DMLC_DECLARE_FIELD(tree_method)
         .set_default(TreeMethod::kAuto)
         .add_enum("auto", TreeMethod::kAuto)
@@ -145,7 +156,10 @@ class GBTree : public GradientBooster {
   explicit GBTree(bst_float base_margin) : model_(base_margin) {}
 
   void InitCache(const std::vector<std::shared_ptr<DMatrix> > &cache) {
-    cache_ = cache;
+    cache_ = std::make_shared<std::unordered_map<DMatrix*, PredictionCacheEntry>>();
+    for (std::shared_ptr<DMatrix> const& d : cache) {
+      (*cache_)[d.get()].data = d;
+    }
   }
 
   void Configure(const Args& cfg) override;
@@ -163,7 +177,7 @@ class GBTree : public GradientBooster {
 
   bool UseGPU() const override {
     return
-        tparam_.predictor == "gpu_predictor" ||
+        tparam_.predictor == PredictorType::kGPUPredictor ||
         tparam_.tree_method == TreeMethod::kGPUHist;
   }
 
@@ -246,62 +260,82 @@ class GBTree : public GradientBooster {
   std::unique_ptr<Predictor> const& GetPredictor(HostDeviceVector<float> const* out_pred = nullptr,
                                                  DMatrix* f_dmat = nullptr) const {
     CHECK(configured_);
-    auto on_device = f_dmat && (*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
+    if (tparam_.predictor != PredictorType::kAuto) {
+      if (tparam_.predictor == PredictorType::kGPUPredictor) {
 #if defined(XGBOOST_USE_CUDA)
+        CHECK(gpu_predictor_);
+        return gpu_predictor_;
+#else
+        this->AssertGPUSupport();
+#endif  // defined(XGBOOST_USE_CUDA)
+      }
+      CHECK(cpu_predictor_);
+      return cpu_predictor_;
+    }
+
+    auto on_device = f_dmat && (*(f_dmat->GetBatches<SparsePage>().begin())).data.DeviceCanRead();
+
     // Use GPU Predictor if data is already on device.
-    if (!specified_predictor_ && on_device) {
+    if (on_device) {
+#if defined(XGBOOST_USE_CUDA)
       CHECK(gpu_predictor_);
       return gpu_predictor_;
-    }
+#else
+      LOG(FATAL) << "Data is on CUDA device, but XGBoost is not compiled with CUDA support.";
+      return cpu_predictor_;
 #endif  // defined(XGBOOST_USE_CUDA)
+    }
+
     // GPU_Hist by default has prediction cache calculated from quantile values, so GPU
     // Predictor is not used for training dataset.  But when XGBoost performs continued
     // training with an existing model, the prediction cache is not available and the number
-    // of tree doesn't equal zero, the whole training dataset got copied into GPU for
+    // of trees doesn't equal zero, so the whole training dataset gets copied into GPU for
     // precise prediction.  This condition tries to avoid such copy by calling CPU
-    // Predictor.
+    // Predictor instead.
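+    //
+    // An illustrative scenario: a loaded booster is trained further with
+    // gpu_hist.  `out_pred` is empty while `num_trees != 0`, so the initial
+    // predictions are computed once by the CPU predictor instead of copying
+    // the whole training dataset onto the device.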
     if ((out_pred && out_pred->Size() == 0) && (model_.param.num_trees != 0) &&
         // FIXME(trivialfis): Implement a better method for testing whether data is on
         // device after DMatrix refactoring is done.
         !on_device) {
-      return cpu_predictor_;
-    }
-
-    if (tparam_.predictor == "cpu_predictor") {
       CHECK(cpu_predictor_);
       return cpu_predictor_;
-    } else if (tparam_.predictor == "gpu_predictor") {
+    }
+
+    if (tparam_.tree_method == TreeMethod::kGPUHist) {
 #if defined(XGBOOST_USE_CUDA)
       CHECK(gpu_predictor_);
       return gpu_predictor_;
 #else
-      LOG(FATAL) << "XGBoost is not compiled with CUDA support.";
+      this->AssertGPUSupport();
       return cpu_predictor_;
 #endif  // defined(XGBOOST_USE_CUDA)
-    } else {
-      LOG(FATAL) << "Unknown predictor: " << tparam_.predictor;
-      return cpu_predictor_;
     }
+
+    CHECK(cpu_predictor_);
+    return cpu_predictor_;
   }
 
   // commit new trees all at once
-  virtual void CommitModel(
-      std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees);
+  virtual void CommitModel(std::vector<std::vector<std::unique_ptr<RegTree>>>&& new_trees);
 
   // --- data structure ---
   GBTreeModel model_;
   // training parameter
   GBTreeTrainParam tparam_;
   // ----training fields----
+  bool showed_updater_warning_ {false};
   bool specified_updater_   {false};
-  bool specified_predictor_ {false};
   bool configured_ {false};
   // configurations for tree
   Args cfg_;
   // the updaters that can be applied to each of tree
   std::vector<std::unique_ptr<TreeUpdater>> updaters_;
-  // Cached matrices
-  std::vector<std::shared_ptr<DMatrix>> cache_;
+  /**
+   * \brief Map of matrices and associated cached predictions to facilitate
+   *        storing and looking up predictions.
+   */
+  std::shared_ptr<std::unordered_map<DMatrix*, PredictionCacheEntry>> cache_;
+
   // Predictors
   std::unique_ptr<Predictor> cpu_predictor_;
 #if defined(XGBOOST_USE_CUDA)
   std::unique_ptr<Predictor> gpu_predictor_;
diff --git a/src/learner.cc b/src/learner.cc
index 21c00046bb08..fa1f289fa051 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -138,6 +138,36 @@ DMLC_REGISTER_PARAMETER(LearnerModelParam);
 DMLC_REGISTER_PARAMETER(LearnerTrainParam);
 DMLC_REGISTER_PARAMETER(GenericParameter);
 
+int constexpr GenericParameter::kCpuId;
+
+void GenericParameter::ConfigureGpuId(bool require_gpu) {
+#if defined(XGBOOST_USE_CUDA)
+  int32_t n_visible = common::AllVisibleGPUs();
+  if (n_visible == 0) {
+    // Running XGBoost compiled with CUDA on a CPU-only machine.
+    this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
+    return;
+  }
+
+  if (this->gpu_id == kCpuId) {  // 0. The user didn't specify `gpu_id'.
+    if (require_gpu) {           // 1. `tree_method' or `predictor' or both are using GPU.
+      // 2. Use device 0 as the default.
+      this->UpdateAllowUnknown(Args{{"gpu_id", "0"}});
+    }
+  }
+
+  // 3. When the booster is loaded from a memory image (Python pickle or R
+  //    raw model), the number of available GPUs could be different.  Wrap it around.
+  if (this->gpu_id != kCpuId && this->gpu_id >= n_visible) {
+    this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(gpu_id % n_visible)}});
+  }
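+  // e.g. a booster pickled with gpu_id = 3 on a 4-GPU machine and restored on
+  // a machine where n_visible = 2 ends up with gpu_id = 3 % 2 = 1.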
+#else
+  // Just set it to CPU, don't think about it.
+  this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}});
+#endif  // defined(XGBOOST_USE_CUDA)
+}
+
 /*!
  * \brief learner that performs gradient boosting for a specific objective
  * function.  It does training and prediction.
@@ -182,6 +212,8 @@ class LearnerImpl : public Learner {
     this->ConfigureGBM(old_tparam, args);
     this->ConfigureMetrics(args);
 
+    generic_param_.ConfigureGpuId(this->gbm_->UseGPU());
+
     this->configured_ = true;
     monitor_.Stop("Configure");
   }
@@ -282,12 +314,16 @@ class LearnerImpl : public Learner {
         kv.second = "cpu_predictor";
       }
 #endif  // XGBOOST_USE_CUDA
+#if defined(XGBOOST_USE_CUDA)
       // NO visible GPU in current environment
       if (is_gpu_predictor && common::AllVisibleGPUs() == 0) {
         cfg_["predictor"] = "cpu_predictor";
         kv.second = "cpu_predictor";
         LOG(INFO) << "Switch gpu_predictor to cpu_predictor.";
+      } else if (is_gpu_predictor) {
+        cfg_["predictor"] = "gpu_predictor";
       }
+#endif  // defined(XGBOOST_USE_CUDA)
       if (saved_configs_.find(saved_param) != saved_configs_.end()) {
         cfg_[saved_param] = kv.second;
       }
@@ -327,6 +363,8 @@ class LearnerImpl : public Learner {
     if (tparam_.dsplit == DataSplitMode::kAuto && rabit::IsDistributed()) {
       tparam_.dsplit = DataSplitMode::kRow;
     }
+
+    this->generic_param_.ConfigureGpuId(gbm_->UseGPU());
 
     this->configured_ = true;
   }
@@ -373,6 +411,18 @@ class LearnerImpl : public Learner {
         }
       }
     }
+#if defined(XGBOOST_USE_CUDA)
+    {
+      // Force save gpu_id.
+      if (std::none_of(extra_attr.cbegin(), extra_attr.cend(),
+                       [](std::pair<std::string, std::string> const& it) {
+                         return it.first == "SAVED_PARAM_gpu_id";
+                       })) {
+        mparam.contain_extra_attrs = 1;
+        extra_attr.emplace_back("SAVED_PARAM_gpu_id", std::to_string(generic_param_.gpu_id));
+      }
+    }
+#endif  // defined(XGBOOST_USE_CUDA)
     fo->Write(&mparam, sizeof(LearnerModelParam));
     fo->Write(tparam_.objective);
     fo->Write(tparam_.booster);
@@ -611,12 +661,6 @@ class LearnerImpl : public Learner {
                                          cache_, mparam_.base_score));
     }
     gbm_->Configure(args);
-
-    if (this->gbm_->UseGPU()) {
-      if (generic_param_.gpu_id == -1) {
-        generic_param_.gpu_id = 0;
-      }
-    }
   }
 
   // set number of features correctly.
diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc
index 4d04b1f4ad50..f9db9bf70156 100644
--- a/src/predictor/cpu_predictor.cc
+++ b/src/predictor/cpu_predictor.cc
@@ -1,6 +1,8 @@
 /*!
- * Copyright by Contributors 2017
+ * Copyright by Contributors 2017-2019
  */
+#include <cstdint>
+
 #include "xgboost/predictor.h"
 #include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"
@@ -43,10 +45,11 @@ class CPUPredictor : public Predictor {
       }
     }
   }
-  inline void PredLoopSpecalize(DMatrix* p_fmat,
-                                std::vector<bst_float>* out_preds,
-                                const gbm::GBTreeModel& model, int num_group,
-                                unsigned tree_begin, unsigned tree_end) {
+
+  void PredLoopInternal(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
+                        gbm::GBTreeModel const& model, int32_t tree_begin,
+                        int32_t tree_end) {
+    int32_t const num_group = model.param.num_output_group;
     const int nthread = omp_get_max_threads();
     InitThreadTemp(nthread, model.param.num_feature);
     std::vector<bst_float>& preds = *out_preds;
@@ -99,22 +102,15 @@ class CPUPredictor : public Predictor {
     }
   }
 
-  void PredLoopInternal(DMatrix* dmat, std::vector<bst_float>* out_preds,
-                        const gbm::GBTreeModel& model, int tree_begin,
-                        unsigned ntree_limit) {
-    // TODO(Rory): Check if this specialisation actually improves performance
-    PredLoopSpecalize(dmat, out_preds, model, model.param.num_output_group,
-                      tree_begin, ntree_limit);
-  }
-
   bool PredictFromCache(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                         const gbm::GBTreeModel& model,
-                        unsigned ntree_limit) {
+                        unsigned ntree_limit) const {
+    CHECK(cache_);
     if (ntree_limit == 0 ||
         ntree_limit * model.param.num_output_group >= model.trees.size()) {
-      auto it = cache_.find(dmat);
-      if (it != cache_.end()) {
+      auto it = cache_->find(dmat);
+      if (it != cache_->end()) {
         const HostDeviceVector<bst_float>& y = it->second.predictions;
         if (y.Size() != 0) {
           out_preds->Resize(y.Size());
@@ -130,6 +126,7 @@ class CPUPredictor : public Predictor {
   void InitOutPredictions(const MetaInfo& info,
                           HostDeviceVector<bst_float>* out_preds,
                           const gbm::GBTreeModel& model) const {
+    CHECK_NE(model.param.num_output_group, 0);
     size_t n = model.param.num_output_group * info.num_row_;
     const auto& base_margin = info.base_margin_.HostVector();
     out_preds->Resize(n);
@@ -150,21 +147,24 @@ class CPUPredictor : public Predictor {
         oss << "[number of data points], i.e. " << info.num_row_ << ". ";
       }
       oss << "Instead, all data points will use "
-          << "base_score = " << model.base_margin;
+          << "base_margin = " << model.base_margin;
       LOG(WARNING) << oss.str();
     }
-    std::fill(out_preds_h.begin(), out_preds_h.end(), model.base_margin);
+    std::fill(out_preds_h.begin(), out_preds_h.end(),
+              model.base_margin);
   }
 }
 
  public:
+  CPUPredictor(GenericParameter const* generic_param,
+               std::shared_ptr<std::unordered_map<DMatrix*, PredictionCacheEntry>> cache) :
+      Predictor{generic_param, cache} {}
   void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                     const gbm::GBTreeModel& model, int tree_begin,
                     unsigned ntree_limit = 0) override {
     if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
       return;
     }
-
     this->InitOutPredictions(dmat->Info(), out_preds, model);
 
     ntree_limit *= model.param.num_output_group;
@@ -174,6 +174,15 @@ class CPUPredictor : public Predictor {
 
     this->PredLoopInternal(dmat, &out_preds->HostVector(), model, tree_begin,
                            ntree_limit);
+
+    auto cache_entry = this->FindCache(dmat);
+    if (cache_entry == cache_->cend()) { return; }
+    if (cache_entry->second.predictions.Size() == 0) {
+      // See comment in GPUPredictor::PredictBatch.
+      InitOutPredictions(cache_entry->second.data->Info(),
+                         &(cache_entry->second.predictions), model);
+      cache_entry->second.predictions.Copy(*out_preds);
+    }
   }
 
   void UpdatePredictionCache(
       const gbm::GBTreeModel& model,
       std::vector<std::unique_ptr<TreeUpdater>>* updaters,
       int num_new_trees) override {
     int old_ntree = model.trees.size() - num_new_trees;
     // update cache entry
-    for (auto& kv : cache_) {
+    for (auto& kv : (*cache_)) {
       PredictionCacheEntry& e = kv.second;
 
       if (e.predictions.Size() == 0) {
@@ -215,7 +224,7 @@ class CPUPredictor : public Predictor {
     out_preds->resize(model.param.num_output_group *
                       (model.param.size_leaf_vector + 1));
     // loop over output groups
-    for (int gid = 0; gid < model.param.num_output_group; ++gid) {
+    for (uint32_t gid = 0; gid < model.param.num_output_group; ++gid) {
       (*out_preds)[gid] =
           PredValue(inst, model.trees, model.tree_info, gid,
                     &thread_temp[0], 0, ntree_limit) +
@@ -254,10 +263,9 @@ class CPUPredictor : public Predictor {
   }
 
   void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
-                           const gbm::GBTreeModel& model, unsigned ntree_limit,
+                           const gbm::GBTreeModel& model, uint32_t ntree_limit,
                            std::vector<bst_float>* tree_weights,
-                           bool approximate,
-                           int condition,
+                           bool approximate, int condition,
                            unsigned condition_feature) override {
     const int nthread = omp_get_max_threads();
     InitThreadTemp(nthread, model.param.num_feature);
@@ -268,7 +276,9 @@ class CPUPredictor : public Predictor {
       ntree_limit = static_cast<unsigned>(model.trees.size());
     }
     const int ngroup = model.param.num_output_group;
+    CHECK_NE(ngroup, 0);
     size_t const ncolumns = model.param.num_feature + 1;
+    CHECK_NE(ncolumns, 0);
     // allocate space for (number of features + bias) times the number of rows
     std::vector<bst_float>& contribs = *out_contribs;
     contribs.resize(info.num_row_ * ncolumns * model.param.num_output_group);
@@ -292,8 +302,7 @@ class CPUPredictor : public Predictor {
         RegTree::FVec& feats = thread_temp[omp_get_thread_num()];
         // loop over all classes
         for (int gid = 0; gid < ngroup; ++gid) {
-          bst_float* p_contribs =
-              &contribs[(row_idx * ngroup + gid) * ncolumns];
+          bst_float* p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns];
           feats.Fill(batch[i]);
           // calculate contributions
           for (unsigned j = 0; j < ntree_limit; ++j) {
@@ -307,7 +316,7 @@ class CPUPredictor : public Predictor {
             } else {
               model.trees[j]->CalculateContributionsApprox(feats, &this_tree_contribs[0]);
             }
-            for (int ci = 0 ; ci < ncolumns ; ++ci) {
+            for (size_t ci = 0 ; ci < ncolumns ; ++ci) {
              p_contribs[ci] += this_tree_contribs[ci] *
                                 (tree_weights == nullptr ? 1 : (*tree_weights)[j]);
             }
@@ -330,7 +339,7 @@ class CPUPredictor : public Predictor {
                                     bool approximate) override {
     const MetaInfo& info = p_fmat->Info();
     const int ngroup = model.param.num_output_group;
-    size_t ncolumns = model.param.num_feature;
+    size_t const ncolumns = model.param.num_feature;
     const unsigned row_chunk = ngroup * (ncolumns + 1) * (ncolumns + 1);
     const unsigned mrow_chunk = (ncolumns + 1) * (ncolumns + 1);
     const unsigned crow_chunk = ngroup * (ncolumns + 1);
@@ -375,7 +384,10 @@ class CPUPredictor : public Predictor {
 };
 
 XGBOOST_REGISTER_PREDICTOR(CPUPredictor, "cpu_predictor")
-    .describe("Make predictions using CPU.")
-    .set_body([]() { return new CPUPredictor(); });
+.describe("Make predictions using CPU.")
+.set_body([](GenericParameter const* generic_param,
+             std::shared_ptr<std::unordered_map<DMatrix*, PredictionCacheEntry>> cache) {
+            return new CPUPredictor(generic_param, cache);
+          });
 }  // namespace predictor
 }  // namespace xgboost
diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu
index e3371bfcefe5..9cc91b966bfc 100644
--- a/src/predictor/gpu_predictor.cu
+++ b/src/predictor/gpu_predictor.cu
@@ -202,7 +202,7 @@ class GPUPredictor : public xgboost::Predictor {
                  const thrust::host_vector<size_t>& h_tree_segments,
                  const thrust::host_vector<DevicePredictionNode>& h_nodes,
                  size_t tree_begin, size_t tree_end) {
-    dh::safe_cuda(cudaSetDevice(device_));
+    dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
     nodes_.resize(h_nodes.size());
     dh::safe_cuda(cudaMemcpyAsync(nodes_.data().get(), h_nodes.data(),
                                   sizeof(DevicePredictionNode) * h_nodes.size(),
@@ -224,7 +224,11 @@ class GPUPredictor : public xgboost::Predictor {
                        size_t num_features,
                        HostDeviceVector<bst_float>* predictions,
                        size_t batch_offset) {
-    dh::safe_cuda(cudaSetDevice(device_));
+    dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
+    batch.data.SetDevice(generic_param_->gpu_id);
+    batch.offset.SetDevice(generic_param_->gpu_id);
+    predictions->SetDevice(generic_param_->gpu_id);
+
     const uint32_t BLOCK_THREADS = 128;
     size_t num_rows = batch.Size();
     auto GRID_SIZE = static_cast<uint32_t>(common::DivRoundUp(num_rows, BLOCK_THREADS));
@@ -271,16 +275,19 @@ class GPUPredictor : public xgboost::Predictor {
                              HostDeviceVector<bst_float>* out_preds,
                              const gbm::GBTreeModel& model, size_t tree_begin,
                              size_t tree_end) {
-    if (tree_end - tree_begin == 0) { return; }
+    if (tree_end - tree_begin == 0) {
+      return;
+    }
     monitor_.StartCuda("DevicePredictInternal");
 
     InitModel(model, tree_begin, tree_end);
 
     size_t batch_offset = 0;
     for (auto &batch : dmat->GetBatches<SparsePage>()) {
-      batch.offset.SetDevice(device_);
-      batch.data.SetDevice(device_);
-      PredictInternal(batch, model.param.num_feature, out_preds, batch_offset);
+      batch.offset.SetDevice(generic_param_->gpu_id);
+      batch.data.SetDevice(generic_param_->gpu_id);
+      PredictInternal(batch, model.param.num_feature,
+                      out_preds, batch_offset);
       batch_offset += batch.Size() * model.param.num_output_group;
     }
 
@@ -288,19 +295,21 @@ class GPUPredictor : public xgboost::Predictor {
   }
 
  public:
-  GPUPredictor() : device_{-1} {}
+  GPUPredictor(GenericParameter const* generic_param,
+               std::shared_ptr<std::unordered_map<DMatrix*, PredictionCacheEntry>> cache) :
+      Predictor{generic_param, cache} {}
 
   ~GPUPredictor() override {
-    if (device_ >= 0) {
-      dh::safe_cuda(cudaSetDevice(device_));
+    if (generic_param_->gpu_id >= 0) {
+      dh::safe_cuda(cudaSetDevice(generic_param_->gpu_id));
     }
   }
 
   void PredictBatch(DMatrix* dmat, HostDeviceVector<bst_float>* out_preds,
                     const gbm::GBTreeModel& model, int tree_begin,
                     unsigned ntree_limit = 0) override {
-    int device = learner_param_->gpu_id;
-    CHECK_GE(device, 0);
+    int device = generic_param_->gpu_id;
+    CHECK_GE(device, 0) << "Set `gpu_id' to a non-negative value for processing GPU data.";
     ConfigureDevice(device);
 
     if (this->PredictFromCache(dmat, out_preds, model, ntree_limit)) {
       return;
     }
     this->InitOutPredictions(dmat->Info(), out_preds, model);
 
-    int tree_end = ntree_limit * model.param.num_output_group;
+    int32_t tree_end = ntree_limit * model.param.num_output_group;
     if (ntree_limit == 0 || ntree_limit > model.trees.size()) {
       tree_end = static_cast<unsigned>(model.trees.size());
     }
 
     DevicePredictInternal(dmat, out_preds, model, tree_begin, tree_end);
+
+    auto cache_entry = this->FindCache(dmat);
+    if (cache_entry == cache_->cend()) { return; }
+    if (cache_entry->second.predictions.Size() == 0) {
+      // Initialise the cache on the first iteration; this comes in useful
+      // when performing training continuation:
+      //
+      // 1. PredictBatch
+      // 2. CommitModel
+      //    - updater->UpdatePredictionCache
+      //
+      // If we don't initialise this cache, step 2 will receive an invalid
+      // cache, as the first step only modifies the predictions stored in the
+      // learner without going through the code below.
+      InitOutPredictions(cache_entry->second.data->Info(),
+                         &(cache_entry->second.predictions), model);
+      cache_entry->second.predictions.Copy(*out_preds);
+    }
   }
 
  protected:
@@ -324,7 +350,7 @@ class GPUPredictor : public xgboost::Predictor {
     size_t n_classes = model.param.num_output_group;
     size_t n = n_classes * info.num_row_;
     const HostDeviceVector<bst_float>& base_margin = info.base_margin_;
-    out_preds->SetDevice(device_);
+    out_preds->SetDevice(generic_param_->gpu_id);
     out_preds->Resize(n);
     if (base_margin.Size() != 0) {
       CHECK_EQ(base_margin.Size(), n);
@@ -338,8 +364,8 @@ class GPUPredictor : public xgboost::Predictor {
                         const gbm::GBTreeModel& model, unsigned ntree_limit) {
     if (ntree_limit == 0 ||
         ntree_limit * model.param.num_output_group >= model.trees.size()) {
-      auto it = cache_.find(dmat);
-      if (it != cache_.end()) {
+      auto it = (*cache_).find(dmat);
+      if (it != cache_->cend()) {
         const HostDeviceVector<bst_float>& y = it->second.predictions;
         if (y.Size() != 0) {
           monitor_.StartCuda("PredictFromCache");
@@ -360,7 +386,7 @@ class GPUPredictor : public xgboost::Predictor {
       int num_new_trees) override {
     auto old_ntree = model.trees.size() - num_new_trees;
     // update cache entry
-    for (auto& kv : cache_) {
+    for (auto& kv : (*cache_)) {
       PredictionCacheEntry& e = kv.second;
       DMatrix* dmat = kv.first;
       HostDeviceVector<bst_float>& predictions = e.predictions;
@@ -382,14 +408,14 @@ class GPUPredictor : public xgboost::Predictor {
   void PredictInstance(const SparsePage::Inst& inst,
                        std::vector<bst_float>* out_preds,
                        const gbm::GBTreeModel& model, unsigned ntree_limit) override {
-    LOG(FATAL) << "Internal error: " << __func__
+    LOG(FATAL) << "[Internal error]: " << __func__
               << " is not implemented in GPU Predictor.";
   }
 
   void PredictLeaf(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
                    const gbm::GBTreeModel& model,
                    unsigned ntree_limit) override {
-    LOG(FATAL) << "Internal error: " << __func__
+    LOG(FATAL) << "[Internal error]: " << __func__
               << " is not implemented in GPU Predictor.";
  }
 
@@ -399,7 +425,7 @@ class GPUPredictor : public xgboost::Predictor {
                            std::vector<bst_float>* tree_weights,
                            bool approximate, int condition,
                            unsigned condition_feature) override {
-    LOG(FATAL) << "Internal error: " << __func__
+    LOG(FATAL) << "[Internal error]: " << __func__
               << " is not implemented in GPU Predictor.";
   }
 
@@ -409,15 +435,14 @@ class GPUPredictor : public xgboost::Predictor {
                                            unsigned ntree_limit,
                                            std::vector<bst_float>* tree_weights,
                                            bool approximate) override {
-    LOG(FATAL) << "Internal error: " << __func__
+    LOG(FATAL) << "[Internal error]: " << __func__
               << " is not implemented in GPU Predictor.";
   }
 
-  void Configure(const std::vector<std::pair<std::string, std::string>>& cfg,
-                 const std::vector<std::shared_ptr<DMatrix>>& cache) override {
-    Predictor::Configure(cfg, cache);
+  void Configure(const std::vector<std::pair<std::string, std::string>>& cfg) override {
+    Predictor::Configure(cfg);
 
-    int device = learner_param_->gpu_id;
+    int device = generic_param_->gpu_id;
     if (device >= 0) {
       ConfigureDevice(device);
     }
@@ -426,14 +451,11 @@ class GPUPredictor : public xgboost::Predictor {
  private:
   /*! \brief Reconfigure the device when GPU is changed. */
   void ConfigureDevice(int device) {
-    if (device_ == device) return;
-    device_ = device;
-    if (device_ >= 0) {
-      max_shared_memory_bytes_ = dh::MaxSharedMemory(device_);
+    if (device >= 0) {
+      max_shared_memory_bytes_ = dh::MaxSharedMemory(device);
     }
   }
 
-  int device_;
   common::Monitor monitor_;
   dh::device_vector<DevicePredictionNode> nodes_;
   dh::device_vector<size_t> tree_segments_;
@@ -445,8 +467,11 @@ class GPUPredictor : public xgboost::Predictor {
 };
 
 XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor")
-    .describe("Make predictions using GPU.")
-    .set_body([]() { return new GPUPredictor(); });
+.describe("Make predictions using GPU.")
+.set_body([](GenericParameter const* generic_param,
+             std::shared_ptr<std::unordered_map<DMatrix*, PredictionCacheEntry>> cache) {
+            return new GPUPredictor(generic_param, cache);
+          });
 }  // namespace predictor
 }  // namespace xgboost
diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc
index 3aab7c4e8ac8..115caa8396e9 100644
--- a/src/predictor/predictor.cc
+++ b/src/predictor/predictor.cc
@@ -9,19 +9,16 @@ DMLC_REGISTRY_ENABLE(::xgboost::PredictorReg);
 }  // namespace dmlc
 namespace xgboost {
 void Predictor::Configure(
-    const std::vector<std::pair<std::string, std::string>>& cfg,
-    const std::vector<std::shared_ptr<DMatrix>>& cache) {
-  for (const std::shared_ptr<DMatrix>& d : cache) {
-    cache_[d.get()].data = d;
-  }
+    const std::vector<std::pair<std::string, std::string>>& cfg) {
 }
-Predictor* Predictor::Create(std::string const& name, GenericParameter const* learner_param) {
+Predictor* Predictor::Create(
+    std::string const& name, GenericParameter const* generic_param,
+    std::shared_ptr<std::unordered_map<DMatrix*, PredictionCacheEntry>> cache) {
   auto* e = ::dmlc::Registry<PredictorReg>::Get()->Find(name);
   if (e == nullptr) {
     LOG(FATAL) << "Unknown predictor type " << name;
   }
-  auto p_predictor = (e->body)();
-  p_predictor->learner_param_ = learner_param;
+  auto p_predictor = (e->body)(generic_param, cache);
   return p_predictor;
 }
 }  // namespace xgboost
diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc
index 13afff9485d3..621b63658331 100644
--- a/tests/cpp/gbm/test_gbtree.cc
+++ b/tests/cpp/gbm/test_gbtree.cc
@@ -29,21 +29,17 @@ TEST(GBTree, SelectTreeMethod) {
   ASSERT_EQ(tparam.updater_seq, "grow_colmaker,prune");
   gbtree.Configure({{"tree_method", "hist"}, {"num_feature", n_feat}});
   ASSERT_EQ(tparam.updater_seq, "grow_quantile_histmaker");
-  ASSERT_EQ(tparam.predictor, "cpu_predictor");
   gbtree.Configure({{"booster", "dart"},
                     {"tree_method", "hist"},
                     {"num_feature", n_feat}});
   ASSERT_EQ(tparam.updater_seq, "grow_quantile_histmaker");
-  ASSERT_EQ(tparam.predictor, "cpu_predictor");
 #ifdef XGBOOST_USE_CUDA
   generic_param.UpdateAllowUnknown(Args{{"gpu_id", "0"}});
   gbtree.Configure({{"tree_method", "gpu_hist"}, {"num_feature", n_feat}});
   ASSERT_EQ(tparam.updater_seq, "grow_gpu_hist");
-  ASSERT_EQ(tparam.predictor, "gpu_predictor");
   gbtree.Configure({{"booster", "dart"},
                     {"tree_method", "gpu_hist"},
                     {"num_feature", n_feat}});
   ASSERT_EQ(tparam.updater_seq, "grow_gpu_hist");
-  ASSERT_EQ(tparam.predictor, "gpu_predictor");
 #endif
 }
diff --git
a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 2f22f2abd00b..38b4eba9db04 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -9,8 +9,9 @@ namespace xgboost {
 TEST(cpu_predictor, Test) {
   auto lparam = CreateEmptyGenericParam(GPUIDX);
+  auto cache = std::make_shared<std::unordered_map<DMatrix*, PredictionCacheEntry>>();
   std::unique_ptr<Predictor> cpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
+      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam, cache));
 
   gbm::GBTreeModel model = CreateTestModel();
 
@@ -62,8 +63,10 @@ TEST(cpu_predictor, ExternalMemoryTest) {
   std::string filename = tmpdir.path + "/big.libsvm";
   std::unique_ptr<DMatrix> dmat = CreateSparsePageDMatrix(12, 64, filename);
   auto lparam = CreateEmptyGenericParam(GPUIDX);
+  auto cache = std::make_shared<std::unordered_map<DMatrix*, PredictionCacheEntry>>();
+
   std::unique_ptr<Predictor> cpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam));
+      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &lparam, cache));
 
   gbm::GBTreeModel model = CreateTestModel();
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 487bad8ef27a..b997ecf23ae4 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -36,14 +36,15 @@ namespace predictor {
 TEST(gpu_predictor, Test) {
   auto cpu_lparam = CreateEmptyGenericParam(-1);
   auto gpu_lparam = CreateEmptyGenericParam(0);
+  auto cache = std::make_shared<std::unordered_map<DMatrix*, PredictionCacheEntry>>();
 
   std::unique_ptr<Predictor> gpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam));
+      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &gpu_lparam, cache));
   std::unique_ptr<Predictor> cpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &cpu_lparam));
+      std::unique_ptr<Predictor>(Predictor::Create("cpu_predictor", &cpu_lparam, cache));
 
-  gpu_predictor->Configure({}, {});
-  cpu_predictor->Configure({}, {});
+  gpu_predictor->Configure({});
+  cpu_predictor->Configure({});
 
   for (size_t i = 1; i < 33; i *= 2) {
     int n_row = i, n_col = i;
@@ -71,9 +72,10 @@ TEST(gpu_predictor, Test) {
 
 TEST(gpu_predictor, ExternalMemoryTest) {
   auto lparam = CreateEmptyGenericParam(0);
+  auto cache = std::make_shared<std::unordered_map<DMatrix*, PredictionCacheEntry>>();
   std::unique_ptr<Predictor> gpu_predictor =
-      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &lparam));
-  gpu_predictor->Configure({}, {});
+      std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &lparam, cache));
+  gpu_predictor->Configure({});
   gbm::GBTreeModel model = CreateTestModel();
   model.param.num_feature = 3;
   const int n_classes = 3;
diff --git a/tests/python-gpu/test_gpu_training_continuation.py b/tests/python-gpu/test_gpu_training_continuation.py
new file mode 100644
index 000000000000..3d4b053dff63
--- /dev/null
+++ b/tests/python-gpu/test_gpu_training_continuation.py
@@ -0,0 +1,48 @@
+import unittest
+import numpy as np
+import xgboost as xgb
+import json
+
+rng = np.random.RandomState(1994)
+
+
+class TestGPUTrainingContinuation(unittest.TestCase):
+    def test_training_continuation_binary(self):
+        kRows = 32
+        kCols = 16
+        X = np.random.randn(kRows, kCols)
+        y = np.random.randn(kRows)
+        dtrain = xgb.DMatrix(X, y)
+        params = {'tree_method': 'gpu_hist', 'max_depth': '2'}
+        bst_0 = xgb.train(params, dtrain, num_boost_round=4)
+        dump_0 = bst_0.get_dump(dump_format='json')
+
+        bst_1 = xgb.train(params, dtrain, num_boost_round=2)
+        bst_1 = xgb.train(params, dtrain, num_boost_round=2, xgb_model=bst_1)
+        dump_1 = bst_1.get_dump(dump_format='json')
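+
+        # Training four rounds in one call and two rounds followed by two more
+        # continued rounds must yield identical models; compare the JSON tree
+        # dumps recursively, using np.isclose for floating point values.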
+        def recursive_compare(obj_0, obj_1):
+            if isinstance(obj_0, float):
+                assert np.isclose(obj_0, obj_1)
+            elif isinstance(obj_0, str):
+                assert obj_0 == obj_1
+            elif isinstance(obj_0, int):
+                assert obj_0 == obj_1
+            elif isinstance(obj_0, dict):
+                keys_0 = list(obj_0.keys())
+                keys_1 = list(obj_1.keys())
+                values_0 = list(obj_0.values())
+                values_1 = list(obj_1.values())
+                for i in range(len(obj_0.items())):
+                    assert keys_0[i] == keys_1[i]
+                    if list(obj_0.keys())[i] != 'missing':
+                        recursive_compare(values_0[i], values_1[i])
+            else:
+                for i in range(len(obj_0)):
+                    recursive_compare(obj_0[i], obj_1[i])
+
+        for i in range(len(dump_0)):
+            obj_0 = json.loads(dump_0[i])
+            obj_1 = json.loads(dump_1[i])
+            recursive_compare(obj_0, obj_1)
diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py
index 8bdeb8299e3d..f0ae5e292a1a 100644
--- a/tests/python-gpu/test_gpu_with_dask.py
+++ b/tests/python-gpu/test_gpu_with_dask.py
@@ -45,9 +45,8 @@ def test_dask_dataframe(self):
             assert isinstance(out['booster'], dxgb.Booster)
             assert len(out['history']['X']['rmse']) == 2
 
-            # FIXME(trivialfis): Re-enable this after #5003 is fixed
-            # predictions = dxgb.predict(client, out, dtrain).compute()
-            # assert isinstance(predictions, np.ndarray)
+            predictions = dxgb.predict(client, out, dtrain).compute()
+            assert isinstance(predictions, np.ndarray)
 
     @pytest.mark.skipif(**tm.no_dask())
     @pytest.mark.skipif(**tm.no_dask_cuda())