diff --git a/.gitignore b/.gitignore
index f2a13f361357..5050208c2124 100644
--- a/.gitignore
+++ b/.gitignore
@@ -74,6 +74,7 @@ tags
 *.class
 target
 *.swp
+.gdb_history
 
 # cpp tests and gcov generated files
 *.gcov
diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc
index d3e0c23c616c..d987161acf24 100644
--- a/amalgamation/xgboost-all0.cc
+++ b/amalgamation/xgboost-all0.cc
@@ -25,7 +25,9 @@
 // gbms
 #include "../src/gbm/gbm.cc"
 #include "../src/gbm/gbtree.cc"
+#include "../src/gbm/gbtree_model.cc"
 #include "../src/gbm/gblinear.cc"
+#include "../src/gbm/gblinear_model.cc"
 
 // data
 #include "../src/data/data.cc"
@@ -44,8 +46,8 @@ #endif
 
 // trees
-#include "../src/tree/split_evaluator.cc"
 #include "../src/tree/param.cc"
+#include "../src/tree/split_evaluator.cc"
 #include "../src/tree/tree_model.cc"
 #include "../src/tree/tree_updater.cc"
 #include "../src/tree/updater_colmaker.cc"
diff --git a/doc/tutorials/index.rst b/doc/tutorials/index.rst
index 65481fcbc322..0334385a605c 100644
--- a/doc/tutorials/index.rst
+++ b/doc/tutorials/index.rst
@@ -10,6 +10,7 @@ See `Awesome XGBoost `_ for mo
   :caption: Contents:
 
   model
+  saving_model
   Distributed XGBoost with AWS YARN
   kubernetes
   Distributed XGBoost with XGBoost4J-Spark
diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst
new file mode 100644
index 000000000000..62bdd75fcfac
--- /dev/null
+++ b/doc/tutorials/saving_model.rst
@@ -0,0 +1,197 @@
+########################
+Introduction to Model IO
+########################
+
+In XGBoost 1.0.0, we introduced experimental support for using `JSON
+<https://www.json.org>`_ to save and load XGBoost models and the related
+hyper-parameters for training, aiming to replace the old binary internal format with an
+open format that can be easily reused.  Support for the binary format will continue
+until the JSON format is no longer experimental and performs satisfactorily.  This
+tutorial aims to share some basic insights into the JSON serialisation method used in
+XGBoost.  Unless explicitly stated otherwise, the following sections assume you are
+using the experimental JSON format, which can be enabled by passing
+``enable_experimental_json_serialization=True`` as a training parameter, or by providing
+a file name with ``.json`` as the extension when saving/loading models:
+``booster.save_model('model.json')``.  More details below.
+
+Before we get started, note that XGBoost is a gradient boosting library with a focus on
+tree models, which means there are two distinct parts inside XGBoost: the model itself
+and the algorithms used to build it.  If you come from the deep learning community, this
+is analogous to the difference between a neural network structure, composed of weights
+and fixed tensor operations, and the optimizer used to train it.
+
+So when one calls ``booster.save_model``, XGBoost saves the trees, some model parameters
+like the number of input columns in the trained trees, and the objective function, which
+together represent the concept of a "model" in XGBoost.  The objective function is saved
+as part of the model because it controls the transformation of the global bias (called
+``base_score`` in XGBoost).  Users can share this model with others for prediction,
+evaluation, or continued training with a different set of hyper-parameters, etc.
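+
+As an illustration, the objective travels with the saved model, so a freshly loaded
+booster keeps producing transformed predictions without any extra setup.  A minimal
+sketch (``dtrain`` is assumed to be an existing ``DMatrix`` with binary labels):
+
+.. code-block:: python
+
+  bst = xgboost.train({'objective': 'binary:logistic'}, dtrain)
+  bst.save_model('model.json')
+
+  loaded = xgboost.Booster()
+  loaded.load_model('model.json')
+  # Outputs are probabilities: the logistic transformation was restored
+  # together with the trees.
+  preds = loaded.predict(dtrain)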
+
+However, this is not the end of the story.  There are cases where we need to save
+something more than the model itself.  For example, in distributed training, XGBoost
+performs a checkpointing operation.  Or, for whatever reason, your favorite distributed
+computing framework may decide to copy the model from one worker to another and continue
+training there.  In such cases, the serialisation output is required to contain enough
+information to continue the previous training without the user providing any parameters
+again.  We consider this scenario a memory snapshot (or memory-based serialisation
+method) and distinguish it from the normal model IO operation.  In Python, it can be
+invoked by pickling the ``Booster`` object, while in R the same can be achieved by
+accessing ``bst$raw``.  Please refer to the corresponding language binding documentation
+for the precise API (as this feature is quite new, please open an issue if you can't
+find the appropriate documents, or better, a PR).
+
+.. note::
+
+  The old binary format doesn't distinguish between the model and the raw memory
+  serialisation format; it's a mix of everything, which is part of the reason why we
+  want to replace it with a more robust serialisation method.  The JVM package has its
+  own memory-based serialisation methods, which may lead to some inconsistency in the
+  output model.  It's a known issue we are trying to address.
+
+To enable JSON format support for model IO (saving only the trees and objective),
+provide a filename with ``.json`` as the file extension:
+
+.. code-block:: python
+
+  bst.save_model('model_file_name.json')
+
+To enable JSON as the memory-based serialisation format, pass
+``enable_experimental_json_serialization`` as a training parameter.  In Python this can
+be done by:
+
+.. code-block:: python
+
+  bst = xgboost.train({'enable_experimental_json_serialization': True}, dtrain)
+  with open('filename', 'wb') as fd:
+      pickle.dump(bst, fd)
+
+Notice that ``filename`` is passed to the Python built-in function ``open``, not to
+XGBoost; hence the parameter ``enable_experimental_json_serialization`` is required to
+enable the JSON format.  As the name suggests, memory-based serialisation captures many
+things internal to XGBoost, so it is only suitable for checkpoints, which don't require
+a stable output format.  That being said, loading a pickled booster (memory snapshot) in
+a different XGBoost version may lead to errors or undefined behaviors.  We do, however,
+promise a stable output format for the binary model and the JSON model (once it is no
+longer experimental), as they are designed to be reusable.  This scheme is consistent
+with Python itself, which doesn't guarantee that pickled bytecode can be used across
+different Python versions.
+
+***************************
+Custom objective and metric
+***************************
+
+XGBoost accepts user-provided objective and metric functions as an extension.  These
+functions are not saved in the model file, as they are a language-dependent feature.
+In Python, users can pickle the model to include these functions in the saved binary.
+One drawback is that the output of pickle is not a stable serialisation format and
+doesn't work across different Python or XGBoost versions, not to mention different
+language environments.  Another way to work around this limitation is to provide these
+functions again after the model is loaded.  If the customised function is useful,
+please consider making a PR to implement it inside XGBoost; that way we can have your
+functions working with different language bindings.
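+
+A minimal sketch of that workaround, using the well-known squared log error objective as
+the custom function (``dtrain`` is again assumed to be an existing ``DMatrix``; any
+custom objective follows the same pattern):
+
+.. code-block:: python
+
+  import numpy as np
+
+  def squared_log(predt, dtrain):
+      '''Gradient and hessian of squared log error.'''
+      y = dtrain.get_label()
+      predt[predt < -1] = -1 + 1e-6
+      grad = (np.log1p(predt) - np.log1p(y)) / (predt + 1)
+      hess = ((-np.log1p(predt) + np.log1p(y) + 1) / np.power(predt + 1, 2))
+      return grad, hess
+
+  bst = xgboost.train({'tree_method': 'hist'}, dtrain, num_boost_round=10,
+                      obj=squared_log)
+  bst.save_model('model.json')  # the custom objective is NOT saved
+
+  bst = xgboost.Booster()
+  bst.load_model('model.json')
+  # Provide the same function again to continue training.
+  bst = xgboost.train({'tree_method': 'hist'}, dtrain, num_boost_round=10,
+                      obj=squared_log, xgb_model=bst)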
+
+********************************************************
+Saving and Loading the internal parameters configuration
+********************************************************
+
+XGBoost's ``C API`` and ``Python API`` support saving and loading the internal
+configuration directly as a JSON string.  In the Python package:
+
+.. code-block:: python
+
+  bst = xgboost.train(...)
+  config = bst.save_config()
+  print(config)
+
+This will print out something similar to the following (not the actual output, as it's
+too long for demonstration):
+
+.. code-block:: json
+
+  {
+    "Learner": {
+      "generic_parameter": {
+        "enable_experimental_json_serialization": "0",
+        "gpu_id": "0",
+        "gpu_page_size": "0",
+        "n_jobs": "0",
+        "random_state": "0",
+        "seed": "0",
+        "seed_per_iteration": "0"
+      },
+      "gradient_booster": {
+        "gbtree_train_param": {
+          "num_parallel_tree": "1",
+          "predictor": "gpu_predictor",
+          "process_type": "default",
+          "tree_method": "gpu_hist",
+          "updater": "grow_gpu_hist",
+          "updater_seq": "grow_gpu_hist"
+        },
+        "name": "gbtree",
+        "updater": {
+          "grow_gpu_hist": {
+            "gpu_hist_train_param": {
+              "debug_synchronize": "0",
+              "gpu_batch_nrows": "0",
+              "single_precision_histogram": "0"
+            },
+            "train_param": {
+              "alpha": "0",
+              "cache_opt": "1",
+              "colsample_bylevel": "1",
+              "colsample_bynode": "1",
+              "colsample_bytree": "1",
+              "default_direction": "learn",
+              "enable_feature_grouping": "0",
+              "eta": "0.300000012",
+              "gamma": "0",
+              "grow_policy": "depthwise",
+              "interaction_constraints": "",
+              "lambda": "1",
+              "learning_rate": "0.300000012",
+              "max_bin": "256",
+              "max_conflict_rate": "0",
+              "max_delta_step": "0",
+              "max_depth": "6",
+              "max_leaves": "0",
+              "max_search_group": "100",
+              "refresh_leaf": "1",
+              "sketch_eps": "0.0299999993",
+              "sketch_ratio": "2",
+              "subsample": "1"
+            }
+          }
+        }
+      },
+      "learner_train_param": {
+        "booster": "gbtree",
+        "disable_default_eval_metric": "0",
+        "dsplit": "auto",
+        "objective": "reg:squarederror"
+      },
+      "metrics": [],
+      "objective": {
+        "name": "reg:squarederror",
+        "reg_loss_param": {
+          "scale_pos_weight": "1"
+        }
+      }
+    },
+    "version": [1, 0, 0]
+  }
+
+You can load it back into a model generated by the same version of XGBoost with:
+
+.. code-block:: python
+
+  bst.load_config(config)
+
+This way users can study the internal representation more closely.
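+
+Since the configuration is plain JSON, it can also be parsed with Python's own ``json``
+module.  A small sketch (the key names follow the sample output above):
+
+.. code-block:: python
+
+  import json
+
+  config = json.loads(bst.save_config())
+  print(config['Learner']['learner_train_param']['objective'])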
+
+************
+Future Plans
+************
+
+Right now, using the JSON format incurs a longer serialisation time; we have been
+working on optimizing the JSON implementation to close the gap between the binary and
+JSON formats.  You can track the progress in `#5046
+<https://github.com/dmlc/xgboost/pull/5046>`_.  Another important item for JSON format
+support is a stable and documented `schema <https://json-schema.org>`_, based on which
+one can easily reuse the saved model.
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 2a9d492ac034..4962970ad084 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -428,7 +428,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
                              const float **out_result);
 
 /*!
- * \brief load model from existing file
+ * \brief Load model from existing file
  * \param handle handle
  * \param fname file name
  * \return 0 when success, -1 when failure happens
@@ -436,7 +436,7 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
 XGB_DLL int XGBoosterLoadModel(BoosterHandle handle,
                                const char *fname);
 /*!
- * \brief save model into existing file
+ * \brief Save model into existing file
  * \param handle handle
  * \param fname file name
  * \return 0 when success, -1 when failure happens
@@ -464,6 +464,45 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
 XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle,
                                  bst_ulong *out_len,
                                  const char **out_dptr);
+
+/*!
+ * \brief Initialize the booster from rabit checkpoint.
+ *        This is used in distributed training API.
+ * \param handle handle
+ * \param version The output version of the model.
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle,
+                                         int* version);
+
+/*!
+ * \brief Save the current checkpoint to rabit.
+ * \param handle handle
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle);
+
+
+/*!
+ * \brief Save XGBoost's internal configuration into a JSON document.
+ * \param handle handle to Booster object.
+ * \param out_len Length of the output string.
+ * \param out_str A valid pointer to an array of characters.  The characters array is
+ *                allocated and managed by XGBoost, while the pointer to that array
+ *                needs to be managed by the caller.
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGBoosterSaveJsonParameters(BoosterHandle handle,
+                                        bst_ulong *out_len,
+                                        char const** out_str);
+/*!
+ * \brief Load XGBoost's internal configuration from a JSON document.
+ * \param handle handle to Booster object.
+ * \param json_parameters string representation of a JSON document.
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGBoosterLoadJsonParameters(BoosterHandle handle,
+                                        char const* json_parameters);
+
 /*!
  * \brief dump model, return array of strings representing model dump
  * \param handle handle
@@ -570,25 +609,4 @@ XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
 XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
                                   bst_ulong* out_len,
                                   const char*** out);
-
-// --- Distributed training API----
-// NOTE: functions in rabit/c_api.h will be also available in libxgboost.so
-/*!
- * \brief Initialize the booster from rabit checkpoint.
- *        This is used in distributed training API.
- * \param handle handle
- * \param version The output version of the model.
- * \return 0 when success, -1 when failure happens
- */
-XGB_DLL int XGBoosterLoadRabitCheckpoint(
-    BoosterHandle handle,
-    int* version);
-
-/*!
- * \brief Save the current checkpoint to rabit.
- * \param handle handle
- * \return 0 when success, -1 when failure happens
- */
-XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle);
-
 #endif  // XGBOOST_C_API_H_
diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
index 7eff453df668..fde8d2e0dd1a 100644
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -32,7 +32,7 @@ struct LearnerModelParam;
 
 /*!
  * \brief interface of gradient boosting model.
*/ -class GradientBooster { +class GradientBooster : public Model, public Configurable { protected: GenericParameter const* generic_param_; diff --git a/include/xgboost/json_io.h b/include/xgboost/json_io.h index deaacf8c6dbd..fb70b8951fbf 100644 --- a/include/xgboost/json_io.h +++ b/include/xgboost/json_io.h @@ -55,6 +55,12 @@ class JsonReader { } cursor_; StringView raw_str_; + bool initialized_; + + public: + size_t Pos() const { return cursor_.Pos(); } + size_t Length() const { return raw_str_.size(); } + bool Initialized() const { return initialized_; } protected: void SkipSpaces(); @@ -109,8 +115,9 @@ class JsonReader { public: explicit JsonReader(StringView str) : - raw_str_{str} {} + raw_str_{str}, initialized_{true} {} + JsonReader() : initialized_{false} {}; virtual ~JsonReader() = default; Json Load(); diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 8b953af357b7..243693171ca6 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -45,7 +45,7 @@ class Json; * * \endcode */ -class Learner : public Model, public rabit::Serializable { +class Learner : public Model, public Configurable, public rabit::Serializable { public: /*! \brief virtual destructor */ ~Learner() override; @@ -53,16 +53,6 @@ class Learner : public Model, public rabit::Serializable { * \brief Configure Learner based on set parameters. */ virtual void Configure() = 0; - /*! - * \brief load model from stream - * \param fi input stream. - */ - void Load(dmlc::Stream* fi) override = 0; - /*! - * \brief save model to stream. - * \param fo output stream - */ - void Save(dmlc::Stream* fo) const override = 0; /*! * \brief update the model for one iteration * With the specified objective function. diff --git a/include/xgboost/model.h b/include/xgboost/model.h index 8d45c69d5ef4..b1f024973b11 100644 --- a/include/xgboost/model.h +++ b/include/xgboost/model.h @@ -16,15 +16,15 @@ class Json; struct Model { /*! - * \brief Save the model to stream. - * \param fo output write stream + * \brief load the model from a json object + * \param in json object where to load the model from */ - virtual void SaveModel(dmlc::Stream* fo) const = 0; + virtual void LoadModel(Json const& in) = 0; /*! - * \brief Load the model from stream. 
- * \param fi input read stream + * \brief saves the model config to a json object + * \param out json container where to save the model to */ - virtual void LoadModel(dmlc::Stream* fi) = 0; + virtual void SaveModel(Json* out) const = 0; }; struct Configurable { diff --git a/include/xgboost/parameter.h b/include/xgboost/parameter.h index f9130b1fa4e9..8484af80923c 100644 --- a/include/xgboost/parameter.h +++ b/include/xgboost/parameter.h @@ -80,6 +80,23 @@ class FieldEntry : public FieldEntry { \ } /* namespace dmlc */ namespace xgboost { + +template +Args UpdateAllowUnknownImpl(P* parameter, Base* base, + Container const& kwargs, bool* out_changed = nullptr) { + static_assert(std::is_base_of::value, ""); + if (parameter->GetInitialised()) { + return base->UpdateAllowUnknown(kwargs, out_changed); + } else { + auto unknown = base->InitAllowUnknown(kwargs); + if (out_changed) { + *out_changed = true; + } + parameter->SetInitialised(); + return unknown; + } +} + template struct XGBoostParameter : public dmlc::Parameter { protected: @@ -88,17 +105,12 @@ struct XGBoostParameter : public dmlc::Parameter { public: template Args UpdateAllowUnknown(Container const& kwargs, bool* out_changed = nullptr) { - if (initialised_) { - return dmlc::Parameter::UpdateAllowUnknown(kwargs, out_changed); - } else { - auto unknown = dmlc::Parameter::InitAllowUnknown(kwargs); - if (out_changed) { - *out_changed = true; - } - initialised_ = true; - return unknown; - } + return UpdateAllowUnknownImpl(this, dynamic_cast*>(this), + kwargs, out_changed); } + + void SetInitialised() { this->initialised_ = true; } + bool GetInitialised() const { return static_cast(this->initialised_); } }; } // namespace xgboost diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 965e8c0d35df..fa65ebb64c5b 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -303,12 +303,15 @@ class RegTree : public Model { * \brief load model from stream * \param fi input stream */ - void LoadModel(dmlc::Stream* fi) override; + void Load(dmlc::Stream* fi); /*! 
* \brief save model to stream * \param fo output stream */ - void SaveModel(dmlc::Stream* fo) const override; + void Save(dmlc::Stream* fo) const; + + void LoadModel(Json const& in) override; + void SaveModel(Json* out) const override; bool operator==(const RegTree& b) const { return nodes_ == b.nodes_ && stats_ == b.stats_ && diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 444d36ece1cf..28445a4ff476 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1098,6 +1098,7 @@ def __getstate__(self): if handle is not None: raw = self.save_raw() this["handle"] = raw + this['config'] = self.save_config() return this def __setstate__(self, state): @@ -1107,12 +1108,35 @@ def __setstate__(self, state): buf = handle dmats = c_array(ctypes.c_void_p, []) handle = ctypes.c_void_p() - _check_call(_LIB.XGBoosterCreate(dmats, c_bst_ulong(0), ctypes.byref(handle))) + _check_call(_LIB.XGBoosterCreate( + dmats, c_bst_ulong(0), ctypes.byref(handle))) length = c_bst_ulong(len(buf)) ptr = (ctypes.c_char * len(buf)).from_buffer(buf) _check_call(_LIB.XGBoosterLoadModelFromBuffer(handle, ptr, length)) state['handle'] = handle + config = state['config'] + del state['config'] self.__dict__.update(state) + self.load_config(config) + + def save_config(self): + '''Output internal parameter configuration of Booster as a JSON + string.''' + json_string = ctypes.c_char_p() + length = c_bst_ulong() + _check_call(_LIB.XGBoosterSaveJsonParameters( + self.handle, + ctypes.byref(length), + ctypes.byref(json_string))) + json_string = json_string.value.decode() + return json_string + + def load_config(self, config): + '''Load configuration returned by `save_config`.''' + assert isinstance(config, str) + _check_call(_LIB.XGBoosterLoadJsonParameters( + self.handle, + c_str(config))) def __copy__(self): return self.__deepcopy__(None) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 66280c6b837d..fd3eb633ca2f 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -5,23 +5,25 @@ #include #include +#include #include #include #include #include + #include "xgboost/data.h" #include "xgboost/learner.h" #include "xgboost/c_api.h" #include "xgboost/logging.h" #include "xgboost/version_config.h" +#include "xgboost/json.h" #include "c_api_error.h" #include "../data/simple_csr_source.h" #include "../common/io.h" #include "../data/adapter.h" - namespace xgboost { // declare the data callback. 
XGB_EXTERN_C int XGBoostNativeDataIterSetData( @@ -454,8 +456,8 @@ XGB_DLL int XGDMatrixNumCol(const DMatrixHandle handle, // xgboost implementation XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[], - xgboost::bst_ulong len, - BoosterHandle *out) { + xgboost::bst_ulong len, + BoosterHandle *out) { API_BEGIN(); std::vector > mats; for (xgboost::bst_ulong i = 0; i < len; ++i) { @@ -481,6 +483,32 @@ XGB_DLL int XGBoosterSetParam(BoosterHandle handle, API_END(); } +XGB_DLL int XGBoosterLoadJsonParameters(BoosterHandle handle, + char const* json_parameters) { + API_BEGIN(); + CHECK_HANDLE(); + std::string str {json_parameters}; + Json config { Json::Load(StringView{str.c_str(), str.size()}) }; + static_cast(handle)->LoadConfig(config); + API_END(); +} + +XGB_DLL int XGBoosterSaveJsonParameters(BoosterHandle handle, + xgboost::bst_ulong *out_len, + char const** out_str) { + API_BEGIN(); + CHECK_HANDLE(); + Json config { Object() }; + auto* learner = static_cast(handle); + learner->Configure(); + learner->SaveConfig(&config); + std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str; + Json::Dump(config, &raw_str); + *out_str = raw_str.c_str(); + *out_len = static_cast(raw_str.length()); + API_END(); +} + XGB_DLL int XGBoosterUpdateOneIter(BoosterHandle handle, int iter, DMatrixHandle dtrain) { @@ -567,23 +595,43 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) { API_BEGIN(); CHECK_HANDLE(); - std::unique_ptr fi(dmlc::Stream::Create(fname, "r")); - static_cast(handle)->Load(fi.get()); + if (common::FileExtension(fname) == "json") { + auto str = common::LoadSequentialFile(fname); + CHECK_GT(str.size(), 2); + CHECK_EQ(str[0], '{'); + Json in { Json::Load({str.c_str(), str.size()}) }; + static_cast(handle)->LoadModel(in); + } else { + std::unique_ptr fi(dmlc::Stream::Create(fname, "r")); + static_cast(handle)->Load(fi.get()); + } API_END(); } -XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char* fname) { +XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char* c_fname) { API_BEGIN(); CHECK_HANDLE(); - std::unique_ptr fo(dmlc::Stream::Create(fname, "w")); - auto *bst = static_cast(handle); - bst->Save(fo.get()); + std::unique_ptr fo(dmlc::Stream::Create(c_fname, "w")); + auto *learner = static_cast(handle); + learner->Configure(); + if (common::FileExtension(c_fname) == "json") { + Json out { Object() }; + learner->SaveModel(&out); + std::string str; + Json::Dump(out, &str); + fo->Write(str.c_str(), str.size()); + } else { + auto *bst = static_cast(handle); + bst->Save(fo.get()); + } API_END(); } +// The following two functions are `Load` and `Save` for memory based serialization +// methods. E.g. Python pickle. 
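+// In the Python package, pickling funnels through these two functions:
+// `Booster.__getstate__` stores the raw buffer returned by XGBoosterGetModelRaw via
+// `save_raw`, and `__setstate__` restores the handle with
+// XGBoosterLoadModelFromBuffer (see the core.py changes in this patch).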
XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, - const void* buf, - xgboost::bst_ulong len) { + const void* buf, + xgboost::bst_ulong len) { API_BEGIN(); CHECK_HANDLE(); common::MemoryFixSizeBuffer fs((void*)buf, len); // NOLINT(*) @@ -592,21 +640,47 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, } XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, - xgboost::bst_ulong* out_len, - const char** out_dptr) { + xgboost::bst_ulong* out_len, + const char** out_dptr) { std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str; raw_str.resize(0); API_BEGIN(); CHECK_HANDLE(); common::MemoryBufferStream fo(&raw_str); - auto *bst = static_cast(handle); - bst->Save(&fo); + auto *learner = static_cast(handle); + learner->Configure(); + learner->Save(&fo); *out_dptr = dmlc::BeginPtr(raw_str); *out_len = static_cast(raw_str.length()); API_END(); } +XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle, + int* version) { + API_BEGIN(); + CHECK_HANDLE(); + auto* bst = static_cast(handle); + *version = rabit::LoadCheckPoint(bst); + if (*version != 0) { + bst->Configure(); + } + API_END(); +} + +XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) { + API_BEGIN(); + CHECK_HANDLE(); + auto* learner = static_cast(handle); + learner->Configure(); + if (learner->AllowLazyCheckPoint()) { + rabit::LazyCheckPoint(learner); + } else { + rabit::CheckPoint(learner); + } + API_END(); +} + inline void XGBoostDumpModelImpl( BoosterHandle handle, const FeatureMap& fmap, @@ -617,6 +691,7 @@ inline void XGBoostDumpModelImpl( std::vector& str_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_str; std::vector& charp_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_charp; auto *bst = static_cast(handle); + bst->Configure(); str_vecs = bst->DumpModel(fmap, with_stats != 0, format); charp_vecs.resize(str_vecs.size()); for (size_t i = 0; i < str_vecs.size(); ++i) { @@ -732,29 +807,5 @@ XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle, API_END(); } -XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle, - int* version) { - API_BEGIN(); - CHECK_HANDLE(); - auto* bst = static_cast(handle); - *version = rabit::LoadCheckPoint(bst); - if (*version != 0) { - bst->Configure(); - } - API_END(); -} - -XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) { - API_BEGIN(); - CHECK_HANDLE(); - auto* bst = static_cast(handle); - if (bst->AllowLazyCheckPoint()) { - rabit::LazyCheckPoint(bst); - } else { - rabit::CheckPoint(bst); - } - API_END(); -} - // force link rabit static DMLC_ATTRIBUTE_UNUSED int XGBOOST_LINK_RABIT_C_API_ = RabitLinkTag(); diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index f1f94f762fbf..714a342bd849 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -44,11 +44,15 @@ void HistogramCuts::Build(DMatrix* dmat, uint32_t const max_num_bins) { float constexpr kSparsityThreshold = 0.0005; // FIXME(trivialfis): Distributed environment is not supported. 
if (sparsity < kSparsityThreshold && (!rabit::IsDistributed())) { - LOG(INFO) << "Building quantile cut on a sparse dataset."; SparseCuts cuts(this); cuts.Build(dmat, max_num_bins); } else { - LOG(INFO) << "Building quantile cut on a dense dataset or distributed environment."; + if (rabit::IsDistributed() && sparsity < kSparsityThreshold) { + LOG(WARNING) << "Building quantile cuts with a sparse dataset on distributed " + << "environment, which may incur higher memory usage and longer " + << "build time."; + } + DenseCuts cuts(this); cuts.Build(dmat, max_num_bins); } diff --git a/src/common/io.cc b/src/common/io.cc index de0541030d36..7a19acd9b528 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -115,7 +115,7 @@ std::string LoadSequentialFile(std::string fname) { } size_t f_size_bytes = fs.st_size; - buffer.resize(f_size_bytes+1); + buffer.resize(f_size_bytes + 1); int32_t fd = open(fname.c_str(), O_RDONLY); posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL); ssize_t bytes_read = read(fd, &buffer[0], f_size_bytes); diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index ae9dddbac089..46a1706e4227 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -85,6 +85,30 @@ class GBLinear : public GradientBooster { model_.Save(fo); } + void SaveModel(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String{"gblinear"}; + + out["model"] = Object(); + auto& model = out["model"]; + model_.SaveModel(&model); + } + void LoadModel(Json const& in) override { + CHECK_EQ(get(in["name"]), "gblinear"); + auto const& model = in["model"]; + model_.LoadModel(model); + } + + void LoadConfig(Json const& in) override { + CHECK_EQ(get(in["name"]), "gblinear"); + fromJson(in["gblinear_train_param"], ¶m_); + } + void SaveConfig(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String{"gblinear"}; + out["gblinear_train_param"] = toJson(param_); + } + void DoBoost(DMatrix *p_fmat, HostDeviceVector *in_gpair, ObjFunction* obj) override { diff --git a/src/gbm/gblinear_model.cc b/src/gbm/gblinear_model.cc new file mode 100644 index 000000000000..a1c63b6f53ad --- /dev/null +++ b/src/gbm/gblinear_model.cc @@ -0,0 +1,38 @@ +/*! 
+ * Copyright 2019 by Contributors + */ +#include +#include +#include "xgboost/json.h" +#include "gblinear_model.h" + +namespace xgboost { +namespace gbm { + +void GBLinearModel::SaveModel(Json* p_out) const { + using WeightType = std::remove_reference().back())>::type; + using JsonFloat = Number::Float; + static_assert(std::is_same::value, + "Weight type should be of the same type with JSON float"); + auto& out = *p_out; + + size_t const n_weights = weight.size(); + std::vector j_weights(n_weights); + for (size_t i = 0; i < n_weights; ++i) { + j_weights[i] = weight[i]; + } + out["weights"] = std::move(j_weights); +} + +void GBLinearModel::LoadModel(Json const& in) { + auto const& j_weights = get(in["weights"]); + auto n_weights = j_weights.size(); + weight.resize(n_weights); + for (size_t i = 0; i < n_weights; ++i) { + weight[i] = get(j_weights[i]); + } +} + +DMLC_REGISTER_PARAMETER(DeprecatedGBLinearModelParam); +} // namespace gbm +} // namespace xgboost diff --git a/src/gbm/gblinear_model.h b/src/gbm/gblinear_model.h index e91fecde60f9..71b8bcd061c2 100644 --- a/src/gbm/gblinear_model.h +++ b/src/gbm/gblinear_model.h @@ -62,27 +62,21 @@ class GBLinearModel : public Model { learner_model_param_->num_output_group); std::fill(weight.begin(), weight.end(), 0.0f); } + + void SaveModel(Json *p_out) const override; + void LoadModel(Json const &in) override; + // save the model to file - inline void Save(dmlc::Stream* fo) const { + void Save(dmlc::Stream *fo) const { fo->Write(¶m, sizeof(param)); fo->Write(weight); } // load model from file - inline void Load(dmlc::Stream* fi) { + void Load(dmlc::Stream *fi) { CHECK_EQ(fi->Read(¶m, sizeof(param)), sizeof(param)); fi->Read(&weight); } - void LoadModel(dmlc::Stream* fi) override { - // They are the same right now until we can split up the saved parameter from model. - this->Load(fi); - } - - void SaveModel(dmlc::Stream* fo) const override { - // They are the same right now until we can split up the saved parameter from model. - this->Save(fo); - } - // model bias inline bst_float *bias() { return &weight[learner_model_param_->num_feature * @@ -118,7 +112,8 @@ class GBLinearModel : public Model { << " \"weight\": [" << std::endl; for (unsigned i = 0; i < nfeature; ++i) { for (int gid = 0; gid < ngroup; ++gid) { - if (i != 0 || gid != 0) fo << "," << std::endl; + if (i != 0 || gid != 0) + fo << "," << std::endl; fo << " " << (*this)[i][gid]; } } @@ -140,5 +135,6 @@ class GBLinearModel : public Model { return v; } }; + } // namespace gbm } // namespace xgboost diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 96635e7449a1..53d583065721 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -289,8 +289,57 @@ void GBTree::CommitModel(std::vector>>&& ne monitor_.Stop("CommitModel"); } +void GBTree::LoadConfig(Json const& in) { + CHECK_EQ(get(in["name"]), "gbtree"); + fromJson(in["gbtree_train_param"], &tparam_); + int32_t const n_gpus = xgboost::common::AllVisibleGPUs(); + if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) { + tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}}); + } + if (n_gpus == 0 && tparam_.tree_method == TreeMethod::kGPUHist) { + tparam_.tree_method = TreeMethod::kHist; + tparam_.UpdateAllowUnknown(Args{{"tree_method", "hist"}}); + LOG(WARNING) + << "Loading from a raw memory buffer on CPU only machine. 
" + "Change tree_method to hist."; + } + + auto const& j_updaters = get(in["updater"]); + updaters_.clear(); + for (auto const& kv : j_updaters) { + std::unique_ptr up(TreeUpdater::Create(kv.first, generic_param_)); + up->LoadConfig(kv.second); + updaters_.push_back(std::move(up)); + } +} + +void GBTree::SaveConfig(Json* p_out) const { + auto& out = *p_out; + out["name"] = String("gbtree"); + out["gbtree_train_param"] = toJson(tparam_); + out["updater"] = Object(); + + auto& j_updaters = out["updater"]; + for (auto const& up : updaters_) { + j_updaters[up->Name()] = Object(); + auto& j_up = j_updaters[up->Name()]; + up->SaveConfig(&j_up); + } +} + +void GBTree::LoadModel(Json const& in) { + CHECK_EQ(get(in["name"]), "gbtree"); + model_.LoadModel(in["model"]); +} + +void GBTree::SaveModel(Json* p_out) const { + auto& out = *p_out; + out["name"] = String("gbtree"); + out["model"] = Object(); + auto& model = out["model"]; + model_.SaveModel(&model); +} -// dart class Dart : public GBTree { public: explicit Dart(LearnerModelParam const* booster_config) : @@ -303,6 +352,30 @@ class Dart : public GBTree { } } + void SaveModel(Json *p_out) const override { + auto &out = *p_out; + out["name"] = String("dart"); + out["gbtree"] = Object(); + GBTree::SaveModel(&(out["gbtree"])); + + std::vector j_weight_drop(weight_drop_.size()); + for (size_t i = 0; i < weight_drop_.size(); ++i) { + j_weight_drop[i] = Number(weight_drop_[i]); + } + out["weight_drop"] = Array(j_weight_drop); + } + void LoadModel(Json const& in) override { + CHECK_EQ(get(in["name"]), "dart"); + auto const& gbtree = in["gbtree"]; + GBTree::LoadModel(gbtree); + + auto const& j_weight_drop = get(in["weight_drop"]); + weight_drop_.resize(j_weight_drop.size()); + for (size_t i = 0; i < weight_drop_.size(); ++i) { + weight_drop_[i] = get(j_weight_drop[i]); + } + } + void Load(dmlc::Stream* fi) override { GBTree::Load(fi); weight_drop_.resize(model_.param.num_trees); @@ -317,6 +390,21 @@ class Dart : public GBTree { } } + void LoadConfig(Json const& in) override { + CHECK_EQ(get(in["name"]), "dart"); + auto const& gbtree = in["gbtree"]; + GBTree::LoadConfig(gbtree); + fromJson(in["dart_train_param"], &dparam_); + } + void SaveConfig(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("dart"); + out["gbtree"] = Object(); + auto& gbtree = out["gbtree"]; + GBTree::SaveConfig(&gbtree); + out["dart_train_param"] = toJson(dparam_); + } + // predict the leaf scores with dropout if ntree_limit = 0 void PredictBatch(DMatrix* p_fmat, HostDeviceVector* out_preds, @@ -387,7 +475,7 @@ class Dart : public GBTree { if (init_out_preds) { size_t n = num_group * p_fmat->Info().num_row_; const auto& base_margin = - p_fmat->Info().base_margin_.ConstHostVector(); + p_fmat->Info().base_margin_.ConstHostVector(); out_preds->resize(n); if (base_margin.size() != 0) { CHECK_EQ(out_preds->size(), n); diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index 12645648848a..09f1c4f0a103 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -192,6 +192,12 @@ class GBTree : public GradientBooster { model_.Save(fo); } + void LoadConfig(Json const& in) override; + void SaveConfig(Json* p_out) const override; + + void SaveModel(Json* p_out) const override; + void LoadModel(Json const& in) override; + bool AllowLazyCheckPoint() const override { return model_.learner_model_param_->num_output_group == 1 || tparam_.updater_seq.find("distcol") != std::string::npos; diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc new file mode 100644 index 
000000000000..ccbdcd8cf4ee --- /dev/null +++ b/src/gbm/gbtree_model.cc @@ -0,0 +1,85 @@ +/*! + * Copyright 2019 by Contributors + */ +#include "xgboost/json.h" +#include "xgboost/logging.h" +#include "gbtree_model.h" + +namespace xgboost { +namespace gbm { + +void GBTreeModel::Save(dmlc::Stream* fo) const { + CHECK_EQ(param.num_trees, static_cast(trees.size())); + fo->Write(¶m, sizeof(param)); + for (const auto & tree : trees) { + tree->Save(fo); + } + if (tree_info.size() != 0) { + fo->Write(dmlc::BeginPtr(tree_info), sizeof(int32_t) * tree_info.size()); + } +} + +void GBTreeModel::Load(dmlc::Stream* fi) { + CHECK_EQ(fi->Read(¶m, sizeof(param)), sizeof(param)) + << "GBTree: invalid model file"; + trees.clear(); + trees_to_update.clear(); + for (int32_t i = 0; i < param.num_trees; ++i) { + std::unique_ptr ptr(new RegTree()); + ptr->Load(fi); + trees.push_back(std::move(ptr)); + } + tree_info.resize(param.num_trees); + if (param.num_trees != 0) { + CHECK_EQ( + fi->Read(dmlc::BeginPtr(tree_info), sizeof(int32_t) * param.num_trees), + sizeof(int32_t) * param.num_trees); + } +} + +void GBTreeModel::SaveModel(Json* p_out) const { + auto& out = *p_out; + CHECK_EQ(param.num_trees, static_cast(trees.size())); + out["model_param"] = toJson(param); + std::vector trees_json; + size_t t = 0; + for (auto const& tree : trees) { + Json tree_json{Object()}; + tree->SaveModel(&tree_json); + tree_json["id"] = std::to_string(t); + trees_json.emplace_back(tree_json); + t++; + } + + std::vector tree_info_json(tree_info.size()); + for (size_t i = 0; i < tree_info.size(); ++i) { + tree_info_json[i] = Integer(tree_info[i]); + } + + out["trees"] = Array(std::move(trees_json)); + out["tree_info"] = Array(std::move(tree_info_json)); +} + +void GBTreeModel::LoadModel(Json const& in) { + fromJson(in["model_param"], ¶m); + + trees.clear(); + trees_to_update.clear(); + + auto const& trees_json = get(in["trees"]); + trees.resize(trees_json.size()); + + for (size_t t = 0; t < trees.size(); ++t) { + trees[t].reset( new RegTree() ); + trees[t]->LoadModel(trees_json[t]); + } + + tree_info.resize(param.num_trees); + auto const& tree_info_json = get(in["tree_info"]); + for (int32_t i = 0; i < param.num_trees; ++i) { + tree_info[i] = get(tree_info_json[i]); + } +} + +} // namespace gbm +} // namespace xgboost diff --git a/src/gbm/gbtree_model.h b/src/gbm/gbtree_model.h index c231d4adec1d..627f950dd97d 100644 --- a/src/gbm/gbtree_model.h +++ b/src/gbm/gbtree_model.h @@ -84,43 +84,11 @@ struct GBTreeModel : public Model { } } - void LoadModel(dmlc::Stream* fi) override { - // They are the same right now until we can split up the saved parameter from model. - this->Load(fi); - } - void SaveModel(dmlc::Stream* fo) const override { - // They are the same right now until we can split up the saved parameter from model. 
- this->Save(fo); - } - - void Load(dmlc::Stream* fi) { - CHECK_EQ(fi->Read(¶m, sizeof(param)), sizeof(param)) - << "GBTree: invalid model file"; - trees.clear(); - trees_to_update.clear(); - for (int i = 0; i < param.num_trees; ++i) { - std::unique_ptr ptr(new RegTree()); - ptr->LoadModel(fi); - trees.push_back(std::move(ptr)); - } - tree_info.resize(param.num_trees); - if (param.num_trees != 0) { - CHECK_EQ( - fi->Read(dmlc::BeginPtr(tree_info), sizeof(int) * param.num_trees), - sizeof(int) * param.num_trees); - } - } + void Load(dmlc::Stream* fi); + void Save(dmlc::Stream* fo) const; - void Save(dmlc::Stream* fo) const { - CHECK_EQ(param.num_trees, static_cast(trees.size())); - fo->Write(¶m, sizeof(param)); - for (const auto & tree : trees) { - tree->SaveModel(fo); - } - if (tree_info.size() != 0) { - fo->Write(dmlc::BeginPtr(tree_info), sizeof(int) * tree_info.size()); - } - } + void SaveModel(Json* p_out) const override; + void LoadModel(Json const& p_out) override; std::vector DumpModel(const FeatureMap& fmap, bool with_stats, std::string format) const { diff --git a/src/learner.cc b/src/learner.cc index da3cbe69adff..7a981f2577b4 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -30,6 +30,7 @@ #include "common/common.h" #include "common/io.h" +#include "common/observer.h" #include "common/random.h" #include "common/timer.h" #include "common/version.h" @@ -266,14 +267,114 @@ class LearnerImpl : public Learner { } } - void LoadModel(dmlc::Stream* fi) override { - // They are the same right now until we can split up the saved parameter from model. - this->Load(fi); + void LoadModel(Json const& in) override { + CHECK(IsA(in)); + Version::Load(in, false); + auto const& learner = get(in["Learner"]); + mparam_.FromJson(learner.at("learner_model_param")); + + auto const& objective_fn = learner.at("objective"); + + std::string name = get(objective_fn["name"]); + tparam_.UpdateAllowUnknown(Args{{"objective", name}}); + obj_.reset(ObjFunction::Create(name, &generic_parameters_)); + obj_->LoadConfig(objective_fn); + + auto const& gradient_booster = learner.at("gradient_booster"); + name = get(gradient_booster["name"]); + tparam_.UpdateAllowUnknown(Args{{"booster", name}}); + gbm_.reset(GradientBooster::Create(tparam_.booster, + &generic_parameters_, &learner_model_param_, + cache_)); + gbm_->LoadModel(gradient_booster); + + learner_model_param_ = LearnerModelParam(mparam_, + obj_->ProbToMargin(mparam_.base_score)); + + this->need_configuration_ = true; + } + + void SaveModel(Json* p_out) const override { + CHECK(!this->need_configuration_) << "Call Configure before saving model."; + + Version::Save(p_out); + Json& out { *p_out }; + + out["Learner"] = Object(); + auto& learner = out["Learner"]; + + learner["learner_model_param"] = mparam_.ToJson(); + learner["gradient_booster"] = Object(); + auto& gradient_booster = learner["gradient_booster"]; + gbm_->SaveModel(&gradient_booster); + + learner["objective"] = Object(); + auto& objective_fn = learner["objective"]; + obj_->SaveConfig(&objective_fn); } - void SaveModel(dmlc::Stream* fo) const override { - // They are the same right now until we can split up the saved parameter from model. 
- this->Save(fo); + void LoadConfig(Json const& in) override { + CHECK(IsA(in)); + Version::Load(in, true); + + auto const& learner_parameters = get(in["Learner"]); + fromJson(learner_parameters.at("learner_train_param"), &tparam_); + + auto const& gradient_booster = learner_parameters.at("gradient_booster"); + + auto const& objective_fn = learner_parameters.at("objective"); + if (!obj_) { + obj_.reset(ObjFunction::Create(tparam_.objective, &generic_parameters_)); + } + obj_->LoadConfig(objective_fn); + + tparam_.booster = get(gradient_booster["name"]); + if (!gbm_) { + gbm_.reset(GradientBooster::Create(tparam_.booster, + &generic_parameters_, &learner_model_param_, + cache_)); + } + gbm_->LoadConfig(gradient_booster); + + auto const& j_metrics = learner_parameters.at("metrics"); + auto n_metrics = get(j_metrics).size(); + metric_names_.resize(n_metrics); + metrics_.resize(n_metrics); + for (size_t i = 0; i < n_metrics; ++i) { + metric_names_[i]= get(j_metrics[i]); + metrics_[i] = std::unique_ptr( + Metric::Create(metric_names_.back(), &generic_parameters_)); + } + + fromJson(learner_parameters.at("generic_parameter"), &generic_parameters_); + + this->need_configuration_ = true; + } + + void SaveConfig(Json* p_out) const override { + CHECK(!this->need_configuration_) << "Call Configure before saving model."; + Version::Save(p_out); + Json& out { *p_out }; + // parameters + out["Learner"] = Object(); + auto& learner_parameters = out["Learner"]; + + learner_parameters["learner_train_param"] = toJson(tparam_); + learner_parameters["gradient_booster"] = Object(); + auto& gradient_booster = learner_parameters["gradient_booster"]; + gbm_->SaveConfig(&gradient_booster); + + learner_parameters["objective"] = Object(); + auto& objective_fn = learner_parameters["objective"]; + obj_->SaveConfig(&objective_fn); + + std::vector metrics(metrics_.size()); + for (size_t i = 0; i < metrics_.size(); ++i) { + metrics[i] = String(metrics_[i]->Name()); + } + learner_parameters["metrics"] = Array(metrics); + + learner_parameters["generic_parameter"] = toJson(generic_parameters_); } void Load(dmlc::Stream* fi) override { @@ -281,6 +382,7 @@ class LearnerImpl : public Learner { tparam_.Init(std::vector>{}); // TODO(tqchen) mark deprecation of old format. common::PeekableInStream fp(fi); + // backward compatible header check. std::string header; header.resize(4); @@ -291,6 +393,16 @@ class LearnerImpl : public Learner { CHECK_EQ(fp.Read(&header[0], 4), 4U); } } + + if (header[0] == '{') { + auto json_stream = common::FixedSizeStream(&fp); + std::string buffer; + json_stream.Take(&buffer); + auto memory_snapshot = Json::Load({buffer.c_str(), buffer.size()}); + this->LoadModel(memory_snapshot["Model"]); + this->LoadConfig(memory_snapshot["Config"]); + return; + } // use the peekable reader. fi = &fp; // read parameter @@ -323,43 +435,9 @@ class LearnerImpl : public Learner { std::vector > attr; fi->Read(&attr); for (auto& kv : attr) { - // Load `predictor`, `gpu_id` parameters from extra attributes const std::string prefix = "SAVED_PARAM_"; if (kv.first.find(prefix) == 0) { const std::string saved_param = kv.first.substr(prefix.length()); - bool is_gpu_predictor = saved_param == "predictor" && kv.second == "gpu_predictor"; -#ifdef XGBOOST_USE_CUDA - if (saved_param == "predictor" || saved_param == "gpu_id") { - cfg_[saved_param] = kv.second; - LOG(INFO) - << "Parameter '" << saved_param << "' has been recovered from " - << "the saved model. It will be set to " - << RenderParamVal(kv.second) << " for prediction. 
To " - << "override the predictor behavior, explicitly set '" - << saved_param << "' parameter as follows:\n" - << " * Python package: bst.set_param('" - << saved_param << "', [new value])\n" - << " * R package: xgb.parameters(bst) <- list(" - << saved_param << " = [new value])\n" - << " * JVM packages: bst.setParam(\"" - << saved_param << "\", [new value])"; - } -#else - if (is_gpu_predictor) { - cfg_["predictor"] = "cpu_predictor"; - kv.second = "cpu_predictor"; - } -#endif // XGBOOST_USE_CUDA -#if defined(XGBOOST_USE_CUDA) - // NO visible GPU in current environment - if (is_gpu_predictor && common::AllVisibleGPUs() == 0) { - cfg_["predictor"] = "cpu_predictor"; - kv.second = "cpu_predictor"; - LOG(INFO) << "Switch gpu_predictor to cpu_predictor."; - } else if (is_gpu_predictor) { - cfg_["predictor"] = "gpu_predictor"; - } -#endif // defined(XGBOOST_USE_CUDA) if (saved_configs_.find(saved_param) != saved_configs_.end()) { cfg_[saved_param] = kv.second; } @@ -407,6 +485,20 @@ class LearnerImpl : public Learner { // rabit save model to rabit checkpoint void Save(dmlc::Stream* fo) const override { + if (generic_parameters_.enable_experimental_json_serialization) { + Json memory_snapshot{Object()}; + memory_snapshot["Model"] = Object(); + auto &model = memory_snapshot["Model"]; + this->SaveModel(&model); + memory_snapshot["Config"] = Object(); + auto &config = memory_snapshot["Config"]; + this->SaveConfig(&config); + std::string out_str; + Json::Dump(memory_snapshot, &out_str); + fo->Write(out_str.c_str(), out_str.size()); + return; + } + if (this->need_configuration_) { // Save empty model. Calling Configure in a dummy LearnerImpl avoids violating // constness. @@ -432,7 +524,7 @@ class LearnerImpl : public Learner { } } { - std::vector saved_params{"predictor", "gpu_id"}; + std::vector saved_params; // check if rabit_bootstrap_cache were set to non zero before adding to checkpoint if (cfg_.find("rabit_bootstrap_cache") != cfg_.end() && (cfg_.find("rabit_bootstrap_cache"))->second != "0") { @@ -448,19 +540,6 @@ class LearnerImpl : public Learner { } } } -#if defined(XGBOOST_USE_CUDA) - { - // Force save gpu_id. - if (std::none_of(extra_attr.cbegin(), extra_attr.cend(), - [](std::pair const& it) { - return it.first == "SAVED_PARAM_gpu_id"; - })) { - mparam.contain_extra_attrs = 1; - extra_attr.emplace_back("SAVED_PARAM_gpu_id", - std::to_string(generic_parameters_.gpu_id)); - } - } -#endif // defined(XGBOOST_USE_CUDA) fo->Write(&mparam, sizeof(LearnerModelParamLegacy)); fo->Write(tparam_.objective); fo->Write(tparam_.booster); @@ -504,6 +583,7 @@ class LearnerImpl : public Learner { void UpdateOneIter(int iter, DMatrix* train) override { monitor_.Start("UpdateOneIter"); + TrainingObserver::Instance().Update(iter); this->Configure(); if (generic_parameters_.seed_per_iteration || rabit::IsDistributed()) { common::GlobalRandom().seed(generic_parameters_.seed * kRandSeedMagic + iter); @@ -514,9 +594,13 @@ class LearnerImpl : public Learner { monitor_.Start("PredictRaw"); this->PredictRaw(train, &preds_[train]); monitor_.Stop("PredictRaw"); + TrainingObserver::Instance().Observe(preds_[train], "Predictions"); + monitor_.Start("GetGradient"); obj_->GetGradient(preds_[train], train->Info(), iter, &gpair_); monitor_.Stop("GetGradient"); + TrainingObserver::Instance().Observe(gpair_, "Gradients"); + gbm_->DoBoost(train, &gpair_, obj_.get()); monitor_.Stop("UpdateOneIter"); } @@ -765,9 +849,9 @@ class LearnerImpl : public Learner { common::Monitor monitor_; - /*! 
\brief saved config keys used to restore failed worker */ + /*! \brief (Deprecated) saved config keys used to restore failed worker */ std::set saved_configs_ = {"max_depth", "tree_method", "dsplit", - "seed", "silent", "num_round", "gamma", "min_child_weight"}; + "seed", "num_round", "gamma", "min_child_weight"}; }; std::string const LearnerImpl::kEvalMetric {"eval_metric"}; // NOLINT diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 810906a2be60..49b2178c93ae 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -8,12 +8,15 @@ #include #include +#include + #include #include #include #include #include "param.h" +#include "../common/common.h" namespace xgboost { // register tree parameter @@ -615,7 +618,7 @@ std::string RegTree::DumpModel(const FeatureMap& fmap, return result; } -void RegTree::LoadModel(dmlc::Stream* fi) { +void RegTree::Load(dmlc::Stream* fi) { CHECK_EQ(fi->Read(¶m, sizeof(TreeParam)), sizeof(TreeParam)); nodes_.resize(param.num_nodes); stats_.resize(param.num_nodes); @@ -633,11 +636,7 @@ void RegTree::LoadModel(dmlc::Stream* fi) { } CHECK_EQ(static_cast(deleted_nodes_.size()), param.num_deleted); } -/*! - * \brief save model to stream - * \param fo output stream - */ -void RegTree::SaveModel(dmlc::Stream* fo) const { +void RegTree::Save(dmlc::Stream* fo) const { CHECK_EQ(param.num_nodes, static_cast(nodes_.size())); CHECK_EQ(param.num_nodes, static_cast(stats_.size())); fo->Write(¶m, sizeof(TreeParam)); @@ -646,6 +645,114 @@ void RegTree::SaveModel(dmlc::Stream* fo) const { fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size()); } +void RegTree::LoadModel(Json const& in) { + fromJson(in["tree_param"], ¶m); + auto n_nodes = param.num_nodes; + CHECK_NE(n_nodes, 0); + // stats + auto const& loss_changes = get(in["loss_changes"]); + CHECK_EQ(loss_changes.size(), n_nodes); + auto const& sum_hessian = get(in["sum_hessian"]); + CHECK_EQ(sum_hessian.size(), n_nodes); + auto const& base_weights = get(in["base_weights"]); + CHECK_EQ(base_weights.size(), n_nodes); + auto const& leaf_child_counts = get(in["leaf_child_counts"]); + CHECK_EQ(leaf_child_counts.size(), n_nodes); + // nodes + auto const& lefts = get(in["left_children"]); + CHECK_EQ(lefts.size(), n_nodes); + auto const& rights = get(in["right_children"]); + CHECK_EQ(rights.size(), n_nodes); + auto const& parents = get(in["parents"]); + CHECK_EQ(parents.size(), n_nodes); + auto const& indices = get(in["split_indices"]); + CHECK_EQ(indices.size(), n_nodes); + auto const& conds = get(in["split_conditions"]); + CHECK_EQ(conds.size(), n_nodes); + auto const& default_left = get(in["default_left"]); + CHECK_EQ(default_left.size(), n_nodes); + + stats_.resize(n_nodes); + nodes_.resize(n_nodes); + for (int32_t i = 0; i < n_nodes; ++i) { + auto& s = stats_[i]; + s.loss_chg = get(loss_changes[i]); + s.sum_hess = get(sum_hessian[i]); + s.base_weight = get(base_weights[i]); + s.leaf_child_cnt = get(leaf_child_counts[i]); + + auto& n = nodes_[i]; + auto left = get(lefts[i]); + auto right = get(rights[i]); + auto parent = get(parents[i]); + auto ind = get(indices[i]); + auto cond = get(conds[i]); + auto dft_left = get(default_left[i]); + n = Node(left, right, parent, ind, cond, dft_left); + } + + + deleted_nodes_.resize(0); + for (bst_node_t i = 1; i < param.num_nodes; ++i) { + if (nodes_[i].IsDeleted()) { + deleted_nodes_.push_back(i); + } + } + CHECK_EQ(static_cast(deleted_nodes_.size()), param.num_deleted); +} + +void RegTree::SaveModel(Json* p_out) const { + auto& out = *p_out; + 
CHECK_EQ(param.num_nodes, static_cast(nodes_.size())); + CHECK_EQ(param.num_nodes, static_cast(stats_.size())); + out["tree_param"] = toJson(param); + CHECK_EQ(get(out["tree_param"]["num_nodes"]), std::to_string(param.num_nodes)); + using I = Integer::Int; + auto n_nodes = param.num_nodes; + + // stats + std::vector loss_changes(n_nodes); + std::vector sum_hessian(n_nodes); + std::vector base_weights(n_nodes); + std::vector leaf_child_counts(n_nodes); + + // nodes + std::vector lefts(n_nodes); + std::vector rights(n_nodes); + std::vector parents(n_nodes); + std::vector indices(n_nodes); + std::vector conds(n_nodes); + std::vector default_left(n_nodes); + + for (int32_t i = 0; i < n_nodes; ++i) { + auto const& s = stats_[i]; + loss_changes[i] = s.loss_chg; + sum_hessian[i] = s.sum_hess; + base_weights[i] = s.base_weight; + leaf_child_counts[i] = static_cast(s.leaf_child_cnt); + + auto const& n = nodes_[i]; + lefts[i] = static_cast(n.LeftChild()); + rights[i] = static_cast(n.RightChild()); + parents[i] = static_cast(n.Parent()); + indices[i] = static_cast(n.SplitIndex()); + conds[i] = n.SplitCond(); + default_left[i] = n.DefaultLeft(); + } + + out["loss_changes"] = std::move(loss_changes); + out["sum_hessian"] = std::move(sum_hessian); + out["base_weights"] = std::move(base_weights); + out["leaf_child_counts"] = std::move(leaf_child_counts); + + out["left_children"] = std::move(lefts); + out["right_children"] = std::move(rights); + out["parents"] = std::move(parents); + out["split_indices"] = std::move(indices); + out["split_conditions"] = std::move(conds); + out["default_left"] = std::move(default_left); +} + void RegTree::FillNodeMeanValues() { size_t num_nodes = this->param.num_nodes; if (this->node_mean_values_.size() == num_nodes) { diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 9bbbbb5340cb..271bc8ae68f1 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -1110,12 +1110,12 @@ class GPUHistMakerSpecialised { common::MemoryBufferStream fs(&s_model); int rank = rabit::GetRank(); if (rank == 0) { - local_tree->SaveModel(&fs); + local_tree->Save(&fs); } fs.Seek(0); rabit::Broadcast(&s_model, 0); RegTree reference_tree {}; // rank 0 tree - reference_tree.LoadModel(&fs); + reference_tree.Load(&fs); CHECK(*local_tree == reference_tree); } @@ -1162,7 +1162,17 @@ class GPUHistMakerSpecialised { class GPUHistMaker : public TreeUpdater { public: void Configure(const Args& args) override { - hist_maker_param_.UpdateAllowUnknown(args); + bool changed {false}; + hist_maker_param_.UpdateAllowUnknown(args, &changed); + if (!changed && (float_maker_ || double_maker_)) { + if (hist_maker_param_.single_precision_histogram) { + float_maker_->Configure(args, tparam_); + } else { + double_maker_->Configure(args, tparam_); + } + return; + } + float_maker_.reset(); double_maker_.reset(); if (hist_maker_param_.single_precision_histogram) { diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index f02e8231a8f0..53a1f07c2dc3 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -41,7 +41,6 @@ void QuantileHistMaker::Configure(const Args& args) { } pruner_->Configure(args); param_.UpdateAllowUnknown(args); - is_gmat_initialized_ = false; // initialise the split evaluator if (!spliteval_) { diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc index 7d2240fbdde5..578bfb83cea9 100644 --- a/src/tree/updater_sync.cc +++ b/src/tree/updater_sync.cc @@ -40,13 +40,13 @@ class 
TreeSyncher: public TreeUpdater { int rank = rabit::GetRank(); if (rank == 0) { for (auto tree : trees) { - tree->SaveModel(&fs); + tree->Save(&fs); } } fs.Seek(0); rabit::Broadcast(&s_model, 0); for (auto tree : trees) { - tree->LoadModel(&fs); + tree->Load(&fs); } } }; diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index 3d096a1eaaf6..2f958ec35ae1 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -3,6 +3,9 @@ #include #include #include +#include + +#include "../helpers.h" TEST(c_api, XGDMatrixCreateFromMatDT) { std::vector col0 = {0, -1, 3}; @@ -70,3 +73,41 @@ TEST(c_api, Version) { XGBoostVersion(NULL, NULL, &patch); // NOLINT ASSERT_EQ(patch, XGBOOST_VER_PATCH); } + +namespace xgboost { + +TEST(c_api, ConfigIO) { + size_t constexpr kRows = 10; + auto pp_dmat = CreateDMatrix(kRows, 10, 0); + auto p_dmat = *pp_dmat; + std::vector> mat {p_dmat}; + std::vector labels(kRows); + for (size_t i = 0; i < labels.size(); ++i) { + labels[i] = i; + } + p_dmat->Info().labels_.HostVector() = labels; + + std::shared_ptr learner { Learner::Create(mat) }; + + BoosterHandle handle = learner.get(); + learner->UpdateOneIter(0, p_dmat.get()); + + char const* out[1]; + bst_ulong len {0}; + XGBoosterSaveJsonParameters(handle, &len, out); + + std::string config_str_0 { out[0] }; + auto config_0 = Json::Load({config_str_0.c_str(), config_str_0.size()}); + XGBoosterLoadJsonParameters(handle, out[0]); + + bst_ulong len_1 {0}; + std::string config_str_1 { out[0] }; + XGBoosterSaveJsonParameters(handle, &len_1, out); + auto config_1 = Json::Load({config_str_1.c_str(), config_str_1.size()}); + + ASSERT_EQ(config_0, config_1); + + delete pp_dmat; +} + +} // namespace xgboost diff --git a/tests/cpp/gbm/test_gblinear.cc b/tests/cpp/gbm/test_gblinear.cc new file mode 100644 index 000000000000..a63040d9d35f --- /dev/null +++ b/tests/cpp/gbm/test_gblinear.cc @@ -0,0 +1,57 @@ +/*! + * Copyright 2019 by Contributors + */ +#include + +#include +#include + +#include "../helpers.h" +#include "xgboost/json.h" +#include "xgboost/logging.h" +#include "xgboost/gbm.h" +#include "xgboost/generic_parameters.h" +#include "xgboost/learner.h" + +namespace xgboost { +namespace gbm { + +TEST(GBLinear, Json_IO) { + size_t constexpr kRows = 16, kCols = 16; + + LearnerModelParam param; + param.num_feature = kCols; + param.num_output_group = 1; + + GenericParameter gparam; + gparam.Init(Args{}); + + std::unique_ptr gbm { + CreateTrainedGBM("gblinear", Args{}, kRows, kCols, ¶m, &gparam) }; + Json model { Object() }; + gbm->SaveModel(&model); + ASSERT_TRUE(IsA(model)); + + std::string model_str; + Json::Dump(model, &model_str); + + model = Json::Load({model_str.c_str(), model_str.size()}); + ASSERT_TRUE(IsA(model)); + model = model["model"]; + + { + auto weights = get(model["weights"]); + ASSERT_EQ(weights.size(), 17); + } + + { + model = Json::Load({model_str.c_str(), model_str.size()}); + model = model["model"]; + auto weights = get(model["weights"]); + ASSERT_EQ(weights.size(), 17); // 16 + 1 (bias) + } + +} + +} // namespace gbm +} // namespace xgboost diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 22b972e4346e..31d78ddcaa7e 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -96,6 +96,79 @@ TEST(GBTree, ChoosePredictor) { } // data is not pulled back into host ASSERT_FALSE(data.HostCanWrite()); + + delete pp_dmat; +} +#endif // XGBOOST_USE_CUDA + +// Some other parts of test are in `Tree.Json_IO'. 
+
+// Some other parts of test are in `Tree.Json_IO'.
+TEST(GBTree, Json_IO) {
+  size_t constexpr kRows = 16, kCols = 16;
+
+  LearnerModelParam mparam;
+  mparam.num_feature = kCols;
+  mparam.num_output_group = 1;
+  mparam.base_score = 0.5;
+
+  GenericParameter gparam;
+  gparam.Init(Args{});
+
+  std::unique_ptr<GradientBooster> gbm {
+    CreateTrainedGBM("gbtree", Args{}, kRows, kCols, &mparam, &gparam) };
+
+  Json model {Object()};
+  model["model"] = Object();
+  auto& j_model = model["model"];
+  model["parameters"] = Object();
+  auto& j_param = model["parameters"];
+
+  gbm->SaveModel(&j_model);
+  gbm->SaveConfig(&j_param);
+
+  std::stringstream ss;
+  Json::Dump(model, &ss);
+
+  auto model_str = ss.str();
+  model = Json::Load({model_str.c_str(), model_str.size()});
+  ASSERT_EQ(get<String>(model["model"]["name"]), "gbtree");
+
+  auto j_train_param = model["parameters"]["gbtree_train_param"];
+  ASSERT_EQ(get<String>(j_train_param["num_parallel_tree"]), "1");
+}
+
+TEST(Dart, Json_IO) {
+  size_t constexpr kRows = 16, kCols = 16;
+
+  LearnerModelParam mparam;
+  mparam.num_feature = kCols;
+  mparam.base_score = 0.5;
+  mparam.num_output_group = 1;
+
+  GenericParameter gparam;
+  gparam.Init(Args{});
+
+  std::unique_ptr<GradientBooster> gbm {
+    CreateTrainedGBM("dart", Args{}, kRows, kCols, &mparam, &gparam) };
+
+  Json model {Object()};
+  model["model"] = Object();
+  auto& j_model = model["model"];
+  model["parameters"] = Object();
+  auto& j_param = model["parameters"];
+
+  gbm->SaveModel(&j_model);
+  gbm->SaveConfig(&j_param);
+
+  std::string model_str;
+  Json::Dump(model, &model_str);
+
+  model = Json::Load({model_str.c_str(), model_str.size()});
+  ASSERT_EQ(get<String>(model["model"]["name"]), "dart") << model;
+  ASSERT_EQ(get<String>(model["parameters"]["name"]), "dart");
+
+  {
+    auto const& gbtree = model["model"]["gbtree"];
+    ASSERT_TRUE(IsA<Object>(gbtree));
+  }
 }
-#endif
 }  // namespace xgboost
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 5f2ba1b23426..ffc8743b42d9 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -13,23 +13,6 @@
 #include "../helpers.h"
 #include "../../../src/gbm/gbtree_model.h"
 
-namespace {
-
-inline void CheckCAPICall(int ret) {
-  ASSERT_EQ(ret, 0) << XGBGetLastError();
-}
-
-}  // namespace anonymous
-
-const std::map<std::string, std::string>&
-QueryBoosterConfigurationArguments(BoosterHandle handle) {
-  CHECK_NE(handle, static_cast<void*>(nullptr));
-  auto* bst = static_cast<Learner*>(handle);
-  bst->Configure();
-  return bst->GetConfigurationArguments();
-}
-
-
 namespace xgboost {
 namespace predictor {
 
@@ -110,77 +93,5 @@ TEST(gpu_predictor, ExternalMemoryTest) {
     }
   }
 }
-
-// Test whether pickling preserves predictor parameters
-TEST(gpu_predictor, PicklingTest) {
-  int const gpuid = 0;
-
-  dmlc::TemporaryDirectory tempdir;
-  const std::string tmp_file = tempdir.path + "/simple.libsvm";
-  CreateBigTestData(tmp_file, 600);
-
-  DMatrixHandle dmat[1];
-  BoosterHandle bst, bst2;
-  std::vector<float> label;
-  for (int i = 0; i < 200; ++i) {
-    label.push_back((i % 2 ? 1 : 0));
-  }
-
-  // Load data matrix
-  ASSERT_EQ(XGDMatrixCreateFromFile(
-      tmp_file.c_str(), 0, &dmat[0]), 0) << XGBGetLastError();
-  ASSERT_EQ(XGDMatrixSetFloatInfo(
-      dmat[0], "label", label.data(), 200), 0) << XGBGetLastError();
-  // Create booster
-  ASSERT_EQ(XGBoosterCreate(dmat, 1, &bst), 0) << XGBGetLastError();
-  // Set parameters
-  ASSERT_EQ(XGBoosterSetParam(bst, "seed", "0"), 0) << XGBGetLastError();
-  ASSERT_EQ(XGBoosterSetParam(bst, "base_score", "0.5"), 0) << XGBGetLastError();
-  ASSERT_EQ(XGBoosterSetParam(bst, "booster", "gbtree"), 0) << XGBGetLastError();
-  ASSERT_EQ(XGBoosterSetParam(bst, "learning_rate", "0.01"), 0) << XGBGetLastError();
-  ASSERT_EQ(XGBoosterSetParam(bst, "max_depth", "8"), 0) << XGBGetLastError();
-  ASSERT_EQ(XGBoosterSetParam(
-      bst, "objective", "binary:logistic"), 0) << XGBGetLastError();
-  ASSERT_EQ(XGBoosterSetParam(bst, "seed", "123"), 0) << XGBGetLastError();
-  ASSERT_EQ(XGBoosterSetParam(
-      bst, "tree_method", "gpu_hist"), 0) << XGBGetLastError();
-  ASSERT_EQ(XGBoosterSetParam(
-      bst, "gpu_id", std::to_string(gpuid).c_str()), 0) << XGBGetLastError();
-  ASSERT_EQ(XGBoosterSetParam(bst, "predictor", "gpu_predictor"), 0) << XGBGetLastError();
-
-  // Run boosting iterations
-  for (int i = 0; i < 10; ++i) {
-    ASSERT_EQ(XGBoosterUpdateOneIter(bst, i, dmat[0]), 0) << XGBGetLastError();
-  }
-
-  // Delete matrix
-  CheckCAPICall(XGDMatrixFree(dmat[0]));
-
-  // Pickle
-  const char* dptr;
-  bst_ulong len;
-  std::string buf;
-  CheckCAPICall(XGBoosterGetModelRaw(bst, &len, &dptr));
-  buf = std::string(dptr, len);
-  CheckCAPICall(XGBoosterFree(bst));
-
-  // Unpickle
-  CheckCAPICall(XGBoosterCreate(nullptr, 0, &bst2));
-  CheckCAPICall(XGBoosterLoadModelFromBuffer(bst2, buf.c_str(), len));
-
-  {  // Query predictor
-    const auto& kwargs = QueryBoosterConfigurationArguments(bst2);
-    ASSERT_EQ(kwargs.at("predictor"), "gpu_predictor");
-    ASSERT_EQ(kwargs.at("gpu_id"), std::to_string(gpuid).c_str());
-  }
-
-  {  // Change predictor and query again
-    CheckCAPICall(XGBoosterSetParam(bst2, "predictor", "cpu_predictor"));
-    const auto& kwargs = QueryBoosterConfigurationArguments(bst2);
-    ASSERT_EQ(kwargs.at("predictor"), "cpu_predictor");
-  }
-
-  CheckCAPICall(XGBoosterFree(bst2));
-}
 }  // namespace predictor
 }  // namespace xgboost
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 2a1088029bd8..75bb80e8670f 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -6,6 +6,8 @@
 #include <dmlc/filesystem.h>
 #include <xgboost/learner.h>
+#include "xgboost/json.h"
+#include "../../src/common/io.h"
 
 namespace xgboost {
 
@@ -112,83 +114,51 @@ TEST(Learner, Configuration) {
   }
 }
 
-TEST(Learner, ObjectiveParameter) {
-  using Arg = std::pair<std::string, std::string>;
-  size_t constexpr kRows = 10;
-  auto pp_dmat = CreateDMatrix(kRows, 10, 0);
-  auto p_dmat = *pp_dmat;
+TEST(Learner, Json_ModelIO) {
+  // Test of comparing JSON object directly.
+  size_t constexpr kRows = 8;
+  int32_t constexpr kIters = 4;
 
-  std::vector<bst_float> labels(kRows);
-  for (size_t i = 0; i < labels.size(); ++i) {
-    labels[i] = i;
-  }
-  p_dmat->Info().labels_.HostVector() = labels;
-  std::vector<std::shared_ptr<DMatrix>> mat {p_dmat};
-
-  std::unique_ptr<Learner> learner {Learner::Create(mat)};
-  learner->SetParams({Arg{"tree_method", "auto"},
-                      Arg{"objective", "multi:softprob"},
-                      Arg{"num_class", "10"}});
-  learner->UpdateOneIter(0, p_dmat.get());
-  auto attr_names = learner->GetConfigurationArguments();
-  ASSERT_EQ(attr_names.at("objective"), "multi:softprob");
-
-  dmlc::TemporaryDirectory tempdir;
-  const std::string fname = tempdir.path + "/model_para.bst";
+  auto pp_dmat = CreateDMatrix(kRows, 10, 0);
+  std::shared_ptr<DMatrix> p_dmat {*pp_dmat};
+  p_dmat->Info().labels_.Resize(kRows);
 
   {
-    // Create a scope to close the stream before next read.
-    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
-    learner->Save(fo.get());
-  }
+    std::unique_ptr<Learner> learner { Learner::Create({p_dmat}) };
+    learner->Configure();
+    Json out { Object() };
+    learner->SaveModel(&out);
 
-  std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
-  std::unique_ptr<Learner> learner1 {Learner::Create(mat)};
-  learner1->Load(fi.get());
-  auto attr_names1 = learner1->GetConfigurationArguments();
-  ASSERT_EQ(attr_names1.at("objective"), "multi:softprob");
+    learner->LoadModel(out);
+    learner->Configure();
 
-  delete pp_dmat;
-}
-
-#if defined(XGBOOST_USE_CUDA)
-
-TEST(Learner, IO) {
-  using Arg = std::pair<std::string, std::string>;
-  size_t constexpr kRows = 10;
-  auto pp_dmat = CreateDMatrix(kRows, 10, 0);
-  auto p_dmat = *pp_dmat;
-
-  std::vector<bst_float> labels(kRows);
-  for (size_t i = 0; i < labels.size(); ++i) {
-    labels[i] = i;
+    Json new_in { Object() };
+    learner->SaveModel(&new_in);
+    ASSERT_EQ(new_in, out);
   }
-  p_dmat->Info().labels_.HostVector() = labels;
-  std::vector<std::shared_ptr<DMatrix>> mat {p_dmat};
-  std::unique_ptr<Learner> learner {Learner::Create(mat)};
-  learner->SetParams({Arg{"tree_method", "auto"},
-                      Arg{"predictor", "gpu_predictor"},
-                      Arg{"gpu_id", "0"}});
-  learner->UpdateOneIter(0, p_dmat.get());
-  ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
+  {
+    std::unique_ptr<Learner> learner { Learner::Create({p_dmat}) };
+    learner->SetParam("verbosity", "3");
+    for (int32_t iter = 0; iter < kIters; ++iter) {
+      learner->UpdateOneIter(iter, p_dmat.get());
+    }
 
-  dmlc::TemporaryDirectory tempdir;
-  const std::string fname = tempdir.path + "/model.bst";
+    Json out { Object() };
+    learner->SaveModel(&out);
 
-  {
-    // Create a scope to close the stream before next read.
-    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
-    learner->Save(fo.get());
-  }
+    learner->LoadModel(out);
+    Json new_in { Object() };
+    learner->Configure();
+    learner->SaveModel(&new_in);
 
-  std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
-  learner->Load(fi.get());
-  ASSERT_EQ(learner->GetGenericParameter().gpu_id, 0);
+    ASSERT_EQ(out, new_in);
+  }
 
   delete pp_dmat;
 }
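`Learner.Json_ModelIO` above asserts that save, load, and save again produce an identical JSON object. The same round-trip stability can be sketched from the Python binding (file names are arbitrary; the C++ test compares the parsed documents, which is what is done here):

.. code-block:: python

    import json
    import numpy as np
    import xgboost as xgb

    X, y = np.random.random((8, 4)), np.random.random(8)
    bst = xgb.train({}, xgb.DMatrix(X, label=y), num_boost_round=4)

    bst.save_model('model-0.json')
    reloaded = xgb.Booster(model_file='model-0.json')
    reloaded.save_model('model-1.json')

    with open('model-0.json') as f0, open('model-1.json') as f1:
        assert json.load(f0) == json.load(f1)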
+#if defined(XGBOOST_USE_CUDA)
 // Tests for automatic GPU configuration.
 TEST(Learner, GPUConfiguration) {
   using Arg = std::pair<std::string, std::string>;
@@ -242,6 +212,5 @@ TEST(Learner, GPUConfiguration) {
 
   delete pp_dmat;
 }
-#endif  // XGBOOST_USE_CUDA
-
+#endif  // defined(XGBOOST_USE_CUDA)
 }  // namespace xgboost
diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc
new file mode 100644
index 000000000000..a91be3d83105
--- /dev/null
+++ b/tests/cpp/test_serialization.cc
@@ -0,0 +1,593 @@
+#include <gtest/gtest.h>
+#include <dmlc/filesystem.h>
+#include <xgboost/learner.h>
+#include <string>
+#include <random>
+#include "helpers.h"
+#include "../../src/common/io.h"
+#include "../../src/common/random.h"
+
+namespace xgboost {
+
+void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr<DMatrix> p_dmat) {
+  for (auto& batch : p_dmat->GetBatches<SparsePage>()) {
+    batch.data.HostVector();
+    batch.offset.HostVector();
+  }
+
+  int32_t constexpr kIters = 2;
+
+  dmlc::TemporaryDirectory tempdir;
+  std::string const fname = tempdir.path + "/model";
+
+  std::vector<std::string> dumped_0;
+  std::string model_at_kiter;
+
+  {
+    std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
+    std::unique_ptr<Learner> learner {Learner::Create({p_dmat})};
+    learner->SetParams(args);
+    for (int32_t iter = 0; iter < kIters; ++iter) {
+      learner->UpdateOneIter(iter, p_dmat.get());
+    }
+    dumped_0 = learner->DumpModel(fmap, true, "json");
+    learner->Save(fo.get());
+
+    common::MemoryBufferStream mem_out(&model_at_kiter);
+    learner->Save(&mem_out);
+  }
+
+  std::vector<std::string> dumped_1;
+  {
+    std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
+    std::unique_ptr<Learner> learner {Learner::Create({p_dmat})};
+    learner->Load(fi.get());
+    learner->Configure();
+    dumped_1 = learner->DumpModel(fmap, true, "json");
+  }
+  ASSERT_EQ(dumped_0, dumped_1);
+
+  std::string model_at_2kiter;
+
+  // Test training continuation with data from host
+  {
+    std::string continued_model;
+    {
+      // Continue the previous training with another kIters
+      std::unique_ptr<dmlc::Stream> fi(
+          dmlc::Stream::Create(fname.c_str(), "r"));
+      std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
+      learner->Load(fi.get());
+      learner->Configure();
+
+      // verify the loaded model doesn't change.
+      std::string serialised_model_tmp;
+      common::MemoryBufferStream mem_out(&serialised_model_tmp);
+      learner->Save(&mem_out);
+      ASSERT_EQ(model_at_kiter, serialised_model_tmp);
+
+      for (auto &batch : p_dmat->GetBatches<SparsePage>()) {
+        batch.data.HostVector();
+        batch.offset.HostVector();
+      }
+
+      for (int32_t iter = kIters; iter < 2 * kIters; ++iter) {
+        learner->UpdateOneIter(iter, p_dmat.get());
+      }
+      common::MemoryBufferStream fo(&continued_model);
+      learner->Save(&fo);
+    }
+
+    {
+      // Train 2 * kIters in one go
+      std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
+      learner->SetParams(args);
+      for (int32_t iter = 0; iter < 2 * kIters; ++iter) {
+        learner->UpdateOneIter(iter, p_dmat.get());
+
+        // Verify model is same at the same iteration during two training
+        // sessions.
+        if (iter == kIters - 1) {
+          std::string reproduced_model;
+          common::MemoryBufferStream fo(&reproduced_model);
+          learner->Save(&fo);
+          ASSERT_EQ(model_at_kiter, reproduced_model);
+        }
+      }
+      common::MemoryBufferStream fo(&model_at_2kiter);
+      learner->Save(&fo);
+    }
+    Json m_0 = Json::Load(StringView{continued_model.c_str(), continued_model.size()});
+    Json m_1 = Json::Load(StringView{model_at_2kiter.c_str(), model_at_2kiter.size()});
+    ASSERT_EQ(m_0, m_1);
+  }
+
+  // Test training continuation with data from device.
+  {
+    // Continue the previous training but on data from device.
+    std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(fname.c_str(), "r"));
+    std::unique_ptr<Learner> learner{Learner::Create({p_dmat})};
+    learner->Load(fi.get());
+    learner->Configure();
+
+    // verify the loaded model doesn't change.
+    std::string serialised_model_tmp;
+    common::MemoryBufferStream mem_out(&serialised_model_tmp);
+    learner->Save(&mem_out);
+    ASSERT_EQ(model_at_kiter, serialised_model_tmp);
+
+    learner->SetParam("gpu_id", "0");
+    // Pull data to device
+    for (auto &batch : p_dmat->GetBatches<SparsePage>()) {
+      batch.data.SetDevice(0);
+      batch.data.DeviceSpan();
+      batch.offset.SetDevice(0);
+      batch.offset.DeviceSpan();
+    }
+
+    for (int32_t iter = kIters; iter < 2 * kIters; ++iter) {
+      learner->UpdateOneIter(iter, p_dmat.get());
+    }
+    serialised_model_tmp = std::string{};
+    common::MemoryBufferStream fo(&serialised_model_tmp);
+    learner->Save(&fo);
+
+    Json m_0 = Json::Load(StringView{model_at_2kiter.c_str(), model_at_2kiter.size()});
+    Json m_1 = Json::Load(StringView{serialised_model_tmp.c_str(), serialised_model_tmp.size()});
+    // GPU ID is changed as data is coming from device.
+    ASSERT_EQ(get<Object>(m_0["Config"]["Learner"]["generic_parameter"]).erase("gpu_id"),
+              get<Object>(m_1["Config"]["Learner"]["generic_parameter"]).erase("gpu_id"));
+  }
+}
+
+// Binary is not tested, as it is NOT reproducible.
+class SerializationTest : public ::testing::Test {
+ protected:
+  size_t constexpr static kRows = 10;
+  size_t constexpr static kCols = 10;
+  std::shared_ptr<DMatrix>* pp_dmat_;
+  FeatureMap fmap_;
+
+ protected:
+  ~SerializationTest() override {
+    delete pp_dmat_;
+  }
+  void SetUp() override {
+    pp_dmat_ = CreateDMatrix(kRows, kCols, .5f);
+
+    std::shared_ptr<DMatrix> p_dmat{*pp_dmat_};
+    p_dmat->Info().labels_.Resize(kRows);
+    auto &h_labels = p_dmat->Info().labels_.HostVector();
+
+    xgboost::SimpleLCG gen(0);
+    SimpleRealUniformDistribution<bst_float> dis(0.0f, 1.0f);
+
+    for (auto& v : h_labels) { v = dis(&gen); }
+
+    for (size_t i = 0; i < kCols; ++i) {
+      std::string name = "feat_" + std::to_string(i);
+      fmap_.PushBack(i, name.c_str(), "q");
+    }
+  }
+};
+
+TEST_F(SerializationTest, Exact) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "exact"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"num_parallel_tree", "4"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "exact"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "exact"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(SerializationTest, Approx) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "approx"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"num_parallel_tree", "4"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "approx"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "approx"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(SerializationTest, Hist) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"num_parallel_tree", "4"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "hist"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(SerializationTest, CPU_CoordDescent) {
+  TestLearnerSerialization({{"booster", "gblinear"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"updater", "coord_descent"}},
+                           fmap_, *pp_dmat_);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST_F(SerializationTest, GPU_Hist) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"seed", "0"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"tree_method", "gpu_hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"seed", "0"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"num_parallel_tree", "4"},
+                            {"tree_method", "gpu_hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"seed", "0"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"tree_method", "gpu_hist"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(SerializationTest, GPU_CoordDescent) {
+  TestLearnerSerialization({{"booster", "gblinear"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"updater", "gpu_coord_descent"}},
+                           fmap_, *pp_dmat_);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
+
+
+class LogitSerializationTest : public SerializationTest {
+ protected:
+  void SetUp() override {
+    pp_dmat_ = CreateDMatrix(kRows, kCols, .5f);
+
+    std::shared_ptr<DMatrix> p_dmat{*pp_dmat_};
+    p_dmat->Info().labels_.Resize(kRows);
+    auto &h_labels = p_dmat->Info().labels_.HostVector();
+
+    std::bernoulli_distribution flip(0.5);
+    auto& rnd = common::GlobalRandom();
+    rnd.seed(0);
+
+    for (auto& v : h_labels) { v = flip(rnd); }
+
+    for (size_t i = 0; i < kCols; ++i) {
+      std::string name = "feat_" + std::to_string(i);
+      fmap_.PushBack(i, name.c_str(), "q");
+    }
+  }
+};
+
+TEST_F(LogitSerializationTest, Exact) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"objective", "binary:logistic"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "exact"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"objective", "binary:logistic"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "exact"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(LogitSerializationTest, Approx) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"objective", "binary:logistic"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "approx"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"objective", "binary:logistic"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "approx"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(LogitSerializationTest, Hist) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"objective", "binary:logistic"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"objective", "binary:logistic"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "hist"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(LogitSerializationTest, CPU_CoordDescent) {
+  TestLearnerSerialization({{"booster", "gblinear"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"updater", "coord_descent"}},
+                           fmap_, *pp_dmat_);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST_F(LogitSerializationTest, GPU_Hist) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"objective", "binary:logistic"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "gpu_hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"objective", "binary:logistic"},
+                            {"seed", "0"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"num_parallel_tree", "4"},
+                            {"tree_method", "gpu_hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"objective", "binary:logistic"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", "2"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "gpu_hist"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(LogitSerializationTest, GPU_CoordDescent) {
+  TestLearnerSerialization({{"booster", "gblinear"},
+                            {"objective", "binary:logistic"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"updater", "gpu_coord_descent"}},
+                           fmap_, *pp_dmat_);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
+
+class MultiClassesSerializationTest : public SerializationTest {
+ protected:
+  size_t constexpr static kClasses = 4;
+
+  void SetUp() override {
+    pp_dmat_ = CreateDMatrix(kRows, kCols, .5f);
+
+    std::shared_ptr<DMatrix> p_dmat{*pp_dmat_};
+    p_dmat->Info().labels_.Resize(kRows);
+    auto &h_labels = p_dmat->Info().labels_.HostVector();
+
+    std::uniform_int_distribution<size_t> categorical(0, kClasses - 1);
+    auto& rnd = common::GlobalRandom();
+    rnd.seed(0);
+
+    for (auto& v : h_labels) { v = categorical(rnd); }
+
+    for (size_t i = 0; i < kCols; ++i) {
+      std::string name = "feat_" + std::to_string(i);
+      fmap_.PushBack(i, name.c_str(), "q");
+    }
+  }
+};
+
+TEST_F(MultiClassesSerializationTest, Exact) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "exact"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            {"num_parallel_tree", "4"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "exact"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "exact"}},
+                           fmap_, *pp_dmat_);
+}
+
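`MultiClassesSerializationTest` repeats the same checks with `num_class` set, so the saved objective configuration matters as much as the trees. A minimal sketch of the equivalent user-level round trip (class count and file name are illustrative):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    kClasses = 4
    X = np.random.random((64, 10))
    y = np.random.randint(0, kClasses, size=64)
    dtrain = xgb.DMatrix(X, label=y)

    bst = xgb.train({'num_class': kClasses, 'tree_method': 'hist',
                     'enable_experimental_json_serialization': True}, dtrain)
    bst.save_model('multi.json')

    bst2 = xgb.Booster(model_file='multi.json')
    np.testing.assert_allclose(bst.predict(dtrain), bst2.predict(dtrain))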
+TEST_F(MultiClassesSerializationTest, Approx) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "approx"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "approx"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(MultiClassesSerializationTest, Hist) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"num_parallel_tree", "4"},
+                            {"tree_method", "hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "hist"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(MultiClassesSerializationTest, CPU_CoordDescent) {
+  TestLearnerSerialization({{"booster", "gblinear"},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"updater", "coord_descent"}},
+                           fmap_, *pp_dmat_);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST_F(MultiClassesSerializationTest, GPU_Hist) {
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "gpu_hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "gbtree"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            // GPU_Hist has higher floating point error.  1e-6 doesn't work
+                            // after num_parallel_tree goes to 4
+                            {"num_parallel_tree", "3"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "gpu_hist"}},
+                           fmap_, *pp_dmat_);
+
+  TestLearnerSerialization({{"booster", "dart"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"max_depth", std::to_string(kClasses)},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"tree_method", "gpu_hist"}},
+                           fmap_, *pp_dmat_);
+}
+
+TEST_F(MultiClassesSerializationTest, GPU_CoordDescent) {
+  TestLearnerSerialization({{"booster", "gblinear"},
+                            {"num_class", std::to_string(kClasses)},
+                            {"seed", "0"},
+                            {"nthread", "1"},
+                            {"enable_experimental_json_serialization", "1"},
+                            {"updater", "gpu_coord_descent"}},
+                           fmap_, *pp_dmat_);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
+
+}  // namespace xgboost
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 60a8dd886b98..c4bc29ba5c1a 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -252,6 +252,7 @@ void TestHistogramIndexImpl() {
   // Build 2 matrices and build a histogram maker with that
   tree::GPUHistMakerSpecialised<GradientSumT> hist_maker, hist_maker_ext;
+
   std::unique_ptr<DMatrix> hist_maker_dmat(
       CreateSparsePageDMatrixWithRC(kNRows, kNCols, 0, true));
diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc
index 867fc22cc182..39cc5493736b 100644
--- a/tests/cpp/tree/test_tree_model.cc
+++ b/tests/cpp/tree/test_tree_model.cc
@@ -3,6 +3,7 @@
 #include <xgboost/tree_model.h>
 #include "../helpers.h"
 #include "dmlc/filesystem.h"
+#include "xgboost/json_io.h"
 
 namespace xgboost {
 // Manually construct tree in binary format
@@ -77,7 +78,7 @@ TEST(Tree, Load) {
   std::unique_ptr<dmlc::Stream> fi(dmlc::Stream::Create(tmp_file.c_str(), "r"));
   xgboost::RegTree tree;
-  tree.LoadModel(fi.get());
+  tree.Load(fi.get());
   EXPECT_EQ(tree.GetDepth(1), 1);
   EXPECT_EQ(tree[0].SplitCond(), 0.5f);
   EXPECT_EQ(tree[0].SplitIndex(), 5);
@@ -218,4 +219,30 @@ TEST(Tree, DumpDot) {
   str = tree.DumpModel(fmap, true, R"(dot:{"graph_attrs": {"bgcolor": "#FFFF00"}})");
   ASSERT_NE(str.find(R"(graph [ bgcolor="#FFFF00" ])"), std::string::npos);
 }
+
+TEST(Tree, Json_IO) {
+  RegTree tree;
+  tree.ExpandNode(0, 0, 0.0f, false, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f);
+  Json j_tree{Object()};
+  tree.SaveModel(&j_tree);
+  std::stringstream ss;
+  Json::Dump(j_tree, &ss);
+
+  auto tparam = j_tree["tree_param"];
+  ASSERT_EQ(get<String>(tparam["num_feature"]), "0");
+  ASSERT_EQ(get<String>(tparam["num_nodes"]), "3");
+  ASSERT_EQ(get<String>(tparam["size_leaf_vector"]), "0");
+
+  ASSERT_EQ(get<Array>(j_tree["left_children"]).size(), 3);
+  ASSERT_EQ(get<Array>(j_tree["right_children"]).size(), 3);
+  ASSERT_EQ(get<Array>(j_tree["parents"]).size(), 3);
+  ASSERT_EQ(get<Array>(j_tree["split_indices"]).size(), 3);
+  ASSERT_EQ(get<Array>(j_tree["split_conditions"]).size(), 3);
+  ASSERT_EQ(get<Array>(j_tree["default_left"]).size(), 3);
+
+  RegTree loaded_tree;
+  loaded_tree.LoadModel(j_tree);
+  ASSERT_EQ(loaded_tree.param.num_nodes, 3);
+}
+
 }  // namespace xgboost
diff --git a/tests/python-gpu/load_pickle.py b/tests/python-gpu/load_pickle.py
index 45d20aae4547..6fb9cb07c99c 100644
--- a/tests/python-gpu/load_pickle.py
+++ b/tests/python-gpu/load_pickle.py
@@ -1,20 +1,38 @@
-'''Loading a pickled model generated by test_pickling.py'''
-import pickle
+'''Loading a pickled model generated by test_pickling.py, only used by
+`test_gpu_with_dask.py`'''
 import unittest
 import os
 import xgboost as xgb
-import sys
+import json
 
-sys.path.append("tests/python")
-from test_pickling import build_dataset, model_path
+from test_gpu_pickling import build_dataset, model_path, load_pickle
 
 
 class TestLoadPickle(unittest.TestCase):
     def test_load_pkl(self):
-        assert os.environ['CUDA_VISIBLE_DEVICES'] == ''
-        with open(model_path, 'rb') as fd:
-            bst = pickle.load(fd)
+        '''Test prediction correct.'''
+        assert os.environ['CUDA_VISIBLE_DEVICES'] == '-1'
+        bst = load_pickle(model_path)
         x, y = build_dataset()
         test_x = xgb.DMatrix(x)
         res = bst.predict(test_x)
         assert len(res) == 10
+
+    def test_predictor_type_is_auto(self):
+        '''Under empty CUDA_VISIBLE_DEVICES, predictor should be set to auto'''
+        assert os.environ['CUDA_VISIBLE_DEVICES'] == '-1'
+        bst = load_pickle(model_path)
+        config = bst.save_config()
+        config = json.loads(config)
+        assert config['Learner']['gradient_booster']['gbtree_train_param'][
+            'predictor'] == 'auto'
+
+    def test_predictor_type_is_gpu(self):
+        '''When CUDA_VISIBLE_DEVICES is not specified, keep using
+        `gpu_predictor`'''
+        assert 'CUDA_VISIBLE_DEVICES' not in os.environ.keys()
+        bst = load_pickle(model_path)
+        config = bst.save_config()
+        config = json.loads(config)
+        assert config['Learner']['gradient_booster']['gbtree_train_param'][
+            'predictor'] == 'gpu_predictor'
diff --git a/tests/python-gpu/test_pickling.py b/tests/python-gpu/test_gpu_pickling.py
similarity index 56%
rename from tests/python-gpu/test_pickling.py
rename to tests/python-gpu/test_gpu_pickling.py
index 9c077e3155a9..9c5157c2e30f 100644
--- a/tests/python-gpu/test_pickling.py
+++ b/tests/python-gpu/test_gpu_pickling.py
@@ -4,7 +4,7 @@
 import numpy as np
 import subprocess
 import os
-import sys
+import json
 
 import xgboost as xgb
 from xgboost import XGBClassifier
@@ -39,18 +39,17 @@ def test_pickling(self):
         bst = xgb.train(param, train_x)
         save_pickle(bst, model_path)
 
-        args = ["pytest",
-                "--verbose",
-                "-s",
-                "--fulltrace",
-                "./tests/python-gpu/load_pickle.py"]
+        args = [
+            "pytest", "--verbose", "-s", "--fulltrace",
+            "./tests/python-gpu/load_pickle.py::TestLoadPickle::test_load_pkl"
+        ]
         command = ''
         for arg in args:
             command += arg
             command += ' '
 
-        cuda_environment = {'CUDA_VISIBLE_DEVICES': ''}
-        env = os.environ
+        cuda_environment = {'CUDA_VISIBLE_DEVICES': '-1'}
+        env = os.environ.copy()
         # Passing new_environment directly to `env' argument results
         # in failure on Windows:
         # Fatal Python error: _Py_HashRandomization_Init: failed to
@@ -62,12 +61,55 @@ def test_pickling(self):
         assert status == 0
         os.remove(model_path)
 
+    def test_pickled_predictor(self):
+        args_template = [
+            "pytest",
+            "--verbose",
+            "-s",
+            "--fulltrace"]
+
+        x, y = build_dataset()
+        train_x = xgb.DMatrix(x, label=y)
+
+        param = {'tree_method': 'gpu_hist',
+                 'verbosity': 1, 'predictor': 'gpu_predictor'}
+        bst = xgb.train(param, train_x)
+        config = json.loads(bst.save_config())
+        assert config['Learner']['gradient_booster']['gbtree_train_param'][
+            'predictor'] == 'gpu_predictor'
+
+        save_pickle(bst, model_path)
+
+        args = args_template.copy()
+        args.append(
+            "./tests/python-gpu/"
+            "load_pickle.py::TestLoadPickle::test_predictor_type_is_auto")
+
+        cuda_environment = {'CUDA_VISIBLE_DEVICES': '-1'}
+        env = os.environ.copy()
+        env.update(cuda_environment)
+
+        # Load model in a CPU only environment.
+        status = subprocess.call(args, env=env)
+        assert status == 0
+
+        args = args_template.copy()
+        args.append(
+            "./tests/python-gpu/"
+            "load_pickle.py::TestLoadPickle::test_predictor_type_is_gpu")
+
+        # Load in environment that has GPU.
+        env = os.environ.copy()
+        assert 'CUDA_VISIBLE_DEVICES' not in env.keys()
+        status = subprocess.call(args, env=env)
+        assert status == 0
+
     def test_predict_sklearn_pickle(self):
         x, y = build_dataset()
         kwargs = {'tree_method': 'gpu_hist',
                   'predictor': 'gpu_predictor',
-                  'verbosity': 2,
+                  'verbosity': 1,
                   'objective': 'binary:logistic',
                   'n_estimators': 10}
diff --git a/tests/python-gpu/test_gpu_training_continuation.py b/tests/python-gpu/test_gpu_training_continuation.py
index 3d4b053dff63..ac52fe464747 100644
--- a/tests/python-gpu/test_gpu_training_continuation.py
+++ b/tests/python-gpu/test_gpu_training_continuation.py
@@ -7,23 +7,25 @@
 
 
 class TestGPUTrainingContinuation(unittest.TestCase):
-    def test_training_continuation_binary(self):
-        kRows = 32
-        kCols = 16
+    def run_training_continuation(self, use_json):
+        kRows = 64
+        kCols = 32
         X = np.random.randn(kRows, kCols)
         y = np.random.randn(kRows)
         dtrain = xgb.DMatrix(X, y)
-        params = {'tree_method': 'gpu_hist', 'max_depth': '2'}
-        bst_0 = xgb.train(params, dtrain, num_boost_round=4)
+        params = {'tree_method': 'gpu_hist', 'max_depth': '2',
+                  'gamma': '0.1', 'alpha': '0.01',
+                  'enable_experimental_json_serialization': use_json}
+        bst_0 = xgb.train(params, dtrain, num_boost_round=64)
         dump_0 = bst_0.get_dump(dump_format='json')
 
-        bst_1 = xgb.train(params, dtrain, num_boost_round=2)
-        bst_1 = xgb.train(params, dtrain, num_boost_round=2, xgb_model=bst_1)
+        bst_1 = xgb.train(params, dtrain, num_boost_round=32)
+        bst_1 = xgb.train(params, dtrain, num_boost_round=32, xgb_model=bst_1)
         dump_1 = bst_1.get_dump(dump_format='json')
 
         def recursive_compare(obj_0, obj_1):
             if isinstance(obj_0, float):
-                assert np.isclose(obj_0, obj_1)
+                assert np.isclose(obj_0, obj_1, atol=1e-6)
             elif isinstance(obj_0, str):
                 assert obj_0 == obj_1
             elif isinstance(obj_0, int):
@@ -42,7 +44,14 @@ def recursive_compare(obj_0, obj_1):
                 for i in range(len(obj_0)):
                     recursive_compare(obj_0[i], obj_1[i])
 
+        assert len(dump_0) == len(dump_1)
         for i in range(len(dump_0)):
             obj_0 = json.loads(dump_0[i])
             obj_1 = json.loads(dump_1[i])
             recursive_compare(obj_0, obj_1)
+
+    def test_gpu_training_continuation_binary(self):
+        self.run_training_continuation(False)
+
+    def test_gpu_training_continuation_json(self):
+        self.run_training_continuation(True)
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index ab2838cff710..09df895e5c24 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -1,6 +1,8 @@
 import numpy as np
 import xgboost as xgb
 import unittest
+import os
+import json
 
 dpath = 'demo/data/'
 dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
@@ -200,3 +202,23 @@ def test_feature_names_validation(self):
         bst.predict(dm2)  # success
         self.assertRaises(ValueError, bst.predict, dm1)
         bst.predict(dm2)  # success
+
+    def test_json_model_io(self):
+        X = np.random.random((10, 3))
+        y = np.random.randint(2, size=(10,))
+
+        dm1 = xgb.DMatrix(X, y)
+        bst = xgb.train({'tree_method': 'hist'}, dm1)
+        bst.save_model('./model.json')
+
+        with open('./model.json', 'r') as fd:
+            j_model = json.load(fd)
+        assert isinstance(j_model['Learner'], dict)
+
+        bst = xgb.Booster(model_file='./model.json')
+
+        with open('./model.json', 'r') as fd:
+            j_model = json.load(fd)
+        assert isinstance(j_model['Learner'], dict)
+
+        os.remove('model.json')
diff --git a/tests/python/test_pickling.py b/tests/python/test_pickling.py
index 1497688d2093..be4b9c743e84 100644
--- a/tests/python/test_pickling.py
+++ b/tests/python/test_pickling.py
@@ -2,6 +2,7 @@
 import numpy as np
 import xgboost as xgb
 import os
+import unittest
 
 
 kRows = 100
@@ -14,35 +15,45 @@ def generate_data():
     return X, y
 
 
-def test_model_pickling():
-    xgb_params = {
-        'verbosity': 0,
-        'nthread': 1,
-        'tree_method': 'hist'
-    }
+class TestPickling(unittest.TestCase):
+    def run_model_pickling(self, xgb_params):
+        X, y = generate_data()
+        dtrain = xgb.DMatrix(X, y)
+        bst = xgb.train(xgb_params, dtrain)
 
-    X, y = generate_data()
-    dtrain = xgb.DMatrix(X, y)
-    bst = xgb.train(xgb_params, dtrain)
+        dump_0 = bst.get_dump(dump_format='json')
+        assert dump_0
 
-    dump_0 = bst.get_dump(dump_format='json')
-    assert dump_0
+        filename = 'model.pkl'
 
-    filename = 'model.pkl'
+        with open(filename, 'wb') as fd:
+            pickle.dump(bst, fd)
 
-    with open(filename, 'wb') as fd:
-        pickle.dump(bst, fd)
+        with open(filename, 'rb') as fd:
+            bst = pickle.load(fd)
 
-    with open(filename, 'rb') as fd:
-        bst = pickle.load(fd)
+        with open(filename, 'wb') as fd:
+            pickle.dump(bst, fd)
 
-    with open(filename, 'wb') as fd:
-        pickle.dump(bst, fd)
+        with open(filename, 'rb') as fd:
+            bst = pickle.load(fd)
 
-    with open(filename, 'rb') as fd:
-        bst = pickle.load(fd)
+        assert bst.get_dump(dump_format='json') == dump_0
 
-    assert bst.get_dump(dump_format='json') == dump_0
+        if os.path.exists(filename):
+            os.remove(filename)
 
-    if os.path.exists(filename):
-        os.remove(filename)
+    def test_model_pickling_binary(self):
+        params = {
+            'nthread': 1,
+            'tree_method': 'hist'
+        }
+        self.run_model_pickling(params)
+
+    def test_model_pickling_json(self):
+        params = {
+            'nthread': 1,
+            'tree_method': 'hist',
+            'enable_experimental_json_serialization': True
+        }
+        self.run_model_pickling(params)
diff --git a/tests/python/test_training_continuation.py b/tests/python/test_training_continuation.py
index 98a931bcd089..5ebb11445c3e 100644
--- a/tests/python/test_training_continuation.py
+++ b/tests/python/test_training_continuation.py
@@ -10,26 +10,35 @@ class TestTrainingContinuation(unittest.TestCase):
     num_parallel_tree = 3
 
-    xgb_params_01 = {
-        'verbosity': 0,
-        'nthread': 1,
-    }
-
-    xgb_params_02 = {
-        'verbosity': 0,
-        'nthread': 1,
-        'num_parallel_tree': num_parallel_tree
-    }
-
-    xgb_params_03 = {
-        'verbosity': 0,
-        'nthread': 1,
-        'num_class': 5,
-        'num_parallel_tree': num_parallel_tree
-    }
-
-    @pytest.mark.skipif(**tm.no_sklearn())
-    def test_training_continuation(self):
+    def generate_parameters(self, use_json):
+        xgb_params_01_binary = {
+            'nthread': 1,
+        }
+
+        xgb_params_02_binary = {
+            'nthread': 1,
+            'num_parallel_tree': self.num_parallel_tree
+        }
+
+        xgb_params_03_binary = {
+            'nthread': 1,
+            'num_class': 5,
+            'num_parallel_tree': self.num_parallel_tree
+        }
+        if use_json:
+            xgb_params_01_binary[
+                'enable_experimental_json_serialization'] = True
+            xgb_params_02_binary[
+                'enable_experimental_json_serialization'] = True
+            xgb_params_03_binary[
+                'enable_experimental_json_serialization'] = True
+
+        return [
+            xgb_params_01_binary, xgb_params_02_binary, xgb_params_03_binary
+        ]
+
+    def run_training_continuation(self, xgb_params_01, xgb_params_02,
+                                  xgb_params_03):
         from sklearn.datasets import load_digits
         from sklearn.metrics import mean_squared_error
 
@@ -45,18 +54,18 @@ def test_training_continuation(self):
         dtrain_2class = xgb.DMatrix(X_2class, label=y_2class)
         dtrain_5class = xgb.DMatrix(X_5class, label=y_5class)
 
-        gbdt_01 = xgb.train(self.xgb_params_01, dtrain_2class,
+        gbdt_01 = xgb.train(xgb_params_01, dtrain_2class,
                             num_boost_round=10)
         ntrees_01 = len(gbdt_01.get_dump())
         assert ntrees_01 == 10
 
-        gbdt_02 = xgb.train(self.xgb_params_01, dtrain_2class,
+        gbdt_02 = xgb.train(xgb_params_01, dtrain_2class,
                             num_boost_round=0)
        gbdt_02.save_model('xgb_tc.model')
 
-        gbdt_02a = xgb.train(self.xgb_params_01, dtrain_2class,
+        gbdt_02a = xgb.train(xgb_params_01, dtrain_2class,
                              num_boost_round=10, xgb_model=gbdt_02)
-        gbdt_02b = xgb.train(self.xgb_params_01, dtrain_2class,
+        gbdt_02b = xgb.train(xgb_params_01, dtrain_2class,
                              num_boost_round=10, xgb_model="xgb_tc.model")
         ntrees_02a = len(gbdt_02a.get_dump())
         ntrees_02b = len(gbdt_02b.get_dump())
@@ -71,13 +80,13 @@ def test_training_continuation(self):
         res2 = mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))
         assert res1 == res2
 
-        gbdt_03 = xgb.train(self.xgb_params_01, dtrain_2class,
+        gbdt_03 = xgb.train(xgb_params_01, dtrain_2class,
                             num_boost_round=3)
         gbdt_03.save_model('xgb_tc.model')
 
-        gbdt_03a = xgb.train(self.xgb_params_01, dtrain_2class,
+        gbdt_03a = xgb.train(xgb_params_01, dtrain_2class,
                              num_boost_round=7, xgb_model=gbdt_03)
-        gbdt_03b = xgb.train(self.xgb_params_01, dtrain_2class,
+        gbdt_03b = xgb.train(xgb_params_01, dtrain_2class,
                              num_boost_round=7, xgb_model="xgb_tc.model")
         ntrees_03a = len(gbdt_03a.get_dump())
         ntrees_03b = len(gbdt_03b.get_dump())
@@ -88,7 +97,7 @@ def test_training_continuation(self):
         res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
         assert res1 == res2
 
-        gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class,
+        gbdt_04 = xgb.train(xgb_params_02, dtrain_2class,
                             num_boost_round=3)
         assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree
 
@@ -100,7 +109,7 @@ def test_training_continuation(self):
                                  ntree_limit=gbdt_04.best_ntree_limit))
         assert res1 == res2
 
-        gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class,
+        gbdt_04 = xgb.train(xgb_params_02, dtrain_2class,
                             num_boost_round=7, xgb_model=gbdt_04)
         assert gbdt_04.best_ntree_limit == (
             gbdt_04.best_iteration + 1) * self.num_parallel_tree
@@ -112,11 +121,11 @@ def test_training_continuation(self):
                                  ntree_limit=gbdt_04.best_ntree_limit))
         assert res1 == res2
 
-        gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class,
+        gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
                             num_boost_round=7)
         assert gbdt_05.best_ntree_limit == (
             gbdt_05.best_iteration + 1) * self.num_parallel_tree
-        gbdt_05 = xgb.train(self.xgb_params_03,
+        gbdt_05 = xgb.train(xgb_params_03,
                             dtrain_5class,
                             num_boost_round=3,
                             xgb_model=gbdt_05)
@@ -127,3 +136,32 @@ def test_training_continuation(self):
         res2 = gbdt_05.predict(dtrain_5class,
                                ntree_limit=gbdt_05.best_ntree_limit)
         np.testing.assert_almost_equal(res1, res2)
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_training_continuation_binary(self):
+        params = self.generate_parameters(False)
+        self.run_training_continuation(params[0], params[1], params[2])
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_training_continuation_json(self):
+        params = self.generate_parameters(True)
+        for p in params:
+            p['enable_experimental_json_serialization'] = True
+        self.run_training_continuation(params[0], params[1], params[2])
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_training_continuation_updaters_binary(self):
+        updaters = 'grow_colmaker,prune,refresh'
+        params = self.generate_parameters(False)
+        for p in params:
+            p['updater'] = updaters
+        self.run_training_continuation(params[0], params[1], params[2])
+
+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_training_continuation_updaters_json(self):
+        # Picked up from R tests.
+        updaters = 'grow_colmaker,prune,refresh'
+        params = self.generate_parameters(True)
+        for p in params:
+            p['updater'] = updaters
+        self.run_training_continuation(params[0], params[1], params[2])
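All of the continuation tests above rely on the same invariant: training 2k rounds in one go, and training k rounds, serialising, and training k more, must produce the same model. Stripped of the test scaffolding, the property looks roughly like this; round counts are illustrative, and exact dump equality assumes deterministic training, which is why the fixtures pin `seed` and `nthread`:

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    X, y = np.random.randn(64, 32), np.random.randn(64)
    dtrain = xgb.DMatrix(X, label=y)
    params = {'tree_method': 'hist', 'nthread': 1, 'seed': 0,
              'enable_experimental_json_serialization': True}

    one_go = xgb.train(params, dtrain, num_boost_round=8)

    half = xgb.train(params, dtrain, num_boost_round=4)
    resumed = xgb.train(params, dtrain, num_boost_round=4, xgb_model=half)

    assert one_go.get_dump(dump_format='json') == \
        resumed.get_dump(dump_format='json')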