From b1f94227b540d19a55c18d210d3da2762f8591d4 Mon Sep 17 00:00:00 2001 From: fis Date: Sun, 21 Jun 2020 20:13:27 +0800 Subject: [PATCH 1/2] Update document for model dump. --- doc/tutorials/saving_model.rst | 31 +++++++++++++------------------ python-package/xgboost/core.py | 25 ++++++++++++++++++------- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst index fc9a9fecdee7..34c1f76f132f 100644 --- a/doc/tutorials/saving_model.rst +++ b/doc/tutorials/saving_model.rst @@ -112,7 +112,7 @@ configuration directly as a JSON string. In Python package: print(config) -or +or in R: .. code-block:: R @@ -158,22 +158,9 @@ Will print out something similiar to (not actual output as it's too long for dem "colsample_bynode": "1", "colsample_bytree": "1", "default_direction": "learn", - "enable_feature_grouping": "0", - "eta": "0.300000012", - "gamma": "0", - "grow_policy": "depthwise", - "interaction_constraints": "", - "lambda": "1", - "learning_rate": "0.300000012", - "max_bin": "256", - "max_conflict_rate": "0", - "max_delta_step": "0", - "max_depth": "6", - "max_leaves": "0", - "max_search_group": "100", - "refresh_leaf": "1", - "sketch_eps": "0.0299999993", - "sketch_ratio": "2", + + ... + "subsample": "1" } } @@ -207,13 +194,21 @@ This way users can study the internal representation more closely. Please note JSON generators make use of locale dependent floating point serialization methods, which is not supported by XGBoost. +************************************************* +Difference between saving model and dumping model +************************************************* + +XGBoost has a function called ``dump_model`` in Booster object, which lets you to export +the model in a readable format like ``text``, ``json`` or ``dot`` (graphviz). The primary +use case for it is for model interpretation or visualization, and is not supposed to be +loaded back to XGBoost. 
+ ************ Future Plans ************ Right now using the JSON format incurs longer serialisation time, we have been working on optimizing the JSON implementation to close the gap between binary format and JSON format. -You can track the progress in `#5046 <https://github.com/dmlc/xgboost/issues/5046>`_. *********** JSON Schema diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index e74d1e2417b0..079e916c3260 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1444,8 +1444,11 @@ def save_model(self, fname): The model is saved in an XGBoost internal format which is universal among the various XGBoost interfaces. Auxiliary attributes of the - Python Booster object (such as feature_names) will not be saved. To - preserve all attributes, pickle the Booster object. + Python Booster object (such as feature_names) will not be saved. See: + + https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html + + for more info. Parameters ---------- @@ -1460,7 +1463,7 @@ def save_model(self, fname): raise TypeError("fname must be a string or os_PathLike") def save_raw(self): - """Save the model to a in memory buffer representation + """Save the model to an in-memory buffer representation instead of a file. Returns ------- @@ -1479,8 +1482,11 @@ def load_model(self, fname): The model is loaded from an XGBoost format which is universal among the various XGBoost interfaces. Auxiliary attributes of the Python Booster - object (such as feature_names) will not be loaded. To preserve all - attributes, pickle the Booster object. + object (such as feature_names) will not be loaded. See: + + https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html + + for more info. Parameters ---------- @@ -1503,7 +1509,9 @@ def load_model(self, fname): raise TypeError('Unknown file type: ', fname) def dump_model(self, fout, fmap='', with_stats=False, dump_format="text"): - """Dump model into a text or JSON file. + """Dump model into a text or JSON file.
Unlike `save_model`, the + output format is primarily used for visualization or interpretation, + hence it's more human readable but cannot be loaded back to XGBoost. Parameters ---------- @@ -1537,7 +1545,9 @@ def dump_model(self, fout, fmap='', with_stats=False, dump_format="text"): fout.close() def get_dump(self, fmap='', with_stats=False, dump_format="text"): - """Returns the model dump as a list of strings. + """Returns the model dump as a list of strings. Unlike `save_model`, the + output format is primarily used for visualization or interpretation, + hence it's more human readable but cannot be loaded back to XGBoost. Parameters ---------- @@ -1547,6 +1557,7 @@ def get_dump(self, fmap='', with_stats=False, dump_format="text"): Controls whether the split statistics are output. dump_format : string, optional Format of model dump. Can be 'text', 'json' or 'dot'. + """ fmap = os_fspath(fmap) length = c_bst_ulong() From da533e22c8e8114a354f227f3e3f59a2a21fe4a8 Mon Sep 17 00:00:00 2001 From: fis Date: Sun, 21 Jun 2020 20:27:51 +0800 Subject: [PATCH 2/2] Schema. --- doc/tutorials/saving_model.rst | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst index 34c1f76f132f..44a85cb7cc30 100644 --- a/doc/tutorials/saving_model.rst +++ b/doc/tutorials/saving_model.rst @@ -201,14 +201,9 @@ Difference between saving model and dumping model XGBoost has a function called ``dump_model`` in Booster object, which lets you to export the model in a readable format like ``text``, ``json`` or ``dot`` (graphviz). The primary use case for it is for model interpretation or visualization, and is not supposed to be -loaded back to XGBoost. - -************ -Future Plans -************ - -Right now using the JSON format incurs longer serialisation time, we have been working on -optimizing the JSON implementation to close the gap between binary format and JSON format. +loaded back to XGBoost. 
The JSON version has a `schema +<https://github.com/dmlc/xgboost/blob/master/doc/dump.schema>`_. See next section for +more info. *********** JSON Schema @@ -224,3 +219,10 @@ leaf directly, instead it saves the weights as a separated array. .. include:: ../model.schema :code: json + +************ +Future Plans +************ + +Right now using the JSON format incurs longer serialisation time, we have been working on +optimizing the JSON implementation to close the gap between binary format and JSON format.