Fix discrepancies with XGBRegressor and xgboost > 2 #670

Merged: 9 commits, merged on Dec 16, 2023
Changes from 4 commits
20 changes: 15 additions & 5 deletions .azure-pipelines/linux-conda-CI.yml
@@ -15,13 +15,23 @@ jobs:
   strategy:
     matrix:

+      Python311-1150-RT1163-xgb2-lgbm40:
+        python.version: '3.11'
+        ONNX_PATH: 'onnx==1.15.0'
+        ONNXRT_PATH: 'onnxruntime==1.16.3'
+        COREML_PATH: NONE
+        lightgbm.version: '>=4.0'
+        xgboost.version: '>=2'
+        numpy.version: ''
+        scipy.version: ''
+
       Python311-1150-RT1160-xgb175-lgbm40:
         python.version: '3.11'
         ONNX_PATH: 'onnx==1.15.0'
         ONNXRT_PATH: 'onnxruntime==1.16.2'
         COREML_PATH: NONE
         lightgbm.version: '>=4.0'
-        xgboost.version: '>=1.7.5'
+        xgboost.version: '==1.7.5'
         numpy.version: ''
         scipy.version: ''

@@ -31,7 +41,7 @@ jobs:
         ONNXRT_PATH: 'onnxruntime==1.16.2'
         COREML_PATH: NONE
         lightgbm.version: '>=4.0'
-        xgboost.version: '>=1.7.5'
+        xgboost.version: '==1.7.5'
         numpy.version: ''
         scipy.version: ''

@@ -41,7 +51,7 @@ jobs:
         ONNXRT_PATH: 'onnxruntime==1.15.1'
         COREML_PATH: NONE
         lightgbm.version: '<4.0'
-        xgboost.version: '>=1.7.5'
+        xgboost.version: '==1.7.5'
         numpy.version: ''
         scipy.version: ''

@@ -51,7 +61,7 @@ jobs:
         ONNXRT_PATH: 'onnxruntime==1.14.0'
         COREML_PATH: NONE
         lightgbm.version: '<4.0'
-        xgboost.version: '>=1.7.5'
+        xgboost.version: '==1.7.5'
         numpy.version: ''
         scipy.version: ''

@@ -61,7 +71,7 @@ jobs:
         ONNXRT_PATH: 'onnxruntime==1.15.1'
         COREML_PATH: NONE
         lightgbm.version: '>=4.0'
-        xgboost.version: '>=1.7.5'
+        xgboost.version: '==1.7.5'
         numpy.version: ''
         scipy.version: '==1.8.0'

7 changes: 7 additions & 0 deletions .azure-pipelines/win32-conda-CI.yml
@@ -21,34 +21,39 @@ jobs:
         ONNXRT_PATH: 'onnxruntime==1.16.2'
         COREML_PATH: NONE
         numpy.version: ''
+        xgboost.version: '2.0.2'

       Python311-1141-RT1162:
         python.version: '3.11'
         ONNX_PATH: 'onnx==1.14.1'
         ONNXRT_PATH: 'onnxruntime==1.16.2'
         COREML_PATH: NONE
         numpy.version: ''
+        xgboost.version: '1.7.5'

       Python310-1141-RT1151:
         python.version: '3.10'
         ONNX_PATH: 'onnx==1.14.1'
         ONNXRT_PATH: 'onnxruntime==1.15.1'
         COREML_PATH: NONE
         numpy.version: ''
+        xgboost.version: '1.7.5'

       Python310-1141-RT1140:
         python.version: '3.10'
         ONNX_PATH: 'onnx==1.14.1'
         ONNXRT_PATH: onnxruntime==1.14.0
         COREML_PATH: NONE
         numpy.version: ''
+        xgboost.version: '1.7.5'

       Python39-1141-RT1140:
         python.version: '3.9'
         ONNX_PATH: 'onnx==1.14.1'
         ONNXRT_PATH: onnxruntime==1.14.0
         COREML_PATH: NONE
         numpy.version: ''
+        xgboost.version: '1.7.5'

     maxParallel: 3

@@ -74,6 +79,8 @@ jobs:
   - script: |
       call activate py$(python.version)
       python -m pip install --upgrade scikit-learn
+      python -m pip install --upgrade lightgbm
+      python -m pip install "xgboost==$(xgboost.version)"
     displayName: 'Install scikit-learn'

   - script: |
2 changes: 2 additions & 0 deletions CHANGELOGS.md
@@ -2,6 +2,8 @@

 ## 1.12.0

+* Fix discrepancies with XGBRegressor and xgboost > 2
+  [#670](https://github.com/onnx/onnxmltools/pull/670)
 * Support count:poisson for XGBRegressor
   [#666](https://github.com/onnx/onnxmltools/pull/666)
 * Supports XGBRFClassifier and XGBRFRegressor
19 changes: 15 additions & 4 deletions onnxmltools/convert/xgboost/_parse.py
@@ -69,9 +69,20 @@ def _get_attributes(booster):
     except AttributeError:
         ntrees = trees // num_class if num_class > 0 else trees
     else:
-        trees = len(res)
-        ntrees = booster.best_ntree_limit
-        num_class = trees // ntrees
+        config = json.loads(booster.save_config())["learner"]["learner_model_param"]
+        if "num_class" in config:
+            num_class = int(config["num_class"])
+            ntrees = len(res)
+            num_class = 1
+        else:
+            trees = len(res)
+            if hasattr(booster, "best_ntree_limit"):
+                ntrees = booster.best_ntree_limit
+            elif hasattr(booster, "best_iteration"):
+                ntrees = booster.best_iteration
+            else:
+                raise RuntimeError("Unable to guess the number of classes.")
+            num_class = trees // ntrees
     if num_class == 0:
         raise RuntimeError(
             "Unable to retrieve the number of classes, trees=%d, ntrees=%d."
@@ -137,7 +148,7 @@ def __init__(self, booster):
         self.operator_name = "XGBRegressor"

     def get_xgb_params(self):
-        return self.kwargs
+        return {k: v for k, v in self.kwargs.items() if v is not None}

     def get_booster(self):
         return self.booster_
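Note on the fallback chain above: xgboost 2.x removed `Booster.best_ntree_limit`, so the code now probes `best_ntree_limit`, then `best_iteration`, then gives up. A standalone sketch of the same probing, outside the diff (the toy model and the final `num_class` fallback are illustrative, not part of this PR):

import json
import numpy as np
import xgboost

# Toy regressor; the probing below mirrors the fallback chain in _parse.py.
X, y = np.random.rand(50, 4), np.random.rand(50)
booster = xgboost.train(
    {"objective": "reg:squarederror"}, xgboost.DMatrix(X, label=y), num_boost_round=5
)
trees = len(booster.get_dump(dump_format="json"))  # total trees dumped

if hasattr(booster, "best_ntree_limit"):  # xgboost < 2
    ntrees = booster.best_ntree_limit
elif hasattr(booster, "best_iteration"):  # set by early stopping in xgboost >= 2
    ntrees = booster.best_iteration
else:  # assumption of this sketch: derive a count from the config JSON
    cfg = json.loads(booster.save_config())["learner"]["learner_model_param"]
    ntrees = trees // max(int(cfg["num_class"]), 1)
print(trees, ntrees)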
27 changes: 25 additions & 2 deletions onnxmltools/convert/xgboost/common.py
@@ -3,6 +3,7 @@
 """
 Common function to converters and shape calculators.
 """
+import json


 def get_xgb_params(xgb_node):
@@ -15,8 +16,30 @@ def get_xgb_params(xgb_node):
     else:
         # XGBoost < 0.7
         params = xgb_node.__dict__
+
+    if hasattr(xgb_node, "save_config"):
+        config = json.loads(xgb_node.save_config())
+    else:
+        config = json.loads(xgb_node.get_booster().save_config())
+    num_class = int(config["learner"]["learner_model_param"]["num_class"])
+    params = {k: v for k, v in params.items() if v is not None}
+    params["num_class"] = num_class
     if "n_estimators" not in params and hasattr(xgb_node, "n_estimators"):
         # xgboost >= 1.0.2
-        params["n_estimators"] = xgb_node.n_estimators
+        if xgb_node.n_estimators is not None:
+            params["n_estimators"] = xgb_node.n_estimators
+    if params.get("base_score", None) is None:
+        # xgboost >= 2.0
+        params["base_score"] = float(
+            config["learner"]["learner_model_param"]["base_score"]
+        )
     return params
+
+
+def get_n_estimators_classifier(xgb_node, params, js_trees):
+    if "n_estimators" not in params:
+        config = json.loads(xgb_node.get_booster().save_config())
+        num_class = int(config["learner"]["learner_model_param"]["num_class"])
+        if num_class == 0:
+            return len(js_trees)
+        return len(js_trees) // num_class
+    return params["n_estimators"]
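Both helpers rely on `Booster.save_config()`, which returns the learner configuration as JSON with every value encoded as a string, hence the `int()`/`float()` conversions above. A minimal standalone sketch of that lookup (the toy classifier is illustrative):

import json
import numpy as np
import xgboost

# Inspect the config JSON that the helpers above parse; values arrive
# as strings, so they must be converted before use.
X = np.random.rand(60, 4)
y = np.random.randint(0, 3, 60)
clf = xgboost.XGBClassifier(n_estimators=2, max_depth=2).fit(X, y)

param = json.loads(clf.get_booster().save_config())["learner"]["learner_model_param"]
print(param["num_class"])   # "3" for this model, "0" for regressors
print(param["base_score"])  # e.g. "5E-1"; read back when the sklearn attribute is None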
22 changes: 16 additions & 6 deletions onnxmltools/convert/xgboost/operator_converters/XGBoost.py
@@ -10,7 +10,7 @@
 except ImportError:
     XGBRFClassifier = None
 from ...common._registration import register_converter
-from ..common import get_xgb_params
+from ..common import get_xgb_params, get_n_estimators_classifier


 class XGBConverter:
@@ -293,11 +293,18 @@ def convert(scope, operator, container):
         objective, base_score, js_trees = XGBConverter.common_members(xgb_node, inputs)

         params = XGBConverter.get_xgb_params(xgb_node)
+        n_estimators = get_n_estimators_classifier(xgb_node, params, js_trees)
+
         attr_pairs = XGBClassifierConverter._get_default_tree_attribute_pairs()
         XGBConverter.fill_tree_attributes(
             js_trees, attr_pairs, [1 for _ in js_trees], True
         )
-        ncl = (max(attr_pairs["class_treeids"]) + 1) // params["n_estimators"]
+        if "num_class" in params:
+            ncl = params["num_class"]
+            n_estimators = len(js_trees) // ncl
+        else:
+            ncl = (max(attr_pairs["class_treeids"]) + 1) // n_estimators

         bst = xgb_node.get_booster()
         best_ntree_limit = getattr(bst, "best_ntree_limit", len(js_trees)) * ncl
@@ -310,6 +317,7 @@ def convert(scope, operator, container):

         if len(attr_pairs["class_treeids"]) == 0:
             raise RuntimeError("XGBoost model is empty.")
+
         if ncl <= 1:
             ncl = 2
             if objective != "binary:hinge":
@@ -330,8 +338,10 @@ def convert(scope, operator, container):
         attr_pairs["class_ids"] = [v % ncl for v in attr_pairs["class_treeids"]]

         classes = xgb_node.classes_
-        if np.issubdtype(classes.dtype, np.floating) or np.issubdtype(
-            classes.dtype, np.integer
+        if (
+            np.issubdtype(classes.dtype, np.floating)
+            or np.issubdtype(classes.dtype, np.integer)
+            or np.issubdtype(classes.dtype, np.bool_)
         ):
             attr_pairs["classlabels_int64s"] = classes.astype("int")
         else:
@@ -373,7 +383,7 @@ def convert(scope, operator, container):
                 "Where", [greater, one, zero], operator.output_full_names[1]
             )
         elif objective in ("multi:softprob", "multi:softmax"):
-            ncl = len(js_trees) // params["n_estimators"]
+            ncl = len(js_trees) // n_estimators
             if objective == "multi:softmax":
                 attr_pairs["post_transform"] = "NONE"
             container.add_node(
@@ -385,7 +395,7 @@ def convert(scope, operator, container):
                 **attr_pairs,
             )
         elif objective == "reg:logistic":
-            ncl = len(js_trees) // params["n_estimators"]
+            ncl = len(js_trees) // n_estimators
             if ncl == 1:
                 ncl = 2
             container.add_node(
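Since the point of the fix is that converted models must match xgboost's own predictions, an end-to-end check in the spirit of the test suite is sketched below (the input name, the tolerances, and the plain-tensor probabilities output are assumptions of this sketch, not guaranteed by the PR):

import numpy as np
import xgboost
from onnxmltools import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType
import onnxruntime as rt

# Train a small multiclass model, convert it, and compare probabilities.
X = np.random.rand(100, 4).astype(np.float32)
y = np.random.randint(0, 3, 100)
clf = xgboost.XGBClassifier(n_estimators=4, max_depth=3).fit(X, y)

onx = convert_xgboost(clf, initial_types=[("input", FloatTensorType([None, 4]))])
sess = rt.InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
# Second output assumed here to be the probability tensor.
labels, probas = sess.run(None, {"input": X})
np.testing.assert_allclose(clf.predict_proba(X), probas, rtol=1e-4, atol=1e-4)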
16 changes: 12 additions & 4 deletions onnxmltools/convert/xgboost/shape_calculators/Classifier.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0

+import json
 import numpy as np
 from ...common._registration import register_shape_calculator
 from ...common.utils import check_input_and_output_numbers, check_input_and_output_types
@@ -8,7 +9,7 @@
     Int64TensorType,
     StringTensorType,
 )
-from ..common import get_xgb_params
+from ..common import get_xgb_params, get_n_estimators_classifier


 def calculate_xgboost_classifier_output_shapes(operator):
@@ -22,13 +23,20 @@ def calculate_xgboost_classifier_output_shapes(operator):
     params = get_xgb_params(xgb_node)
     booster = xgb_node.get_booster()
     booster.attributes()
-    ntrees = len(booster.get_dump(with_stats=True, dump_format="json"))
+    js_trees = booster.get_dump(with_stats=True, dump_format="json")
+    ntrees = len(js_trees)
     objective = params["objective"]
+    n_estimators = get_n_estimators_classifier(xgb_node, params, js_trees)
+    config = json.loads(xgb_node.get_booster().save_config())
+    num_class = int(config["learner"]["learner_model_param"]["num_class"])

-    if objective == "binary:logistic":
+    if num_class is not None:
+        ncl = num_class
+        n_estimators = ntrees // ncl
[CodeQL notice — Unused local variable: variable n_estimators is not used.]
+    elif objective == "binary:logistic":
         ncl = 2
     else:
-        ncl = ntrees // params["n_estimators"]
+        ncl = ntrees // n_estimators
     if objective == "reg:logistic" and ncl == 1:
         ncl = 2
     classes = xgb_node.classes_
2 changes: 2 additions & 0 deletions onnxmltools/utils/utils_backend.py
@@ -188,6 +188,8 @@ def compare_outputs(expected, output, **kwargs):
     Disc = kwargs.pop("Disc", False)
     Mism = kwargs.pop("Mism", False)
     Opp = kwargs.pop("Opp", False)
+    if hasattr(expected, "dtype") and expected.dtype == numpy.bool_:
+        expected = expected.astype(numpy.int64)
     if Opp and not NoProb:
         raise ValueError("Opp is only available if NoProb is True")

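This cast lets boolean ground truth compare cleanly against the int64 labels that ONNX tree classifiers emit; a toy illustration of the same normalization (standalone, not the actual test harness):

import numpy

# Boolean targets vs. int64 ONNX labels: equal once normalized.
expected = numpy.array([True, False, True])
output = numpy.array([1, 0, 1], dtype=numpy.int64)

if hasattr(expected, "dtype") and expected.dtype == numpy.bool_:
    expected = expected.astype(numpy.int64)
assert numpy.array_equal(expected, output)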
2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -16,5 +16,5 @@ ruff
 scikit-learn>=1.2.0
 scipy
 wheel
-xgboost==1.7.5
+xgboost
 onnxruntime
1 change: 0 additions & 1 deletion tests/h2o/test_h2o_converters.py
@@ -223,7 +223,6 @@ def test_h2o_classifier_multi_cat(self):
         train, test = _prepare_one_hot("airlines.csv", y)
         gbm = H2OGradientBoostingEstimator(ntrees=8, max_depth=5)
         mojo_path = _make_mojo(gbm, train, y=train.columns.index(y))
-        print("****", mojo_path)
         onnx_model = _convert_mojo(mojo_path)
         self.assertIsNot(onnx_model, None)
         dump_data_and_model(
2 changes: 1 addition & 1 deletion tests/xgboost/test_xgboost_converters.py
@@ -677,5 +677,5 @@ def test_xgb_classifier_hinge(self):


 if __name__ == "__main__":
-    TestXGBoostModels().test_xgb_regressor_poisson()
+    TestXGBoostModels().test_xgb_best_tree_limit()
     unittest.main(verbosity=2)