From fd632e6155af182882acead8e7a6202d4e2881eb Mon Sep 17 00:00:00 2001
From: Alex
Date: Fri, 12 Jun 2020 11:11:47 +0800
Subject: [PATCH 1/5] add new attribute for number of features

Fixes issue related to #17353 in scikit-learn.
---
 python-package/xgboost/sklearn.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 49f4dfa451d3..c0e69778befd 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -499,6 +499,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
 
                 [xgb.callback.reset_learning_rate(custom_rates)]
         """
+        self.n_features_in_ = X.shape[1]
         train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing,
@@ -813,6 +814,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
             raise ValueError(
                 'Please reshape the input data X into 2-dimensional matrix.')
         self._features_count = X.shape[1]
+        self.n_features_in_ = self._features_count
         train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing, nthread=self.n_jobs)

From 6e5fac6b7c30b816e880b1e5ba04376fd61bc147 Mon Sep 17 00:00:00 2001
From: a-wozniakowski
Date: Fri, 12 Jun 2020 17:20:06 +0800
Subject: [PATCH 2/5] add new attribute for number of features

add n_features_in_ attribute and stacking tests
---
 python-package/xgboost/sklearn.py |  5 ++++
 tests/python/test_with_sklearn.py | 50 +++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index c0e69778befd..7b72098d660b 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -500,6 +500,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
                 [xgb.callback.reset_learning_rate(custom_rates)]
         """
         self.n_features_in_ = X.shape[1]
+
         train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing,
@@ -813,8 +814,10 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
             # different ways of reshaping
             raise ValueError(
                 'Please reshape the input data X into 2-dimensional matrix.')
+
         self._features_count = X.shape[1]
         self.n_features_in_ = self._features_count
+
         train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing, nthread=self.n_jobs)
@@ -1197,6 +1200,8 @@ def _dmat_init(group, **params):
             ret.set_group(group)
             return ret
 
+        self.n_features_in_ = X.shape[1]
+
         train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing, nthread=self.n_jobs)
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 439d89afe826..af113df075b7 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -115,6 +115,56 @@ def test_ranking():
     np.testing.assert_almost_equal(pred, pred_orig)
 
 
+def test_stacking_regression():
+    from sklearn.model_selection import train_test_split
+    from sklearn.datasets import load_diabetes
+    from sklearn.linear_model import RidgeCV
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.ensemble import StackingRegressor
+
+    X, y = load_diabetes(return_X_y=True)
+    estimators = [
+        ('gbm', xgb.XGBRegressor(objective='reg:squarederror')), ('lr', RidgeCV())
+    ]
+    reg = StackingRegressor(
+        estimators=estimators,
+        final_estimator=RandomForestRegressor(n_estimators=10,
+                                              random_state=42)
+    )
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+    reg.fit(X_train, y_train).score(X_test, y_test)
+
+    # test number of input features
+    assert reg.n_features_in_ == 10
+
+
+def test_stacking_classification():
+    from sklearn.model_selection import train_test_split
+    from sklearn.datasets import load_iris
+    from sklearn.svm import LinearSVC
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.preprocessing import StandardScaler
+    from sklearn.pipeline import make_pipeline
+    from sklearn.ensemble import StackingClassifier
+
+    X, y = load_iris(return_X_y=True)
+    estimators = [
+        ('gbm', xgb.XGBClassifier()),
+        ('svr', make_pipeline(StandardScaler(),
+                              LinearSVC(random_state=42)))
+    ]
+    clf = StackingClassifier(
+        estimators=estimators, final_estimator=LogisticRegression()
+    )
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+    clf.fit(X_train, y_train).score(X_test, y_test)
+
+    # test number of input features
+    assert clf.n_features_in_ == 4
+
+
 @pytest.mark.skipif(**tm.no_pandas())
 def test_feature_importances_weight():
     from sklearn.datasets import load_digits

From 2acea9de99af41622af08487af0dcbe35e7969e4 Mon Sep 17 00:00:00 2001
From: a-wozniakowski
Date: Fri, 12 Jun 2020 17:53:34 +0800
Subject: [PATCH 3/5] add new attribute for number of features

---
 python-package/xgboost/sklearn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 7b72098d660b..ef22f4309ed7 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -500,7 +500,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
                 [xgb.callback.reset_learning_rate(custom_rates)]
         """
         self.n_features_in_ = X.shape[1]
-
+
         train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing,
@@ -814,7 +814,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
             # different ways of reshaping
             raise ValueError(
                 'Please reshape the input data X into 2-dimensional matrix.')
-
+
         self._features_count = X.shape[1]
         self.n_features_in_ = self._features_count
 

From 2ca1f71ddb047f5541f6fed36bdc9376794a3824 Mon Sep 17 00:00:00 2001
From: a-wozniakowski
Date: Fri, 12 Jun 2020 18:50:02 +0800
Subject: [PATCH 4/5] update stacking tests

---
 tests/python/test_with_sklearn.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index af113df075b7..fc052cb57559 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -124,7 +124,8 @@ def test_stacking_regression():
 
     X, y = load_diabetes(return_X_y=True)
     estimators = [
-        ('gbm', xgb.XGBRegressor(objective='reg:squarederror')), ('lr', RidgeCV())
+        ('gbm', xgb.sklearn.XGBRegressor(objective='reg:squarederror')),
+        ('lr', RidgeCV())
     ]
     reg = StackingRegressor(
         estimators=estimators,
@@ -150,7 +151,7 @@ def test_stacking_classification():
 
     X, y = load_iris(return_X_y=True)
     estimators = [
-        ('gbm', xgb.XGBClassifier()),
+        ('gbm', xgb.sklearn.XGBClassifier()),
         ('svr', make_pipeline(StandardScaler(),
                               LinearSVC(random_state=42)))
     ]

From eaf2b3e98f90eb30b0917a295b8568e51a0e56af Mon Sep 17 00:00:00 2001
From: a-wozniakowski
Date: Fri, 12 Jun 2020 19:54:08 +0800
Subject: [PATCH 5/5] update stacking tests

---
 tests/python/test_with_sklearn.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index fc052cb57559..0bb5b3ada000 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -136,9 +136,6 @@ def test_stacking_regression():
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
     reg.fit(X_train, y_train).score(X_test, y_test)
 
-    # test number of input features
-    assert reg.n_features_in_ == 10
-
 
 def test_stacking_classification():
     from sklearn.model_selection import train_test_split
@@ -162,9 +159,6 @@ def test_stacking_classification():
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
     clf.fit(X_train, y_train).score(X_test, y_test)
 
-    # test number of input features
-    assert clf.n_features_in_ == 4
-
 
 @pytest.mark.skipif(**tm.no_pandas())
 def test_feature_importances_weight():
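
For reference, a minimal usage sketch of the attribute this series introduces. It is not part of any patch above; it assumes the xgboost scikit-learn wrapper with the patches applied, and mirrors the check that the earlier test revisions performed explicitly (load_diabetes exposes 10 feature columns):

    import xgboost as xgb
    from sklearn.datasets import load_diabetes

    X, y = load_diabetes(return_X_y=True)

    # fit() records X.shape[1] as n_features_in_, per the patches above,
    # matching the scikit-learn estimator convention.
    reg = xgb.XGBRegressor(objective='reg:squarederror')
    reg.fit(X, y)
    assert reg.n_features_in_ == 10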