From 55d35b6858f3716dc56ac771b9e95b0c7d161587 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 16:24:57 +0100 Subject: [PATCH 01/10] TST make sklearn integration test compatible with 0.24 --- python-package/lightgbm/sklearn.py | 12 ++++++++++-- tests/python_package_test/test_sklearn.py | 23 +++++++---------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 18ddc1640845..36ce396d94ad 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -315,8 +315,16 @@ def __init__(self, boosting_type='gbdt', num_leaves=31, max_depth=-1, self.set_params(**kwargs) def _more_tags(self): - return {'allow_nan': True, - 'X_types': ['2darray', 'sparse', '1dlabels']} + return { + 'allow_nan': True, + 'X_types': ['2darray', 'sparse', '1dlabels'], + '_xfail_checks': { + 'check_no_attributes_set_in_init': + 'scikit-learn incorrectly asserts that private attributes ' + 'cannot be set in __init__: ' + '(see https://github.com/microsoft/LightGBM/issues/2628)' + } + } def get_params(self, deep=True): """Get parameters for this estimator. diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 623f83a517a5..fc9edf2a9579 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -16,8 +16,11 @@ from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split from sklearn.multioutput import (MultiOutputClassifier, ClassifierChain, MultiOutputRegressor, RegressorChain) -from sklearn.utils.estimator_checks import (_yield_all_checks, SkipTest, - check_parameters_default_constructible) +from sklearn.utils.estimator_checks import ( + SkipTest, + check_parameters_default_constructible, + check_estimator, +) from sklearn.utils.validation import check_is_fitted from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud @@ -458,22 +461,10 @@ def test_sklearn_integration(self): # we cannot use `check_estimator` directly since there is no skip test mechanism for name, estimator in ((lgb.sklearn.LGBMClassifier.__name__, lgb.sklearn.LGBMClassifier), (lgb.sklearn.LGBMRegressor.__name__, lgb.sklearn.LGBMRegressor)): - check_parameters_default_constructible(name, estimator) # we cannot leave default params (see https://github.com/microsoft/LightGBM/issues/833) estimator = estimator(min_child_samples=1, min_data_in_bin=1) - for check in _yield_all_checks(name, estimator): - check_name = check.func.__name__ if hasattr(check, 'func') else check.__name__ - if check_name == 'check_estimators_nan_inf': - continue # skip test because LightGBM deals with nan - elif check_name == "check_no_attributes_set_in_init": - # skip test because scikit-learn incorrectly asserts that - # private attributes cannot be set in __init__ - # (see https://github.com/microsoft/LightGBM/issues/2628) - continue - try: - check(name, estimator) - except SkipTest as message: - warnings.warn(message, SkipTestWarning) + check_parameters_default_constructible(name, estimator) + check_estimator(estimator) @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed') def test_pandas_categorical(self): From c38d478d9e88337fdd9e78f5157e29f56b4a0432 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 16:32:13 +0100 Subject: [PATCH 02/10] remove useless import --- tests/python_package_test/test_sklearn.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index fc9edf2a9579..c721f20700ba 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -4,20 +4,17 @@ import math import os import unittest -import warnings import lightgbm as lgb import numpy as np from sklearn import __version__ as sk_version from sklearn.base import clone from sklearn.datasets import load_svmlight_file, make_multilabel_classification -from sklearn.exceptions import SkipTestWarning from sklearn.metrics import log_loss, mean_squared_error from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split from sklearn.multioutput import (MultiOutputClassifier, ClassifierChain, MultiOutputRegressor, RegressorChain) from sklearn.utils.estimator_checks import ( - SkipTest, check_parameters_default_constructible, check_estimator, ) From f3fab612ac56e00f07c6d86d0b6b01155ef9bdae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 16:40:55 +0100 Subject: [PATCH 03/10] remove outdated comment --- tests/python_package_test/test_sklearn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index c721f20700ba..3d5a9f4cd077 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -455,7 +455,6 @@ def test_feature_importances_type(self): # sklearn <0.19 cannot accept instance, but many tests could be passed only with min_data=1 and min_data_in_bin=1 @unittest.skipIf(sk_version < '0.19.0', 'scikit-learn version is less than 0.19') def test_sklearn_integration(self): - # we cannot use `check_estimator` directly since there is no skip test mechanism for name, estimator in ((lgb.sklearn.LGBMClassifier.__name__, lgb.sklearn.LGBMClassifier), (lgb.sklearn.LGBMRegressor.__name__, lgb.sklearn.LGBMRegressor)): # we cannot leave default params (see https://github.com/microsoft/LightGBM/issues/833) From 93be6f41f0118a0b9194ad78fe5a1274bc4f327e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 16:43:04 +0100 Subject: [PATCH 04/10] order import --- tests/python_package_test/test_sklearn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 3d5a9f4cd077..f8a8985e1db4 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -15,8 +15,8 @@ from sklearn.multioutput import (MultiOutputClassifier, ClassifierChain, MultiOutputRegressor, RegressorChain) from sklearn.utils.estimator_checks import ( - check_parameters_default_constructible, check_estimator, + check_parameters_default_constructible, ) from sklearn.utils.validation import check_is_fitted From d1b80f3c5a2583b34972ab0a2e4dfe820e3593b3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 21:11:31 +0100 Subject: [PATCH 05/10] use parametrize_with_checks --- python-package/lightgbm/sklearn.py | 7 +++-- tests/python_package_test/test_sklearn.py | 34 +++++++++++++++-------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 36ce396d94ad..35501993bf94 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -320,9 +320,10 @@ def _more_tags(self): 'X_types': ['2darray', 'sparse', '1dlabels'], '_xfail_checks': { 'check_no_attributes_set_in_init': - 'scikit-learn incorrectly asserts that private attributes ' - 'cannot be set in __init__: ' - '(see https://github.com/microsoft/LightGBM/issues/2628)' + 'xxx' + # 'scikit-learn incorrectly asserts that private attributes ' + # 'cannot be set in __init__: ' + # '(see https://github.com/microsoft/LightGBM/issues/2628)' } } diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index f8a8985e1db4..8fcd23a59b43 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -7,6 +7,7 @@ import lightgbm as lgb import numpy as np +import pytest from sklearn import __version__ as sk_version from sklearn.base import clone from sklearn.datasets import load_svmlight_file, make_multilabel_classification @@ -15,8 +16,8 @@ from sklearn.multioutput import (MultiOutputClassifier, ClassifierChain, MultiOutputRegressor, RegressorChain) from sklearn.utils.estimator_checks import ( - check_estimator, check_parameters_default_constructible, + parametrize_with_checks, ) from sklearn.utils.validation import check_is_fitted @@ -452,16 +453,6 @@ def test_feature_importances_type(self): importance_gain_top1 = sorted(importances_gain, reverse=True)[0] self.assertNotEqual(importance_split_top1, importance_gain_top1) - # sklearn <0.19 cannot accept instance, but many tests could be passed only with min_data=1 and min_data_in_bin=1 - @unittest.skipIf(sk_version < '0.19.0', 'scikit-learn version is less than 0.19') - def test_sklearn_integration(self): - for name, estimator in ((lgb.sklearn.LGBMClassifier.__name__, lgb.sklearn.LGBMClassifier), - (lgb.sklearn.LGBMRegressor.__name__, lgb.sklearn.LGBMRegressor)): - # we cannot leave default params (see https://github.com/microsoft/LightGBM/issues/833) - estimator = estimator(min_child_samples=1, min_data_in_bin=1) - check_parameters_default_constructible(name, estimator) - check_estimator(estimator) - @unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed') def test_pandas_categorical(self): import pandas as pd @@ -1136,3 +1127,24 @@ def test_check_is_fitted(self): rnk.fit(X, y, group=np.ones(X.shape[0])) for model in models: check_is_fitted(model) + + +def _tested_estimators(): + for Estimator in [lgb.sklearn.LGBMClassifier, lgb.sklearn.LGBMRegressor]: + yield Estimator() + + +@pytest.mark.skipif( + sk_version < "0.23.0", reason="scikit-learn version is less than 0.23" +) +@parametrize_with_checks(list(_tested_estimators())) +def test_sklearn_integration(estimator, check, request): + estimator.set_params(min_child_samples=1, min_data_in_bin=1) + check(estimator) + + +@pytest.mark.parametrize("estimator", list(_tested_estimators())) +def test_parameters_default_constructible(estimator): + name, Estimator = estimator.__class__.__name__, estimator.__class__ + # Test that estimators are default-constructible + check_parameters_default_constructible(name, Estimator) From debdb08e5eec84480a2b5cb71932578ef3852192 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 21:11:50 +0100 Subject: [PATCH 06/10] change the reason --- python-package/lightgbm/sklearn.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 35501993bf94..36ce396d94ad 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -320,10 +320,9 @@ def _more_tags(self): 'X_types': ['2darray', 'sparse', '1dlabels'], '_xfail_checks': { 'check_no_attributes_set_in_init': - 'xxx' - # 'scikit-learn incorrectly asserts that private attributes ' - # 'cannot be set in __init__: ' - # '(see https://github.com/microsoft/LightGBM/issues/2628)' + 'scikit-learn incorrectly asserts that private attributes ' + 'cannot be set in __init__: ' + '(see https://github.com/microsoft/LightGBM/issues/2628)' } } From 8a1cc9ddd8e1c004b2855fba906d02c6f4b38516 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 6 Nov 2020 21:21:06 +0100 Subject: [PATCH 07/10] skip constructible if != 0.23 --- tests/python_package_test/test_sklearn.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 8fcd23a59b43..b0cfc37a5a6d 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -1143,6 +1143,13 @@ def test_sklearn_integration(estimator, check, request): check(estimator) +@pytest.mark.skipif( + (sk_version < "0.23.0") or (sk_version >= "0.24.0"), + reason=( + "Default constructible check is included in the common check from " + "sklearn 0.24" + ) +) @pytest.mark.parametrize("estimator", list(_tested_estimators())) def test_parameters_default_constructible(estimator): name, Estimator = estimator.__class__.__name__, estimator.__class__ From a8d74732ef490c10ad2bb6b2d9de5c84a9fadada Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 7 Nov 2020 12:43:56 +0100 Subject: [PATCH 08/10] make tests behave the same across sklearn version --- python-package/lightgbm/sklearn.py | 2 +- tests/python_package_test/test_sklearn.py | 76 ++++++++++++++++------- 2 files changed, 54 insertions(+), 24 deletions(-) diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index 36ce396d94ad..ed3ccea6c238 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -320,7 +320,7 @@ def _more_tags(self): 'X_types': ['2darray', 'sparse', '1dlabels'], '_xfail_checks': { 'check_no_attributes_set_in_init': - 'scikit-learn incorrectly asserts that private attributes ' + 'scikit-learn incorrectly asserts that private attributes ' 'cannot be set in __init__: ' '(see https://github.com/microsoft/LightGBM/issues/2628)' } diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index b0cfc37a5a6d..1bdec4b61a7b 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -15,12 +15,24 @@ from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split from sklearn.multioutput import (MultiOutputClassifier, ClassifierChain, MultiOutputRegressor, RegressorChain) -from sklearn.utils.estimator_checks import ( - check_parameters_default_constructible, - parametrize_with_checks, -) from sklearn.utils.validation import check_is_fitted +from sklearn.utils.estimator_checks import check_parameters_default_constructible + +try: + from pkg_resources import parse_version # type: ignore +except ImportError: + # setuptools not installed + parse_version = LooseVersion # type: ignore + +sk_version = parse_version(sk_version) +if sk_version < parse_version("0.23"): + import warnings + from sklearn.exceptions import SkipTestWarning + from sklearn.utils.estimator_checks import _yield_all_checks, SkipTest +else: + from sklearn.utils.estimator_checks import parametrize_with_checks + from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud @@ -169,7 +181,7 @@ def test_dart(self): self.assertLessEqual(score, 1.) # sklearn <0.23 does not have a stacking classifier and n_features_in_ property - @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') + @unittest.skipIf(sk_version < parse_version("0.23"), 'scikit-learn version is less than 0.23') def test_stacking_classifier(self): from sklearn.ensemble import StackingClassifier @@ -196,7 +208,7 @@ def test_stacking_classifier(self): self.assertTrue(all(classes)) # sklearn <0.23 does not have a stacking regressor and n_features_in_ property - @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') + @unittest.skipIf(sk_version < parse_version('0.23'), 'scikit-learn version is less than 0.23') def test_stacking_regressor(self): from sklearn.ensemble import StackingRegressor @@ -281,7 +293,7 @@ def test_random_search(self): self.assertLessEqual(score, 1.) # sklearn < 0.22 does not have the post fit attribute: classes_ - @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') + @unittest.skipIf(sk_version < parse_version('0.22'), 'scikit-learn version is less than 0.22') def test_multioutput_classifier(self): n_outputs = 3 X, y = make_multilabel_classification(n_samples=100, n_features=20, @@ -301,7 +313,7 @@ def test_multioutput_classifier(self): self.assertIsInstance(classifier.booster_, lgb.Booster) # sklearn < 0.23 does not have as_frame parameter - @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') + @unittest.skipIf(sk_version < parse_version('0.23'), 'scikit-learn version is less than 0.23') def test_multioutput_regressor(self): bunch = load_linnerud(as_frame=True) # returns a Bunch instance X, y = bunch['data'], bunch['target'] @@ -318,7 +330,7 @@ def test_multioutput_regressor(self): self.assertIsInstance(regressor.booster_, lgb.Booster) # sklearn < 0.22 does not have the post fit attribute: classes_ - @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') + @unittest.skipIf(sk_version < parse_version('0.22'), 'scikit-learn version is less than 0.22') def test_classifier_chain(self): n_outputs = 3 X, y = make_multilabel_classification(n_samples=100, n_features=20, @@ -340,7 +352,7 @@ def test_classifier_chain(self): self.assertIsInstance(classifier.booster_, lgb.Booster) # sklearn < 0.23 does not have as_frame parameter - @unittest.skipIf(sk_version < '0.23.0', 'scikit-learn version is less than 0.23') + @unittest.skipIf(sk_version < parse_version('0.23'), 'scikit-learn version is less than 0.23') def test_regressor_chain(self): bunch = load_linnerud(as_frame=True) # returns a Bunch instance X, y = bunch['data'], bunch['target'] @@ -1109,7 +1121,7 @@ def test_continue_training_with_model(self): init_gbm.evals_result_['valid_0']['multi_logloss'][-1]) # sklearn < 0.22 requires passing "attributes" argument - @unittest.skipIf(sk_version < '0.22.0', 'scikit-learn version is less than 0.22') + @unittest.skipIf(sk_version < parse_version('0.22'), 'scikit-learn version is less than 0.22') def test_check_is_fitted(self): X, y = load_digits(n_class=2, return_X_y=True) est = lgb.LGBMModel(n_estimators=5, objective="binary") @@ -1134,21 +1146,39 @@ def _tested_estimators(): yield Estimator() -@pytest.mark.skipif( - sk_version < "0.23.0", reason="scikit-learn version is less than 0.23" -) -@parametrize_with_checks(list(_tested_estimators())) -def test_sklearn_integration(estimator, check, request): - estimator.set_params(min_child_samples=1, min_data_in_bin=1) - check(estimator) +if sk_version < parse_version("0.23"): + def _generate_checks_per_estimator(check_generator, estimators): + for estimator in estimators: + name = estimator.__class__.__name__ + for check in check_generator(name, estimator): + yield estimator, check + + @pytest.mark.skipif( + sk_version < parse_version("0.21"), reason="scikit-learn version is less than 0.21" + ) + @pytest.mark.parametrize( + "estimator, check", + _generate_checks_per_estimator(_yield_all_checks, _tested_estimators()), + ) + def test_sklearn_integration(estimator, check): + xfail_checks = estimator._get_tags()["_xfail_checks"] + check_name = check.__name__ if hasattr(check, "__name__") else check.func.__name__ + if xfail_checks and check_name in xfail_checks: + warnings.warn(xfail_checks[check_name], SkipTestWarning) + raise SkipTest + estimator.set_params(min_child_samples=1, min_data_in_bin=1) + name = estimator.__class__.__name__ + check(name, estimator) +else: + @parametrize_with_checks(list(_tested_estimators())) + def test_sklearn_integration(estimator, check, request): + estimator.set_params(min_child_samples=1, min_data_in_bin=1) + check(estimator) @pytest.mark.skipif( - (sk_version < "0.23.0") or (sk_version >= "0.24.0"), - reason=( - "Default constructible check is included in the common check from " - "sklearn 0.24" - ) + not (sk_version < parse_version("0.24")), + reason="Default constructible check included in common check from 0.24" ) @pytest.mark.parametrize("estimator", list(_tested_estimators())) def test_parameters_default_constructible(estimator): From 6ecf05dafb4e44bb89c84b347d2bc652c7169e57 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 7 Nov 2020 13:39:39 +0100 Subject: [PATCH 09/10] linter --- tests/python_package_test/test_sklearn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 1bdec4b61a7b..5d9edc935088 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -25,6 +25,8 @@ # setuptools not installed parse_version = LooseVersion # type: ignore +from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud + sk_version = parse_version(sk_version) if sk_version < parse_version("0.23"): import warnings @@ -33,8 +35,6 @@ else: from sklearn.utils.estimator_checks import parametrize_with_checks -from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud - decreasing_generator = itertools.count(0, -1) From 3e984b4bb300fffa01b7def2bea0280b3acdeb6c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 8 Nov 2020 11:24:46 +0100 Subject: [PATCH 10/10] address suggestions --- tests/python_package_test/test_sklearn.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index 5d9edc935088..6163f5cb8682 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -8,23 +8,17 @@ import lightgbm as lgb import numpy as np import pytest +from pkg_resources import parse_version from sklearn import __version__ as sk_version from sklearn.base import clone from sklearn.datasets import load_svmlight_file, make_multilabel_classification +from sklearn.utils.estimator_checks import check_parameters_default_constructible from sklearn.metrics import log_loss, mean_squared_error from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split from sklearn.multioutput import (MultiOutputClassifier, ClassifierChain, MultiOutputRegressor, RegressorChain) from sklearn.utils.validation import check_is_fitted -from sklearn.utils.estimator_checks import check_parameters_default_constructible - -try: - from pkg_resources import parse_version # type: ignore -except ImportError: - # setuptools not installed - parse_version = LooseVersion # type: ignore - from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud sk_version = parse_version(sk_version) @@ -35,7 +29,6 @@ else: from sklearn.utils.estimator_checks import parametrize_with_checks - decreasing_generator = itertools.count(0, -1) @@ -1177,7 +1170,7 @@ def test_sklearn_integration(estimator, check, request): @pytest.mark.skipif( - not (sk_version < parse_version("0.24")), + sk_version >= parse_version("0.24"), reason="Default constructible check included in common check from 0.24" ) @pytest.mark.parametrize("estimator", list(_tested_estimators()))