[tests][python-package] change boston dataset to synthetic dataset in tests that don't check score #4895

Merged 2 commits on Dec 20, 2021
18 changes: 9 additions & 9 deletions tests/python_package_test/test_engine.py
@@ -18,7 +18,7 @@

 import lightgbm as lgb

-from .utils import load_boston, load_breast_cancer, load_digits, load_iris
+from .utils import load_boston, load_breast_cancer, load_digits, load_iris, make_synthetic_regression

 decreasing_generator = itertools.count(0, -1)

@@ -731,7 +731,7 @@ def test_continue_train():


 def test_continue_train_reused_dataset():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     params = {
         'objective': 'regression',
         'verbose': -1
@@ -791,7 +791,7 @@ def test_continue_train_multiclass():


 def test_cv():
-    X_train, y_train = load_boston(return_X_y=True)
+    X_train, y_train = make_synthetic_regression()
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     # shuffle = False, override metric in params
@@ -887,7 +887,7 @@ def test_cvbooster():


 def test_feature_name():
-    X_train, y_train = load_boston(return_X_y=True)
+    X_train, y_train = make_synthetic_regression()
     params = {'verbose': -1}
     lgb_train = lgb.Dataset(X_train, y_train)
     feature_names = [f'f_{i}' for i in range(X_train.shape[-1])]
@@ -917,7 +917,7 @@ def test_feature_name_with_non_ascii():

 def test_save_load_copy_pickle():
     def train_and_predict(init_model=None, return_model=False):
-        X, y = load_boston(return_X_y=True)
+        X, y = make_synthetic_regression()
         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
         params = {
             'objective': 'regression',
@@ -2102,7 +2102,7 @@ def test_default_objective_and_metric():

 @pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason='not enough RAM')
 def test_model_size():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
[Collaborator comment on lines 2103 to +2105]
I checked all CI jobs and there are no new skips compared to master due to dataset replacement.
     data = lgb.Dataset(X, y)
     bst = lgb.train({'verbose': -1}, data, num_boost_round=2)
     y_pred = bst.predict(X)
@@ -2515,7 +2515,7 @@ def test_dataset_params_with_reference():

 def test_extra_trees():
     # check extra trees increases regularization
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     lgb_x = lgb.Dataset(X, label=y)
     params = {'objective': 'regression',
               'num_leaves': 32,
@@ -2534,7 +2534,7 @@ def test_extra_trees():

 def test_path_smoothing():
     # check path smoothing increases regularization
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     lgb_x = lgb.Dataset(X, label=y)
     params = {'objective': 'regression',
               'num_leaves': 32,
@@ -2804,7 +2804,7 @@ def inner_test(X, y, params, early_stopping_rounds):
     np.testing.assert_allclose(pred4, pred6)

     # test for regression
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     params = {
         'objective': 'regression',
         'verbose': -1,
15 changes: 8 additions & 7 deletions tests/python_package_test/test_sklearn.py
@@ -18,7 +18,8 @@

 import lightgbm as lgb

-from .utils import load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking
+from .utils import (load_boston, load_breast_cancer, load_digits, load_iris, load_linnerud, make_ranking,
+                    make_synthetic_regression)

 sk_version = parse_version(sk_version)
 if sk_version < parse_version("0.23"):
@@ -184,7 +185,7 @@ def test_eval_at_aliases():

 @pytest.mark.parametrize("custom_objective", [True, False])
 def test_objective_aliases(custom_objective):
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     if custom_objective:
         obj = custom_dummy_obj
@@ -440,7 +441,7 @@ def test_regressor_chain():


 def test_clone_and_property():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
     gbm.fit(X, y)

@@ -458,7 +459,7 @@ def test_clone_and_property():


 def test_joblib():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMRegressor(n_estimators=10, objective=custom_asymmetric_obj,
                             verbose=-1, importance_type='split')
@@ -499,7 +500,7 @@ def test_non_serializable_objects_in_callbacks(tmp_path):
     with pytest.raises(Exception, match="This class in not picklable"):
         joblib.dump(unpicklable_callback, tmp_path / 'tmp.joblib')

-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     gbm = lgb.LGBMRegressor(n_estimators=5)
     gbm.fit(X, y, callbacks=[unpicklable_callback])
     assert gbm.booster_.attr('attr_set_inside_callback') == '40'
@@ -757,7 +758,7 @@ def test_predict_with_params_from_init():


 def test_evaluate_train_set():
-    X, y = load_boston(return_X_y=True)
+    X, y = make_synthetic_regression()
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
     gbm = lgb.LGBMRegressor(n_estimators=10, verbose=-1)
     gbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
@@ -1332,7 +1333,7 @@ def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task
         X, y = load_iris(return_X_y=True)
         model_factory = lgb.LGBMClassifier
     elif task == 'regression':
-        X, y = load_boston(return_X_y=True)
+        X, y = make_synthetic_regression()
         model_factory = lgb.LGBMRegressor
     X = pd.DataFrame(X)
     y_col_array = y.reshape(-1, 1)
5 changes: 5 additions & 0 deletions tests/python_package_test/utils.py
@@ -109,3 +109,8 @@ def make_ranking(n_samples=100, n_features=20, n_informative=5, gmax=2,
         X[:, j] = bias + coef * y_vec

     return X, y_vec, group_id_vec
+
+
+@lru_cache(maxsize=None)
+def make_synthetic_regression(n_samples=100):
+    return sklearn.datasets.make_regression(n_samples, n_features=4, n_informative=2, random_state=42)
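
For reference, a minimal sketch of how a test might consume the new helper. It is illustrative only: the test name and assertion below are not part of this PR, and it assumes the cached helper returns the (X, y) pair produced by sklearn.datasets.make_regression as defined above.

# Illustrative usage of make_synthetic_regression -- not part of this PR's diff.
import lightgbm as lgb

from .utils import make_synthetic_regression


def test_example_regression_smoke():
    # The helper is wrapped in lru_cache and seeded with random_state=42,
    # so every test that calls it sees the same 100 x 4 feature matrix and targets.
    X, y = make_synthetic_regression()
    train_data = lgb.Dataset(X, label=y)
    booster = lgb.train({'objective': 'regression', 'verbose': -1}, train_data, num_boost_round=2)
    assert booster.predict(X).shape == (X.shape[0],)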