diff --git a/.github/workflows/ubuntu-test.yml b/.github/workflows/ubuntu-test.yml index 33b57179b..21f0e106c 100644 --- a/.github/workflows/ubuntu-test.yml +++ b/.github/workflows/ubuntu-test.yml @@ -9,7 +9,7 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8] - scikit-learn: [0.21.2, 0.22.2, 0.23.1] + scikit-learn: [0.21.2, 0.22.2, 0.23.1, 0.24] exclude: # no scikit-learn 0.21.2 release for Python 3.8 - python-version: 3.8 scikit-learn: 0.21.2 diff --git a/examples/30_extended/flows_and_runs_tutorial.py b/examples/30_extended/flows_and_runs_tutorial.py index 5e73e7e9a..9f8c89375 100644 --- a/examples/30_extended/flows_and_runs_tutorial.py +++ b/examples/30_extended/flows_and_runs_tutorial.py @@ -8,7 +8,6 @@ # License: BSD 3-Clause import openml -import numpy as np from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree ############################################################################ @@ -54,7 +53,7 @@ task = openml.tasks.get_task(403) # Build any classifier or pipeline -clf = tree.ExtraTreeClassifier() +clf = tree.DecisionTreeClassifier() # Run the flow run = openml.runs.run_model_on_task(clf, task) @@ -83,7 +82,10 @@ # ############################ # # When you need to handle 'dirty' data, build pipelines to model then automatically. -task = openml.tasks.get_task(1) +# To demonstrate this, we use the dataset ``credit-a`` via +# task 96, as it contains both numerical and categorical +# variables and missing values in both. 
+task = openml.tasks.get_task(96) # OpenML helper functions for sklearn can be plugged in directly for complicated pipelines from openml.extensions.sklearn import cat, cont @@ -96,20 +98,14 @@ [ ( "categorical", - pipeline.Pipeline( - [ - ("Imputer", impute.SimpleImputer(strategy="most_frequent")), - ( - "Encoder", - preprocessing.OneHotEncoder( - sparse=False, handle_unknown="ignore" - ), - ), - ] - ), + preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"), cat, # returns the categorical feature indices ), - ("continuous", "passthrough", cont), # returns the numeric feature indices + ( + "continuous", + impute.SimpleImputer(strategy="median"), + cont, + ), # returns the numeric feature indices ] ), ), @@ -146,20 +142,14 @@ [ ( "categorical", - pipeline.Pipeline( - [ - ("Imputer", impute.SimpleImputer(strategy="most_frequent")), - ( - "Encoder", - preprocessing.OneHotEncoder( - sparse=False, handle_unknown="ignore" - ), - ), - ] - ), + preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"), categorical_feature_indices, ), - ("continuous", "passthrough", numeric_feature_indices), + ( + "continuous", + impute.SimpleImputer(strategy="median"), + numeric_feature_indices, + ), ] ), ), @@ -182,7 +172,9 @@ task = openml.tasks.get_task(6) # The following lines can then be executed offline: -run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False, upload_flow=False) +run = openml.runs.run_model_on_task( + pipe, task, avoid_duplicate_runs=False, upload_flow=False, dataset_format="array", +) # The run may be stored offline, and the flow will be stored along with it: run.to_filesystem(directory="myrun") diff --git a/examples/30_extended/run_setup_tutorial.py b/examples/30_extended/run_setup_tutorial.py index afc49a98b..8579d1d38 100644 --- a/examples/30_extended/run_setup_tutorial.py +++ b/examples/30_extended/run_setup_tutorial.py @@ -59,12 +59,9 @@ # easy as you want it to be -cat_imp = make_pipeline( - 
 SimpleImputer(strategy="most_frequent"), - OneHotEncoder(handle_unknown="ignore", sparse=False), - TruncatedSVD(), -) -ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", "passthrough", cont)]) +cat_imp = make_pipeline(OneHotEncoder(handle_unknown="ignore", sparse=False), TruncatedSVD(),) +cont_imp = SimpleImputer(strategy="median") +ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) model_original = Pipeline(steps=[("transform", ct), ("estimator", RandomForestClassifier()),]) # Let's change some hyperparameters. Of course, in any good application we diff --git a/examples/40_paper/2018_neurips_perrone_example.py b/examples/40_paper/2018_neurips_perrone_example.py index 60d212116..5ae339ae2 100644 --- a/examples/40_paper/2018_neurips_perrone_example.py +++ b/examples/40_paper/2018_neurips_perrone_example.py @@ -177,18 +177,14 @@ def list_categorical_attributes(flow_type="svm"): cat_cols = list_categorical_attributes(flow_type=flow_type) num_cols = list(set(X.columns) - set(cat_cols)) -# Missing value imputers -cat_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="None") +# Missing-value imputer for numeric columns num_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=-1) -# Creating the one-hot encoder +# Creating the one-hot encoder for numerical representation of categorical columns enc = OneHotEncoder(handle_unknown="ignore") -# Pipeline to handle categorical column transformations -cat_transforms = Pipeline(steps=[("impute", cat_imputer), ("encode", enc)]) - # Combining column transformers -ct = ColumnTransformer([("cat", cat_transforms, cat_cols), ("num", num_imputer, num_cols)]) +ct = ColumnTransformer([("cat", enc, cat_cols), ("num", num_imputer, num_cols)]) # Creating the full pipeline with the surrogate model clf = RandomForestRegressor(n_estimators=50) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py 
b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index 8ca6f9d45..4cd7b116d 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -189,6 +189,8 @@ def test_serialize_model(self): if LooseVersion(sklearn.__version__) >= "0.22": fixture_parameters.update({"ccp_alpha": "0.0"}) fixture_parameters.move_to_end("ccp_alpha", last=False) + if LooseVersion(sklearn.__version__) >= "0.24": + del fixture_parameters["presort"] structure_fixture = {"sklearn.tree.{}.DecisionTreeClassifier".format(tree_name): []} @@ -1317,12 +1319,18 @@ def test__get_fn_arguments_with_defaults(self): (sklearn.tree.DecisionTreeClassifier.__init__, 14), (sklearn.pipeline.Pipeline.__init__, 2), ] - else: + elif sklearn_version < "0.24": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 18), (sklearn.tree.DecisionTreeClassifier.__init__, 14), (sklearn.pipeline.Pipeline.__init__, 2), ] + else: + fns = [ + (sklearn.ensemble.RandomForestRegressor.__init__, 18), + (sklearn.tree.DecisionTreeClassifier.__init__, 13), + (sklearn.pipeline.Pipeline.__init__, 2), + ] for fn, num_params_with_defaults in fns: defaults, defaultless = self.extension._get_fn_arguments_with_defaults(fn) @@ -1523,7 +1531,7 @@ def test_obtain_parameter_values(self): "bootstrap": [True, False], "criterion": ["gini", "entropy"], }, - cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1), + cv=sklearn.model_selection.StratifiedKFold(n_splits=2, random_state=1, shuffle=True), n_iter=5, ) flow = self.extension.model_to_flow(model) diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 8ebbdef2b..693f5a321 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -325,8 +325,16 @@ def test_get_flow_reinstantiate_model_wrong_version(self): # Note that CI does not test against 0.19.1. 
openml.config.server = self.production_server _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3] - flow = 8175 - expected = "Trying to deserialize a model with dependency" " sklearn==0.19.1 not satisfied." + if sklearn_major > 23: + flow = 18587 # 18687, 18725 --- flows building random forest on >= 0.23 + flow_sklearn_version = "0.23.1" + else: + flow = 8175 + flow_sklearn_version = "0.19.1" + expected = ( + "Trying to deserialize a model with dependency " + "sklearn=={} not satisfied.".format(flow_sklearn_version) + ) self.assertRaisesRegex( ValueError, expected, openml.flows.get_flow, flow_id=flow, reinstantiate=True ) diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index e2a228aee..682359a61 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,6 +1,6 @@ # License: BSD 3-Clause -from openml.testing import TestBase, SimpleImputer, CustomImputer +from openml.testing import TestBase from openml.extensions.sklearn import cat, cont import sklearn @@ -13,8 +13,8 @@ class TestStudyFunctions(TestBase): """Test the example code of Bischl et al. (2018)""" @unittest.skipIf( - LooseVersion(sklearn.__version__) < "0.20", - reason="columntransformer introduction in 0.20.0", + LooseVersion(sklearn.__version__) < "0.24", + reason="columntransformer introduction in 0.24.0", ) def test_Figure1a(self): """Test listing in Figure 1a on a single task and the old OpenML100 study. 
@@ -39,15 +39,14 @@ def test_Figure1a(self): import openml import sklearn.metrics import sklearn.tree + from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline, make_pipeline from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder, StandardScaler benchmark_suite = openml.study.get_study("OpenML100", "tasks") # obtain the benchmark suite - cat_imp = make_pipeline( - SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore") - ) - cont_imp = make_pipeline(CustomImputer(), StandardScaler()) + cat_imp = OneHotEncoder(handle_unknown="ignore") + cont_imp = make_pipeline(SimpleImputer(strategy="median"), StandardScaler()) ct = ColumnTransformer([("cat", cat_imp, cat), ("cont", cont_imp, cont)]) clf = Pipeline( steps=[("preprocess", ct), ("estimator", sklearn.tree.DecisionTreeClassifier())]