From 83120185881ef11ab01d9a817a458493877fca2f Mon Sep 17 00:00:00 2001 From: Tim Head Date: Fri, 22 Nov 2024 14:09:39 +0000 Subject: [PATCH 01/11] Prepare for n_init=auto in KMeans This adds a warning for the upcoming switch from n_init=1 to 'auto'. This adds the possibility to use 'auto', which helps being compatible with scikit-learn. --- python/cuml/cuml/cluster/kmeans.pyx | 30 ++++++++++++++++++++++++--- python/cuml/cuml/tests/test_kmeans.py | 18 ++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx index 9ba1cb710a..85b2336a6c 100644 --- a/python/cuml/cuml/cluster/kmeans.pyx +++ b/python/cuml/cuml/cluster/kmeans.pyx @@ -16,6 +16,8 @@ # distutils: language = c++ +import warnings + from cuml.internals.safe_imports import cpu_only_import np = cpu_only_import('numpy') from cuml.internals.safe_imports import gpu_only_import @@ -141,10 +143,17 @@ class KMeans(UniversalBase, - If an ndarray is passed, it should be of shape (`n_clusters`, `n_features`) and gives the initial centers. - n_init: int (default = 1) + n_init: 'auto' or int (default = 1) Number of instances the k-means algorithm will be called with different seeds. The final results will be from the instance that produces lowest inertia out of n_init instances. + + .. versionadded:: 25.02 + Added 'auto' option for `n_init`. + + .. versionchanged:: 25.XX + Default value for `n_init` will change from 1 to `'auto'` in version 25.XX. + oversampling_factor : float64 (default = 2.0) The amount of points to sample in scalable k-means++ initialization for potential centroids. 
@@ -211,7 +220,22 @@ class KMeans(UniversalBase, params.metric = CuvsDistanceType.L2Expanded # distance metric as squared L2: @todo - support other metrics # noqa: E501 params.batch_samples = self.max_samples_per_batch params.oversampling_factor = self.oversampling_factor - params.n_init = self.n_init + n_init = self.n_init + if n_init == "warn": + warnings.warn( + "The default value of `n_init` will change from" + " 1 to 'auto' in 25.XX. Set the value of `n_init`" + " explicitly to suppress this warning.", + FutureWarning, + ) + n_init = 1 + if n_init == "auto": + if self.init in ("k-means||", "scalable-k-means++"): + params.n_init = 1 + else: + params.n_init = 10 + else: + params.n_init = n_init return params ELSE: return None @@ -219,7 +243,7 @@ class KMeans(UniversalBase, @device_interop_preparation def __init__(self, *, handle=None, n_clusters=8, max_iter=300, tol=1e-4, verbose=False, random_state=1, - init='scalable-k-means++', n_init=1, oversampling_factor=2.0, + init='scalable-k-means++', n_init="warn", oversampling_factor=2.0, max_samples_per_batch=1<<15, convert_dtype=True, output_type=None): super().__init__(handle=handle, diff --git a/python/cuml/cuml/tests/test_kmeans.py b/python/cuml/cuml/tests/test_kmeans.py index b05a762177..7971922f1c 100644 --- a/python/cuml/cuml/tests/test_kmeans.py +++ b/python/cuml/cuml/tests/test_kmeans.py @@ -67,6 +67,24 @@ def random_state(): return random_state +def test_n_init_deprecation(): + X, y = make_blobs( + random_state=0, + ) + + # Warn about default changing + kmeans = cuml.KMeans() + with pytest.warns( + FutureWarning, match="The default value of `n_init` will change from" + ): + kmeans.fit(X) + + # No warning when explicitly set to integer or 'auto' + for n_init in ("auto", 2): + kmeans = cuml.KMeans(n_init=n_init) + kmeans.fit(X) + + @pytest.mark.xfail def test_n_init_cluster_consistency(random_state): From 06ba294e263f65799c3426da11b29310966338cc Mon Sep 17 00:00:00 2001 From: Tim Head Date: Fri, 22 Nov 2024 
15:43:39 +0000 Subject: [PATCH 02/11] Fix KMeans tests for upcoming deprecation --- python/cuml/cuml/tests/test_kmeans.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/cuml/cuml/tests/test_kmeans.py b/python/cuml/cuml/tests/test_kmeans.py index 7971922f1c..e2efe8ff42 100644 --- a/python/cuml/cuml/tests/test_kmeans.py +++ b/python/cuml/cuml/tests/test_kmeans.py @@ -145,7 +145,9 @@ def test_traditional_kmeans_plus_plus_init( cuml_kmeans.fit(X) cu_score = cuml_kmeans.score(X) - kmeans = cluster.KMeans(random_state=random_state, n_clusters=nclusters) + kmeans = cluster.KMeans( + random_state=random_state, n_clusters=nclusters, n_init=1 + ) kmeans.fit(cp.asnumpy(X)) sk_score = kmeans.score(cp.asnumpy(X)) @@ -185,7 +187,9 @@ def test_weighted_kmeans(nrows, ncols, nclusters, max_weight, random_state): cuml_kmeans.fit(X, sample_weight=wt) cu_score = cuml_kmeans.score(X) - sk_kmeans = cluster.KMeans(random_state=random_state, n_clusters=nclusters) + sk_kmeans = cluster.KMeans( + random_state=random_state, n_clusters=nclusters, n_init=1 + ) sk_kmeans.fit(cp.asnumpy(X), sample_weight=wt) sk_score = sk_kmeans.score(cp.asnumpy(X)) @@ -214,6 +218,7 @@ def test_kmeans_clusters_blobs( n_clusters=nclusters, random_state=random_state, output_type="numpy", + n_init=1, ) preds = cuml_kmeans.fit_predict(X) @@ -345,6 +350,7 @@ def test_all_kmeans_params( oversampling_factor=oversampling_factor, max_samples_per_batch=max_samples_per_batch, output_type="cupy", + n_init=1, ) cuml_kmeans.fit_predict(X) @@ -373,6 +379,7 @@ def test_score(nrows, ncols, nclusters, random_state): n_clusters=nclusters, random_state=random_state, output_type="numpy", + n_init=1, ) cuml_kmeans.fit(X) From 5c27e8a412f7f0f7199c221137a07c7f5ecfa1e1 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Fri, 22 Nov 2024 16:40:46 +0000 Subject: [PATCH 03/11] Set 25.04 as change over version --- python/cuml/cuml/cluster/kmeans.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) 
diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx index 85b2336a6c..6421de504e 100644 --- a/python/cuml/cuml/cluster/kmeans.pyx +++ b/python/cuml/cuml/cluster/kmeans.pyx @@ -148,10 +148,10 @@ class KMeans(UniversalBase, different seeds. The final results will be from the instance that produces lowest inertia out of n_init instances. - .. versionadded:: 25.02 + .. versionadded:: 24.12 Added 'auto' option for `n_init`. - .. versionchanged:: 25.XX + .. versionchanged:: 25.04 Default value for `n_init` will change from 1 to `'auto'` in version 25.XX. oversampling_factor : float64 (default = 2.0) @@ -224,7 +224,7 @@ class KMeans(UniversalBase, if n_init == "warn": warnings.warn( "The default value of `n_init` will change from" - " 1 to 'auto' in 25.XX. Set the value of `n_init`" + " 1 to 'auto' in 25.04. Set the value of `n_init`" " explicitly to suppress this warning.", FutureWarning, ) From c3125df0df7aa2cd7b0bb11f22972a32d357d015 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Fri, 22 Nov 2024 16:43:50 +0000 Subject: [PATCH 04/11] Use 25.05 --- python/cuml/cuml/cluster/kmeans.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx index 6421de504e..334401ae91 100644 --- a/python/cuml/cuml/cluster/kmeans.pyx +++ b/python/cuml/cuml/cluster/kmeans.pyx @@ -151,7 +151,7 @@ class KMeans(UniversalBase, .. versionadded:: 24.12 Added 'auto' option for `n_init`. - .. versionchanged:: 25.04 + .. versionchanged:: 25.05 Default value for `n_init` will change from 1 to `'auto'` in version 25.XX. oversampling_factor : float64 (default = 2.0) @@ -224,7 +224,7 @@ class KMeans(UniversalBase, if n_init == "warn": warnings.warn( "The default value of `n_init` will change from" - " 1 to 'auto' in 25.04. Set the value of `n_init`" + " 1 to 'auto' in 25.05. 
Set the value of `n_init`" " explicitly to suppress this warning.", FutureWarning, ) From 8caef3961e761b53d5f827244c60bf0a2aaa1282 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Mon, 25 Nov 2024 16:41:17 +0000 Subject: [PATCH 05/11] Make the switch in 25.02 --- python/cuml/cuml/cluster/kmeans.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx index 334401ae91..087b337d29 100644 --- a/python/cuml/cuml/cluster/kmeans.pyx +++ b/python/cuml/cuml/cluster/kmeans.pyx @@ -151,8 +151,8 @@ class KMeans(UniversalBase, .. versionadded:: 24.12 Added 'auto' option for `n_init`. - .. versionchanged:: 25.05 - Default value for `n_init` will change from 1 to `'auto'` in version 25.XX. + .. versionchanged:: 25.02 + Default value for `n_init` will change from 1 to `'auto'` in version 25.02. oversampling_factor : float64 (default = 2.0) The amount of points to sample @@ -224,7 +224,7 @@ class KMeans(UniversalBase, if n_init == "warn": warnings.warn( "The default value of `n_init` will change from" - " 1 to 'auto' in 25.05. Set the value of `n_init`" + " 1 to 'auto' in 25.02. Set the value of `n_init`" " explicitly to suppress this warning.", FutureWarning, ) From 52fffe96e2085481d3cdcc70521e28d4ed2ca486 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Tue, 26 Nov 2024 07:05:53 +0000 Subject: [PATCH 06/11] Update dask tests --- .../cuml/cuml/tests/dask/test_dask_kmeans.py | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/python/cuml/cuml/tests/dask/test_dask_kmeans.py b/python/cuml/cuml/tests/dask/test_dask_kmeans.py index 8f00cd51ed..b9b00ab553 100644 --- a/python/cuml/cuml/tests/dask/test_dask_kmeans.py +++ b/python/cuml/cuml/tests/dask/test_dask_kmeans.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -63,7 +63,10 @@ def test_end_to_end( X_train, y_train = X, y cumlModel = cumlKMeans( - init="k-means||", n_clusters=nclusters, random_state=10 + init="k-means||", + n_clusters=nclusters, + random_state=10, + n_init="auto", ) cumlModel.fit(X_train) @@ -120,7 +123,7 @@ def test_large_data_no_overflow(nrows_per_part, ncols, nclusters, client): X.compute_chunk_sizes().persist() cumlModel = cumlKMeans( - init="k-means||", n_clusters=nclusters, random_state=10 + init="k-means||", n_clusters=nclusters, random_state=10, n_init="auto" ) cumlModel.fit(X_train) @@ -171,7 +174,11 @@ def test_weighted_kmeans(nrows, ncols, nclusters, n_parts, client): wt[cp.argmax(cp.array(y.compute()) == i).item()] = 5000.0 cumlModel = cumlKMeans( - verbose=0, init="k-means||", n_clusters=nclusters, random_state=10 + verbose=0, + init="k-means||", + n_clusters=nclusters, + random_state=10, + n_init="auto", ) chunk_parts = int(nrows / n_parts) @@ -237,7 +244,10 @@ def test_transform(nrows, ncols, nclusters, n_parts, input_type, client): labels = cp.squeeze(y_train.compute()) cumlModel = cumlKMeans( - init="k-means||", n_clusters=nclusters, random_state=10 + init="k-means||", + n_clusters=nclusters, + random_state=10, + n_init="auto", ) cumlModel.fit(X_train) @@ -302,7 +312,10 @@ def test_score(nrows, ncols, nclusters, n_parts, input_type, client): X_train, y_train = X, y cumlModel = cumlKMeans( - init="k-means||", n_clusters=nclusters, random_state=10 + init="k-means||", + n_clusters=nclusters, + random_state=10, + n_init="auto", ) cumlModel.fit(X_train) From 13965e3c161d6aee385d1dc78e4259c453fc4554 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Tue, 26 Nov 2024 11:08:19 +0000 Subject: [PATCH 07/11] Fix up tests that raised warnings --- python/cuml/cuml/cluster/kmeans.pyx | 2 +- python/cuml/cuml/tests/test_api.py | 2 ++ 
python/cuml/cuml/tests/test_device_selection.py | 2 +- python/cuml/cuml/tests/test_input_estimators.py | 5 ++++- python/cuml/cuml/tests/test_metrics.py | 2 +- python/cuml/cuml/tests/test_pickle.py | 5 ++++- 6 files changed, 13 insertions(+), 5 deletions(-) diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx index 087b337d29..65d7a15f48 100644 --- a/python/cuml/cuml/cluster/kmeans.pyx +++ b/python/cuml/cuml/cluster/kmeans.pyx @@ -95,7 +95,7 @@ class KMeans(UniversalBase, 3 4.0 3.0 >>> >>> # Calling fit - >>> kmeans_float = KMeans(n_clusters=2) + >>> kmeans_float = KMeans(n_clusters=2, n_init="auto") >>> kmeans_float.fit(b) KMeans() >>> diff --git a/python/cuml/cuml/tests/test_api.py b/python/cuml/cuml/tests/test_api.py index 74adbd177d..ce840199c2 100644 --- a/python/cuml/cuml/tests/test_api.py +++ b/python/cuml/cuml/tests/test_api.py @@ -237,6 +237,8 @@ def test_fit_function(dataset, model_name): model = models[model_name](np.random.normal(0.0, 1.0, (10,))) elif model_name in ["RandomForestClassifier", "RandomForestRegressor"]: model = models[model_name](n_bins=32) + elif model_name == "KMeans": + model = models[model_name](n_init="auto") else: if n_pos_args_constr == 1: model = models[model_name]() diff --git a/python/cuml/cuml/tests/test_device_selection.py b/python/cuml/cuml/tests/test_device_selection.py index 31c0f9aed6..96dc4e72b2 100644 --- a/python/cuml/cuml/tests/test_device_selection.py +++ b/python/cuml/cuml/tests/test_device_selection.py @@ -981,7 +981,7 @@ def test_kmeans_methods(train_device, infer_device): ref_model.fit(X_train_blob) ref_output = ref_model.predict(X_test_blob) - model = KMeans(n_clusters=n_clusters) + model = KMeans(n_clusters=n_clusters, n_init="auto") with using_device_type(train_device): model.fit(X_train_blob) with using_device_type(infer_device): diff --git a/python/cuml/cuml/tests/test_input_estimators.py b/python/cuml/cuml/tests/test_input_estimators.py index 9964b7f0fe..5ac983b916 100644 --- 
a/python/cuml/cuml/tests/test_input_estimators.py +++ b/python/cuml/cuml/tests/test_input_estimators.py @@ -115,7 +115,10 @@ def test_estimators_all_dtypes(model_name, dtype): X_train, y_train, X_test = make_dataset(dtype, nrows, ncols, ninfo) print(model_name) - model = models[model_name]() + if model_name == "KMeans": + model = models[model_name](n_init="auto") + else: + model = models[model_name]() sign = inspect.signature(model.fit) if "y" in sign.parameters: model.fit(X=X_train, y=y_train) diff --git a/python/cuml/cuml/tests/test_metrics.py b/python/cuml/cuml/tests/test_metrics.py index 2189dcdc41..264fc378bd 100644 --- a/python/cuml/cuml/tests/test_metrics.py +++ b/python/cuml/cuml/tests/test_metrics.py @@ -286,7 +286,7 @@ def test_rand_index_score(name, nrows): params = default_base.copy() params.update(pat[1]) - cuml_kmeans = cuml.KMeans(n_clusters=params["n_clusters"]) + cuml_kmeans = cuml.KMeans(n_clusters=params["n_clusters"], n_init="auto") X, y = pat[0] diff --git a/python/cuml/cuml/tests/test_pickle.py b/python/cuml/cuml/tests/test_pickle.py index 598ebbd7e3..e62232a888 100644 --- a/python/cuml/cuml/tests/test_pickle.py +++ b/python/cuml/cuml/tests/test_pickle.py @@ -299,7 +299,10 @@ def test_cluster_pickle(tmpdir, datatype, keys, data_size): def create_mod(): nrows, ncols, n_info = data_size X_train, y_train, X_test = make_dataset(datatype, nrows, ncols, n_info) - model = cluster_models[keys]() + if keys == "KMeans": + model = cluster_models[keys](n_init="auto") + else: + model = cluster_models[keys]() model.fit(X_train) result["cluster"] = model.predict(X_test) return model, X_test From 509853144f4650657fb10ee48ea7133121542ce9 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Thu, 5 Dec 2024 10:40:10 +0000 Subject: [PATCH 08/11] Ping From 998f0b08a13ce91c0d64d378f836ab299cd588ca Mon Sep 17 00:00:00 2001 From: Tim Head Date: Thu, 5 Dec 2024 13:15:42 +0000 Subject: [PATCH 09/11] Deal with deprecations --- python/cuml/cuml/explainer/sampling.py | 7 +++++-- 
1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cuml/cuml/explainer/sampling.py b/python/cuml/cuml/explainer/sampling.py index 2ffe9d0054..63d6e438ae 100644 --- a/python/cuml/cuml/explainer/sampling.py +++ b/python/cuml/cuml/explainer/sampling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -91,7 +91,10 @@ def kmeans_sampling(X, k, round_values=True, detailed=False, random_state=0): X = imp.fit_transform(X) kmeans = KMeans( - n_clusters=k, random_state=random_state, output_type=_output_dtype_str + n_clusters=k, + random_state=random_state, + output_type=_output_dtype_str, + n_init="auto", ).fit(X) if round_values: From 9ffe05156b3f3ee40dab66e0aca3c8eaab8fe312 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Fri, 6 Dec 2024 12:51:08 +0000 Subject: [PATCH 10/11] Use sklearn default values for constructor arguments This way the hyper-parameter translator gets to see all arguments and we avoid deprecation warnings due to mismatches between the sklearn and cuml defaults --- python/cuml/cuml/experimental/accel/estimator_proxy.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cuml/cuml/experimental/accel/estimator_proxy.py b/python/cuml/cuml/experimental/accel/estimator_proxy.py index fcee2f5b37..914b683b4e 100644 --- a/python/cuml/cuml/experimental/accel/estimator_proxy.py +++ b/python/cuml/cuml/experimental/accel/estimator_proxy.py @@ -207,7 +207,13 @@ def __init__(self, *args, **kwargs): self._cpu_model_class = ( original_class_a # Store a reference to the original class ) - kwargs, self._gpuaccel = self._hyperparam_translator(**kwargs) + sklearn_args = inspect.signature(self._cpu_model_class) + sklearn_args = sklearn_args.bind(*args, **kwargs) + sklearn_args.apply_defaults() + + kwargs, self._gpuaccel = 
self._hyperparam_translator( + **sklearn_args.arguments + ) super().__init__(*args, **kwargs) self._cpu_hyperparams = list( From d9a65e3c911f568a62cc9a43926032915b23ab86 Mon Sep 17 00:00:00 2001 From: Tim Head Date: Fri, 6 Dec 2024 13:51:45 +0000 Subject: [PATCH 11/11] Ping