From 83120185881ef11ab01d9a817a458493877fca2f Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Fri, 22 Nov 2024 14:09:39 +0000
Subject: [PATCH 01/11] Prepare for n_init=auto in KMeans

This adds a warning for the upcoming switch of the default from n_init=1
to 'auto'. It also adds support for n_init='auto', which improves
compatibility with scikit-learn.
---
 python/cuml/cuml/cluster/kmeans.pyx   | 30 ++++++++++++++++++++++++---
 python/cuml/cuml/tests/test_kmeans.py | 18 ++++++++++++++++
 2 files changed, 45 insertions(+), 3 deletions(-)
diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx
index 9ba1cb710a..85b2336a6c 100644
--- a/python/cuml/cuml/cluster/kmeans.pyx
+++ b/python/cuml/cuml/cluster/kmeans.pyx
@@ -16,6 +16,8 @@
 
 # distutils: language = c++
 
+import warnings
+
 from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
 from cuml.internals.safe_imports import gpu_only_import
@@ -141,10 +143,17 @@ class KMeans(UniversalBase,
          - If an ndarray is passed, it should be of
            shape (`n_clusters`, `n_features`) and gives the initial centers.
 
-    n_init: int (default = 1)
+    n_init: 'auto' or int (default = 1)
         Number of instances the k-means algorithm will be called with
         different seeds. The final results will be from the instance
         that produces lowest inertia out of n_init instances.
+
+        .. versionadded:: 25.02
+           Added 'auto' option for `n_init`.
+
+        .. versionchanged:: 25.XX
+            Default value for `n_init` will change from 1 to `'auto'` in version 25.XX.
+
     oversampling_factor : float64 (default = 2.0)
         The amount of points to sample
         in scalable k-means++ initialization for potential centroids.
@@ -211,7 +220,22 @@ class KMeans(UniversalBase,
             params.metric = CuvsDistanceType.L2Expanded   # distance metric as squared L2: @todo - support other metrics # noqa: E501
             params.batch_samples = <int>self.max_samples_per_batch
             params.oversampling_factor = <double>self.oversampling_factor
-            params.n_init = <int>self.n_init
+            n_init = self.n_init
+            if n_init == "warn":
+                warnings.warn(
+                    "The default value of `n_init` will change from"
+                    " 1 to 'auto' in 25.XX. Set the value of `n_init`"
+                    " explicitly to suppress this warning.",
+                    FutureWarning,
+                )
+                n_init = 1
+            if n_init == "auto":
+                if self.init in ("k-means||", "scalable-k-means++"):
+                    params.n_init = 1
+                else:
+                    params.n_init = 10
+            else:
+                params.n_init = <int>n_init
             return <size_t>params
         ELSE:
             return None
@@ -219,7 +243,7 @@ class KMeans(UniversalBase,
     @device_interop_preparation
     def __init__(self, *, handle=None, n_clusters=8, max_iter=300, tol=1e-4,
                  verbose=False, random_state=1,
-                 init='scalable-k-means++', n_init=1, oversampling_factor=2.0,
+                 init='scalable-k-means++', n_init="warn", oversampling_factor=2.0,
                  max_samples_per_batch=1<<15, convert_dtype=True,
                  output_type=None):
         super().__init__(handle=handle,
diff --git a/python/cuml/cuml/tests/test_kmeans.py b/python/cuml/cuml/tests/test_kmeans.py
index b05a762177..7971922f1c 100644
--- a/python/cuml/cuml/tests/test_kmeans.py
+++ b/python/cuml/cuml/tests/test_kmeans.py
@@ -67,6 +67,24 @@ def random_state():
     return random_state
 
 
+def test_n_init_deprecation():
+    X, y = make_blobs(
+        random_state=0,
+    )
+
+    # Warn about default changing
+    kmeans = cuml.KMeans()
+    with pytest.warns(
+        FutureWarning, match="The default value of `n_init` will change from"
+    ):
+        kmeans.fit(X)
+
+    # No warning when explicitly set to integer or 'auto'
+    for n_init in ("auto", 2):
+        kmeans = cuml.KMeans(n_init=n_init)
+        kmeans.fit(X)
+
+
 @pytest.mark.xfail
 def test_n_init_cluster_consistency(random_state):
 

From 06ba294e263f65799c3426da11b29310966338cc Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Fri, 22 Nov 2024 15:43:39 +0000
Subject: [PATCH 02/11] Fix KMeans tests for upcoming deprecation

---
 python/cuml/cuml/tests/test_kmeans.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/cuml/cuml/tests/test_kmeans.py b/python/cuml/cuml/tests/test_kmeans.py
index 7971922f1c..e2efe8ff42 100644
--- a/python/cuml/cuml/tests/test_kmeans.py
+++ b/python/cuml/cuml/tests/test_kmeans.py
@@ -145,7 +145,9 @@ def test_traditional_kmeans_plus_plus_init(
     cuml_kmeans.fit(X)
     cu_score = cuml_kmeans.score(X)
 
-    kmeans = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
+    kmeans = cluster.KMeans(
+        random_state=random_state, n_clusters=nclusters, n_init=1
+    )
     kmeans.fit(cp.asnumpy(X))
     sk_score = kmeans.score(cp.asnumpy(X))
 
@@ -185,7 +187,9 @@ def test_weighted_kmeans(nrows, ncols, nclusters, max_weight, random_state):
     cuml_kmeans.fit(X, sample_weight=wt)
     cu_score = cuml_kmeans.score(X)
 
-    sk_kmeans = cluster.KMeans(random_state=random_state, n_clusters=nclusters)
+    sk_kmeans = cluster.KMeans(
+        random_state=random_state, n_clusters=nclusters, n_init=1
+    )
     sk_kmeans.fit(cp.asnumpy(X), sample_weight=wt)
     sk_score = sk_kmeans.score(cp.asnumpy(X))
 
@@ -214,6 +218,7 @@ def test_kmeans_clusters_blobs(
         n_clusters=nclusters,
         random_state=random_state,
         output_type="numpy",
+        n_init=1,
     )
 
     preds = cuml_kmeans.fit_predict(X)
@@ -345,6 +350,7 @@ def test_all_kmeans_params(
         oversampling_factor=oversampling_factor,
         max_samples_per_batch=max_samples_per_batch,
         output_type="cupy",
+        n_init=1,
     )
 
     cuml_kmeans.fit_predict(X)
@@ -373,6 +379,7 @@ def test_score(nrows, ncols, nclusters, random_state):
         n_clusters=nclusters,
         random_state=random_state,
         output_type="numpy",
+        n_init=1,
     )
 
     cuml_kmeans.fit(X)

From 5c27e8a412f7f0f7199c221137a07c7f5ecfa1e1 Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Fri, 22 Nov 2024 16:40:46 +0000
Subject: [PATCH 03/11] Set 25.04 as change over version

---
 python/cuml/cuml/cluster/kmeans.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx
index 85b2336a6c..6421de504e 100644
--- a/python/cuml/cuml/cluster/kmeans.pyx
+++ b/python/cuml/cuml/cluster/kmeans.pyx
@@ -148,10 +148,10 @@ class KMeans(UniversalBase,
         different seeds. The final results will be from the instance
         that produces lowest inertia out of n_init instances.
 
-        .. versionadded:: 25.02
+        .. versionadded:: 24.12
            Added 'auto' option for `n_init`.
 
-        .. versionchanged:: 25.XX
+        .. versionchanged:: 25.04
             Default value for `n_init` will change from 1 to `'auto'` in version 25.XX.
 
     oversampling_factor : float64 (default = 2.0)
@@ -224,7 +224,7 @@ class KMeans(UniversalBase,
             if n_init == "warn":
                 warnings.warn(
                     "The default value of `n_init` will change from"
-                    " 1 to 'auto' in 25.XX. Set the value of `n_init`"
+                    " 1 to 'auto' in 25.04. Set the value of `n_init`"
                     " explicitly to suppress this warning.",
                     FutureWarning,
                 )

From c3125df0df7aa2cd7b0bb11f22972a32d357d015 Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Fri, 22 Nov 2024 16:43:50 +0000
Subject: [PATCH 04/11] Use 25.05

---
 python/cuml/cuml/cluster/kmeans.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx
index 6421de504e..334401ae91 100644
--- a/python/cuml/cuml/cluster/kmeans.pyx
+++ b/python/cuml/cuml/cluster/kmeans.pyx
@@ -151,7 +151,7 @@ class KMeans(UniversalBase,
         .. versionadded:: 24.12
            Added 'auto' option for `n_init`.
 
-        .. versionchanged:: 25.04
+        .. versionchanged:: 25.05
             Default value for `n_init` will change from 1 to `'auto'` in version 25.XX.
 
     oversampling_factor : float64 (default = 2.0)
@@ -224,7 +224,7 @@ class KMeans(UniversalBase,
             if n_init == "warn":
                 warnings.warn(
                     "The default value of `n_init` will change from"
-                    " 1 to 'auto' in 25.04. Set the value of `n_init`"
+                    " 1 to 'auto' in 25.05. Set the value of `n_init`"
                     " explicitly to suppress this warning.",
                     FutureWarning,
                 )

From 8caef3961e761b53d5f827244c60bf0a2aaa1282 Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Mon, 25 Nov 2024 16:41:17 +0000
Subject: [PATCH 05/11] Make the switch in 25.02

---
 python/cuml/cuml/cluster/kmeans.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx
index 334401ae91..087b337d29 100644
--- a/python/cuml/cuml/cluster/kmeans.pyx
+++ b/python/cuml/cuml/cluster/kmeans.pyx
@@ -151,8 +151,8 @@ class KMeans(UniversalBase,
         .. versionadded:: 24.12
            Added 'auto' option for `n_init`.
 
-        .. versionchanged:: 25.05
-            Default value for `n_init` will change from 1 to `'auto'` in version 25.XX.
+        .. versionchanged:: 25.02
+            Default value for `n_init` will change from 1 to `'auto'` in version 25.02.
 
     oversampling_factor : float64 (default = 2.0)
         The amount of points to sample
@@ -224,7 +224,7 @@ class KMeans(UniversalBase,
             if n_init == "warn":
                 warnings.warn(
                     "The default value of `n_init` will change from"
-                    " 1 to 'auto' in 25.05. Set the value of `n_init`"
+                    " 1 to 'auto' in 25.02. Set the value of `n_init`"
                     " explicitly to suppress this warning.",
                     FutureWarning,
                 )

From 52fffe96e2085481d3cdcc70521e28d4ed2ca486 Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Tue, 26 Nov 2024 07:05:53 +0000
Subject: [PATCH 06/11] Update dask tests

---
 .../cuml/cuml/tests/dask/test_dask_kmeans.py  | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/python/cuml/cuml/tests/dask/test_dask_kmeans.py b/python/cuml/cuml/tests/dask/test_dask_kmeans.py
index 8f00cd51ed..b9b00ab553 100644
--- a/python/cuml/cuml/tests/dask/test_dask_kmeans.py
+++ b/python/cuml/cuml/tests/dask/test_dask_kmeans.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -63,7 +63,10 @@ def test_end_to_end(
         X_train, y_train = X, y
 
     cumlModel = cumlKMeans(
-        init="k-means||", n_clusters=nclusters, random_state=10
+        init="k-means||",
+        n_clusters=nclusters,
+        random_state=10,
+        n_init="auto",
     )
 
     cumlModel.fit(X_train)
@@ -120,7 +123,7 @@ def test_large_data_no_overflow(nrows_per_part, ncols, nclusters, client):
     X.compute_chunk_sizes().persist()
 
     cumlModel = cumlKMeans(
-        init="k-means||", n_clusters=nclusters, random_state=10
+        init="k-means||", n_clusters=nclusters, random_state=10, n_init="auto"
     )
 
     cumlModel.fit(X_train)
@@ -171,7 +174,11 @@ def test_weighted_kmeans(nrows, ncols, nclusters, n_parts, client):
         wt[cp.argmax(cp.array(y.compute()) == i).item()] = 5000.0
 
     cumlModel = cumlKMeans(
-        verbose=0, init="k-means||", n_clusters=nclusters, random_state=10
+        verbose=0,
+        init="k-means||",
+        n_clusters=nclusters,
+        random_state=10,
+        n_init="auto",
     )
 
     chunk_parts = int(nrows / n_parts)
@@ -237,7 +244,10 @@ def test_transform(nrows, ncols, nclusters, n_parts, input_type, client):
         labels = cp.squeeze(y_train.compute())
 
     cumlModel = cumlKMeans(
-        init="k-means||", n_clusters=nclusters, random_state=10
+        init="k-means||",
+        n_clusters=nclusters,
+        random_state=10,
+        n_init="auto",
     )
 
     cumlModel.fit(X_train)
@@ -302,7 +312,10 @@ def test_score(nrows, ncols, nclusters, n_parts, input_type, client):
         X_train, y_train = X, y
 
     cumlModel = cumlKMeans(
-        init="k-means||", n_clusters=nclusters, random_state=10
+        init="k-means||",
+        n_clusters=nclusters,
+        random_state=10,
+        n_init="auto",
     )
 
     cumlModel.fit(X_train)

From 13965e3c161d6aee385d1dc78e4259c453fc4554 Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Tue, 26 Nov 2024 11:08:19 +0000
Subject: [PATCH 07/11] Fix up tests that raised warnings

---
 python/cuml/cuml/cluster/kmeans.pyx             | 2 +-
 python/cuml/cuml/tests/test_api.py              | 2 ++
 python/cuml/cuml/tests/test_device_selection.py | 2 +-
 python/cuml/cuml/tests/test_input_estimators.py | 5 ++++-
 python/cuml/cuml/tests/test_metrics.py          | 2 +-
 python/cuml/cuml/tests/test_pickle.py           | 5 ++++-
 6 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx
index 087b337d29..65d7a15f48 100644
--- a/python/cuml/cuml/cluster/kmeans.pyx
+++ b/python/cuml/cuml/cluster/kmeans.pyx
@@ -95,7 +95,7 @@ class KMeans(UniversalBase,
         3  4.0  3.0
         >>>
         >>> # Calling fit
-        >>> kmeans_float = KMeans(n_clusters=2)
+        >>> kmeans_float = KMeans(n_clusters=2, n_init="auto")
         >>> kmeans_float.fit(b)
         KMeans()
         >>>
diff --git a/python/cuml/cuml/tests/test_api.py b/python/cuml/cuml/tests/test_api.py
index 74adbd177d..ce840199c2 100644
--- a/python/cuml/cuml/tests/test_api.py
+++ b/python/cuml/cuml/tests/test_api.py
@@ -237,6 +237,8 @@ def test_fit_function(dataset, model_name):
         model = models[model_name](np.random.normal(0.0, 1.0, (10,)))
     elif model_name in ["RandomForestClassifier", "RandomForestRegressor"]:
         model = models[model_name](n_bins=32)
+    elif model_name == "KMeans":
+        model = models[model_name](n_init="auto")
     else:
         if n_pos_args_constr == 1:
             model = models[model_name]()
diff --git a/python/cuml/cuml/tests/test_device_selection.py b/python/cuml/cuml/tests/test_device_selection.py
index 31c0f9aed6..96dc4e72b2 100644
--- a/python/cuml/cuml/tests/test_device_selection.py
+++ b/python/cuml/cuml/tests/test_device_selection.py
@@ -981,7 +981,7 @@ def test_kmeans_methods(train_device, infer_device):
     ref_model.fit(X_train_blob)
     ref_output = ref_model.predict(X_test_blob)
 
-    model = KMeans(n_clusters=n_clusters)
+    model = KMeans(n_clusters=n_clusters, n_init="auto")
     with using_device_type(train_device):
         model.fit(X_train_blob)
     with using_device_type(infer_device):
diff --git a/python/cuml/cuml/tests/test_input_estimators.py b/python/cuml/cuml/tests/test_input_estimators.py
index 9964b7f0fe..5ac983b916 100644
--- a/python/cuml/cuml/tests/test_input_estimators.py
+++ b/python/cuml/cuml/tests/test_input_estimators.py
@@ -115,7 +115,10 @@ def test_estimators_all_dtypes(model_name, dtype):
 
     X_train, y_train, X_test = make_dataset(dtype, nrows, ncols, ninfo)
     print(model_name)
-    model = models[model_name]()
+    if model_name == "KMeans":
+        model = models[model_name](n_init="auto")
+    else:
+        model = models[model_name]()
     sign = inspect.signature(model.fit)
     if "y" in sign.parameters:
         model.fit(X=X_train, y=y_train)
diff --git a/python/cuml/cuml/tests/test_metrics.py b/python/cuml/cuml/tests/test_metrics.py
index 2189dcdc41..264fc378bd 100644
--- a/python/cuml/cuml/tests/test_metrics.py
+++ b/python/cuml/cuml/tests/test_metrics.py
@@ -286,7 +286,7 @@ def test_rand_index_score(name, nrows):
     params = default_base.copy()
     params.update(pat[1])
 
-    cuml_kmeans = cuml.KMeans(n_clusters=params["n_clusters"])
+    cuml_kmeans = cuml.KMeans(n_clusters=params["n_clusters"], n_init="auto")
 
     X, y = pat[0]
 
diff --git a/python/cuml/cuml/tests/test_pickle.py b/python/cuml/cuml/tests/test_pickle.py
index 598ebbd7e3..e62232a888 100644
--- a/python/cuml/cuml/tests/test_pickle.py
+++ b/python/cuml/cuml/tests/test_pickle.py
@@ -299,7 +299,10 @@ def test_cluster_pickle(tmpdir, datatype, keys, data_size):
     def create_mod():
         nrows, ncols, n_info = data_size
         X_train, y_train, X_test = make_dataset(datatype, nrows, ncols, n_info)
-        model = cluster_models[keys]()
+        if keys == "KMeans":
+            model = cluster_models[keys](n_init="auto")
+        else:
+            model = cluster_models[keys]()
         model.fit(X_train)
         result["cluster"] = model.predict(X_test)
         return model, X_test

From 509853144f4650657fb10ee48ea7133121542ce9 Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Thu, 5 Dec 2024 10:40:10 +0000
Subject: [PATCH 08/11] Ping


From 998f0b08a13ce91c0d64d378f836ab299cd588ca Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Thu, 5 Dec 2024 13:15:42 +0000
Subject: [PATCH 09/11] Deal with deprecations

---
 python/cuml/cuml/explainer/sampling.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/cuml/cuml/explainer/sampling.py b/python/cuml/cuml/explainer/sampling.py
index 2ffe9d0054..63d6e438ae 100644
--- a/python/cuml/cuml/explainer/sampling.py
+++ b/python/cuml/cuml/explainer/sampling.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -91,7 +91,10 @@ def kmeans_sampling(X, k, round_values=True, detailed=False, random_state=0):
     X = imp.fit_transform(X)
 
     kmeans = KMeans(
-        n_clusters=k, random_state=random_state, output_type=_output_dtype_str
+        n_clusters=k,
+        random_state=random_state,
+        output_type=_output_dtype_str,
+        n_init="auto",
     ).fit(X)
 
     if round_values:

From 9ffe05156b3f3ee40dab66e0aca3c8eaab8fe312 Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Fri, 6 Dec 2024 12:51:08 +0000
Subject: [PATCH 10/11] Use sklearn default values for constructor arguments

This way the hyper-parameter translator gets to see all arguments, and we
avoid deprecation warnings caused by mismatches between the scikit-learn
and cuML defaults.
---
 python/cuml/cuml/experimental/accel/estimator_proxy.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/cuml/cuml/experimental/accel/estimator_proxy.py b/python/cuml/cuml/experimental/accel/estimator_proxy.py
index fcee2f5b37..914b683b4e 100644
--- a/python/cuml/cuml/experimental/accel/estimator_proxy.py
+++ b/python/cuml/cuml/experimental/accel/estimator_proxy.py
@@ -207,7 +207,13 @@ def __init__(self, *args, **kwargs):
             self._cpu_model_class = (
                 original_class_a  # Store a reference to the original class
             )
-            kwargs, self._gpuaccel = self._hyperparam_translator(**kwargs)
+            sklearn_args = inspect.signature(self._cpu_model_class)
+            sklearn_args = sklearn_args.bind(*args, **kwargs)
+            sklearn_args.apply_defaults()
+
+            kwargs, self._gpuaccel = self._hyperparam_translator(
+                **sklearn_args.arguments
+            )
             super().__init__(*args, **kwargs)
 
             self._cpu_hyperparams = list(

From d9a65e3c911f568a62cc9a43926032915b23ab86 Mon Sep 17 00:00:00 2001
From: Tim Head <betatim@gmail.com>
Date: Fri, 6 Dec 2024 13:51:45 +0000
Subject: [PATCH 11/11] Ping