Multiple CPU interop fixes for serialization and cloning #6223

Open · wants to merge 18 commits into base: branch-25.04
4 changes: 2 additions & 2 deletions python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
@@ -771,7 +771,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
"""
Fit HDBSCAN model from features.
"""

self._all_finite = True
X_m, n_rows, n_cols, self.dtype = \
input_to_cuml_array(X, order='C',
check_dtype=[np.float32],
@@ -1163,7 +1163,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
     def get_attr_names(self):
         attr_names = ['labels_', 'probabilities_', 'cluster_persistence_',
                       'condensed_tree_', 'single_linkage_tree_',
-                      'outlier_scores_']
+                      'outlier_scores_', '_all_finite']
         if self.gen_min_span_tree:
             attr_names = attr_names + ['minimum_spanning_tree_']
         if self.prediction_data:
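Note: `get_attr_names()` drives which fitted attributes are copied during CPU/GPU transfer and (de)serialization, so `_all_finite` has to be listed or it is silently dropped. A minimal sketch of the failure mode this fixes (illustrative only, not part of the PR; assumes a standard cuml install):

```python
import pickle

import numpy as np
from cuml.cluster import HDBSCAN

X = np.random.RandomState(0).rand(200, 2).astype(np.float32)
model = HDBSCAN().fit(X)  # fit() now sets model._all_finite

restored = pickle.loads(pickle.dumps(model))
# Before this change, '_all_finite' was set in fit() but not listed in
# get_attr_names(), so it could be missing after a roundtrip and break
# later interop calls that consult it.
assert restored._all_finite == model._all_finite
```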
2 changes: 1 addition & 1 deletion python/cuml/cuml/experimental/accel/__main__.py
@@ -73,7 +73,7 @@ def main(module, convert_to_sklearn, format, output, args):
         (module,) = module
         # run the module passing the remaining arguments
         # as if it were run with python -m <module> <args>
-        sys.argv[:] = [module] + args  # not thread safe?
+        sys.argv[:] = [module, *args.args]  # not thread safe?
         runpy.run_module(module, run_name="__main__")
     elif len(args) >= 1:
         # Remove ourself from argv and continue
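For context, here is a self-contained sketch of the dispatch pattern this hunk adjusts (`run_as_module` is a hypothetical helper; the real CLI receives `module` and `args` from its argument parser):

```python
import runpy
import sys


def run_as_module(module: str, args: list[str]) -> None:
    # Rewrite sys.argv so the target module sees itself as argv[0] and
    # receives only its own arguments, as if launched with
    # `python -m <module> <args...>`. Mutating sys.argv is process-global,
    # hence the "not thread safe?" remark in the diff.
    sys.argv[:] = [module, *args]
    runpy.run_module(module, run_name="__main__")


# Example: run_as_module("http.server", ["8000"]) serves the cwd on port 8000.
```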
14 changes: 10 additions & 4 deletions python/cuml/cuml/experimental/accel/estimator_proxy.py
@@ -207,15 +207,21 @@ def __init__(self, *args, **kwargs):
         self._cpu_model_class = (
             original_class_a  # Store a reference to the original class
         )
-        kwargs, self._gpuaccel = self._hyperparam_translator(**kwargs)
-        super().__init__(*args, **kwargs)
+
+        translated_kwargs, self._gpuaccel = self._hyperparam_translator(
+            **kwargs
+        )
+        super().__init__(*args, **translated_kwargs)
 
         self._cpu_hyperparams = list(
             inspect.signature(
                 self._cpu_model_class.__init__
             ).parameters.keys()
         )
 
+        self.import_cpu_model()
+        self.build_cpu_model(**kwargs)
 
     def __repr__(self):
         """
         Return a formal string representation of the object.
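The net effect of the `__init__` change: the proxy now keeps two views of the constructor arguments. The translated kwargs configure the GPU estimator, while the raw user kwargs build the CPU model, so `get_params()` and `clone()` reflect exactly what the caller passed. An illustrative sketch (the translator below is a stand-in, not cuml's):

```python
def fake_translator(**kwargs):
    # Stand-in for _hyperparam_translator: map sklearn-only values into
    # something the GPU implementation accepts (here: drop svd_solver).
    translated = {k: v for k, v in kwargs.items() if k != "svd_solver"}
    return translated, True


user_kwargs = {"n_components": 42, "svd_solver": "arpack"}
translated_kwargs, gpuaccel = fake_translator(**user_kwargs)

# The GPU side is configured with the translated values ...
assert "svd_solver" not in translated_kwargs
# ... while the CPU model is built from the untouched user kwargs, so the
# sklearn-visible parameter surface is preserved.
assert user_kwargs["svd_solver"] == "arpack"
```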
@@ -226,7 +232,7 @@ def __repr__(self):
         A string representation indicating that this is a wrapped
         version of the original CPU-based estimator.
         """
-        return f"wrapped {self._cpu_model_class}"
+        return self._cpu_model.__repr__()
 
     def __str__(self):
         """
@@ -238,7 +244,7 @@ def __str__(self):
         A string representation indicating that this is a wrapped
         version of the original CPU-based estimator.
         """
-        return f"ProxyEstimator of {self._cpu_model_class}"
+        return self._cpu_model.__str__()
 
     def __getstate__(self):
         """
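The `__repr__`/`__str__` changes make a proxied estimator print exactly like the sklearn estimator it wraps, rather than as "wrapped <class>". A minimal sketch of the delegation (`Wrapped` is a toy stand-in for ProxyEstimator):

```python
from sklearn.decomposition import PCA


class Wrapped:
    """Toy stand-in demonstrating repr/str delegation."""

    def __init__(self, inner):
        self._cpu_model = inner

    def __repr__(self):
        return self._cpu_model.__repr__()

    def __str__(self):
        return self._cpu_model.__str__()


print(Wrapped(PCA(n_components=3)))  # -> PCA(n_components=3)
```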
70 changes: 61 additions & 9 deletions python/cuml/cuml/internals/base.pyx
@@ -640,17 +640,20 @@ class UniversalBase(Base):
         inspect.signature(self._cpu_model_class.__init__).parameters.keys()
     )
 
-    def build_cpu_model(self):
+    def build_cpu_model(self, **kwargs):
         if hasattr(self, '_cpu_model'):
             return
-        filtered_kwargs = {}
-        for keyword, arg in self._full_kwargs.items():
-            if keyword in self._cpu_hyperparams:
-                filtered_kwargs[keyword] = arg
-            else:
-                logger.info("Unused keyword parameter: {} "
-                            "during CPU estimator "
-                            "initialization".format(keyword))
+        if kwargs:
+            filtered_kwargs = kwargs
+        else:
+            filtered_kwargs = {}
+            for keyword, arg in self._full_kwargs.items():
+                if keyword in self._cpu_hyperparams:
+                    filtered_kwargs[keyword] = arg
+                else:
+                    logger.info("Unused keyword parameter: {} "
+                                "during CPU estimator "
+                                "initialization".format(keyword))
 
         # initialize model
         self._cpu_model = self._cpu_model_class(**filtered_kwargs)
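Restated compactly, the new precedence in `build_cpu_model` is: explicit kwargs (as now passed by the proxy's `__init__`) win outright; otherwise the stored `_full_kwargs` are filtered against the CPU class's signature. A behaviorally equivalent sketch (logging of unused kwargs omitted):

```python
def build_cpu_model(self, **kwargs):
    if hasattr(self, "_cpu_model"):
        return  # already built; keep the existing CPU model
    if kwargs:
        # The caller supplied the exact constructor arguments; use them
        # verbatim so no user-provided parameter is dropped.
        filtered_kwargs = kwargs
    else:
        # Fall back to the old behavior: keep only the kwargs that the
        # CPU class's __init__ actually accepts.
        filtered_kwargs = {
            k: v for k, v in self._full_kwargs.items()
            if k in self._cpu_hyperparams
        }
    self._cpu_model = self._cpu_model_class(**filtered_kwargs)
```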
@@ -889,6 +892,12 @@ class UniversalBase(Base):
             # that are not in the cuML estimator in the host estimator
             if GlobalSettings().accelerator_active or self._experimental_dispatching:
 
+                # don't dispatch the special sklearn cloning hook to the
+                # CPU class, so that clone() treats this class as a regular
+                # estimator without __sklearn_clone__
+                if attr == "__sklearn_clone__":
+                    raise ex
+
                 self.import_cpu_model()
                 if hasattr(self._cpu_model_class, attr):
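Why raising here fixes cloning: `sklearn.base.clone` first probes the estimator for `__sklearn_clone__`; if that attribute lookup raises `AttributeError`, it falls back to the standard reconstruct-from-`get_params()` path, which is what the proxy needs. Roughly (assumes scikit-learn >= 1.3):

```python
from sklearn.base import clone
from sklearn.decomposition import PCA

est = PCA(n_components=3)
# clone() rebuilds the estimator from get_params(). With the guard above,
# a proxy estimator takes this same default path instead of forwarding
# __sklearn_clone__ to the CPU class and cloning the wrong object.
cloned = clone(est)
assert cloned is not est
assert cloned.get_params() == est.get_params()
```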

@@ -978,6 +987,8 @@ class UniversalBase(Base):
         estimator = cls()
         estimator.import_cpu_model()
         estimator._cpu_model = model
+        params, gpuaccel = cls._hyperparam_translator(**model.get_params())
+        estimator.set_params(**params)
         estimator.cpu_to_gpu()
 
         # we need to set an output type here since
@@ -988,3 +999,44 @@ class UniversalBase(Base):
         estimator.output_mem_type = MemoryType.host
 
         return estimator
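These two added lines are what make hyperparameters survive `from_sklearn`; previously the cuml estimator kept its defaults. A usage sketch (assumes a cuml build with this interop API):

```python
from sklearn.cluster import KMeans

import cuml

skl = KMeans(n_clusters=13)
gpu = cuml.KMeans.from_sklearn(skl)
# get_params() from the sklearn model is translated and applied, so the
# setting is no longer lost in conversion.
assert gpu.n_clusters == 13
```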

+    def get_params(self, deep=True):
+        """
+        Get parameters for this estimator.
+
+        Parameters
+        ----------
+        deep : bool, default=True
+            If True, will return the parameters for this estimator and
+            contained subobjects that are estimators.
+
+        Returns
+        -------
+        params : dict
+            Parameter names mapped to their values.
+        """
+        if GlobalSettings().accelerator_active or self._experimental_dispatching:
+            return self._cpu_model.get_params(deep=deep)
+        else:
+            return super().get_params(deep=deep)
+
+    def set_params(self, **params):
+        """
+        Set parameters for this estimator.
+
+        Parameters
+        ----------
+        **params : dict
+            Estimator parameters.
+
+        Returns
+        -------
+        self : estimator instance
+            The estimator instance.
+        """
+        self._cpu_model.set_params(**params)
+        params, gpuaccel = self._hyperparam_translator(**params)
+        params = {key: params[key] for key in self._get_param_names()
+                  if key in params}
+        super().set_params(**params)
+        return self
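With accelerator dispatch active, parameter access now round-trips through the CPU model, so sklearn machinery built on `get_params`/`set_params` (clone, `GridSearchCV`, pipelines) sees the wrapped estimator's full parameter surface. A usage sketch (assumes `cuml.accel` is active so `PCA` resolves to the proxy):

```python
from sklearn.decomposition import PCA  # proxied when cuml.accel is active

est = PCA(n_components=5)
est.set_params(n_components=7)  # applied to the CPU model first, then
                                # translated and forwarded to the GPU side
assert est.get_params()["n_components"] == 7
```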
(test file; name not captured in this view)
@@ -14,7 +14,8 @@

 import pytest
 import numpy as np
-import cupy as cp
+from sklearn import clone, cluster
+import cuml
 from sklearn.datasets import make_classification, make_regression, make_blobs
 from sklearn.linear_model import (
     LinearRegression,
@@ -173,6 +174,57 @@ def test_proxy_facade():
     assert original_value == proxy_value
 

def test_proxy_clone():
# Test that cloning a proxy estimator preserves parameters, even those we
# translate for the cuml class
pca = PCA(n_components=42, svd_solver="arpack")
pca_clone = clone(pca)

assert pca.get_params() == pca_clone.get_params()


def test_proxy_params():
# Test that parameters match between constructor and get_params()
# Mix of default and non-default values
pca = PCA(
n_components=5,
copy=False,
# Pass in an argument and set it to its default value
whiten=False,
)

params = pca.get_params()
assert params["n_components"] == 5
assert params["copy"] is False
assert params["whiten"] is False
# A parameter we never touched, should be the default
assert params["tol"] == 0.0

# Check that get_params doesn't return any unexpected parameters
expected_params = set(
[
"n_components",
"copy",
"whiten",
"tol",
"svd_solver",
"n_oversamples",
"random_state",
"iterated_power",
"power_iteration_normalizer",
]
)
assert set(params.keys()) == expected_params


def test_roundtrip():

km = cluster.KMeans(n_clusters=13)
ckm = cuml.KMeans.from_sklearn(km)

assert ckm.n_clusters == 13

Review comment (Member): Lucky number 13 :D

Review comment (Member): I think we should merge this PR. It improves and fixes several things.

We can keep improving the from_/as_sklearn round tripping. I think the test from https://github.com/rapidsai/cuml/pull/6342/files#r1963552769 still doesn't pass (even if you exclude the raft handle). But let's look at that in a new PR.



 def test_defaults_args_only_methods():
     # Check that estimator methods that take no arguments work
     # These are slightly weird because basically everything else takes
@@ -186,6 +238,8 @@ def test_defaults_args_only_methods():


 def test_kernel_ridge():
+    import cupy as cp

Review comment (Member): Why move this here? Maybe we should leave a comment for people from the future to explain why it can't be imported at the top of the file (or move it back if this was just for debugging)

     rng = np.random.RandomState(42)
 
     X = 5 * rng.rand(10000, 1)