Merge branch 'branch-25.02' into feat/raft_logger

rapidsai · Dec 31, 2024 · 354501f
2 parents 238c961 + 7731ba2
commit 354501f
Show file tree

Hide file tree

Showing 7 changed files with 21 additions and 7 deletions.
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -71,9 +71,9 @@ dependencies:
 - scipy>=1.8.0
 - seaborn
 - spdlog>=1.14.1,<1.15
+- sphinx
 - sphinx-copybutton
 - sphinx-markdown-tables
-- sphinx<6
 - statsmodels
 - sysroot_linux-64==2.17
 - treelite==4.3.0

diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -67,9 +67,9 @@ dependencies:
 - scipy>=1.8.0
 - seaborn
 - spdlog>=1.14.1,<1.15
+- sphinx
 - sphinx-copybutton
 - sphinx-markdown-tables
-- sphinx<6
 - statsmodels
 - sysroot_linux-64==2.17
 - treelite==4.3.0

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -464,7 +464,7 @@ dependencies:
           - pydata-sphinx-theme!=0.14.2
           - recommonmark
           - &scikit_learn scikit-learn==1.5.*
-          - sphinx<6
+          - sphinx
           - sphinx-copybutton
           - sphinx-markdown-tables
       - output_types: conda

diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
@@ -31,6 +31,7 @@ from cuml.common import input_to_cuml_array
 from cuml.common.array_descriptor import CumlArrayDescriptor
 from cuml.internals.api_decorators import device_interop_preparation
 from cuml.internals.api_decorators import enable_device_interop
+from cuml.internals.global_settings import GlobalSettings
 from cuml.internals.mixins import ClusterMixin
 from cuml.internals.mixins import CMajorInputTagMixin
 from cuml.internals.import_utils import has_hdbscan
@@ -782,6 +783,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
         self.n_rows = n_rows
         self.n_cols = n_cols
 
+        if GlobalSettings().accelerator_active:
+            self._raw_data = self.X_m.to_output("numpy")
+
         cdef uintptr_t _input_ptr = X_m.ptr
 
         IF GPUBUILD == 1:
@@ -1133,6 +1137,8 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
             self.condensed_tree_._raw_tree
         self._cpu_model.single_linkage_tree_ = \
             self.single_linkage_tree_._linkage
+        if hasattr(self, "_raw_data"):
+            self._cpu_model._raw_data = self._raw_data
         if self.gen_min_span_tree:
             self._cpu_model.minimum_spanning_tree_ = \
                 self.minimum_spanning_tree_._mst

diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx
@@ -289,6 +289,7 @@ class KMeans(UniversalBase,
         Compute k-means clustering with X.
 
         """
+        self._n_features_out = self.n_clusters
         if self.init == 'preset':
             check_cols = self.n_features_in_
             check_dtype = self.dtype
@@ -306,6 +307,8 @@ class KMeans(UniversalBase,
                                                   else None),
                                 check_dtype=check_dtype)
 
+        self.feature_names_in_ = _X_m.index
+
         IF GPUBUILD == 1:
 
             cdef uintptr_t input_ptr = _X_m.ptr
@@ -708,4 +711,5 @@ class KMeans(UniversalBase,
 
     def get_attr_names(self):
         return ['cluster_centers_', 'labels_', 'inertia_',
-                'n_iter_', 'n_features_in_', '_n_threads']
+                'n_iter_', 'n_features_in_', '_n_threads',
+                "feature_names_in_", "_n_features_out"]
diff --git a/python/cuml/cuml/manifold/umap.pyx b/python/cuml/cuml/manifold/umap.pyx
@@ -577,11 +577,13 @@ class UMAP(UniversalBase,
                                              convert_format=False)
             self.n_rows, self.n_dims = self._raw_data.shape
             self.sparse_fit = True
+            self._sparse_data = True
             if self.build_algo == "nn_descent":
                 raise ValueError("NN Descent does not support sparse inputs")
 
         # Handle dense inputs
         else:
+            self._sparse_data = False
             if data_on_host:
                 convert_to_mem_type = MemoryType.host
             else:
@@ -908,6 +910,7 @@ class UMAP(UniversalBase,
                                   self.metric_kwds, False, self.random_state)
 
         super().gpu_to_cpu()
+        self._cpu_model._validate_parameters()
 
     @classmethod
     def _get_param_names(cls):
@@ -943,4 +946,4 @@ class UMAP(UniversalBase,
         return ['_raw_data', 'embedding_', '_input_hash', '_small_data',
                 '_knn_dists', '_knn_indices', '_knn_search_index',
                 '_disconnection_distance', '_n_neighbors', '_a', '_b',
-                '_initial_alpha']
+                '_initial_alpha', '_sparse_data']
diff --git a/python/cuml/cuml/tests/test_kmeans.py b/python/cuml/cuml/tests/test_kmeans.py
@@ -171,7 +171,7 @@ def test_weighted_kmeans(nrows, ncols, nclusters, max_weight, random_state):
     sk_kmeans.fit(cp.asnumpy(X), sample_weight=wt)
     sk_score = sk_kmeans.score(cp.asnumpy(X))
 
-    assert abs(cu_score - sk_score) <= cluster_std * 1.5
+    assert cu_score - sk_score <= cluster_std * 1.5
 
 
 @pytest.mark.parametrize("nrows", [1000, 10000])
@@ -418,5 +418,6 @@ def test_fit_transform_weighted_kmeans(
     sk_transf = sk_kmeans.fit_transform(cp.asnumpy(X), sample_weight=wt)
     sk_score = sk_kmeans.score(cp.asnumpy(X))
 
-    assert abs(cu_score - sk_score) <= cluster_std * 1.5
+    # we fail if cuML's score is significantly worse than sklearn's
+    assert cu_score - sk_score <= cluster_std * 1.5
     assert sk_transf.shape == cuml_transf.shape