Multiple CPU interop fixes for serialization and cloning #6223

Open · wants to merge 18 commits into base: branch-25.04
4 changes: 2 additions & 2 deletions python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
@@ -771,7 +771,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
"""
Fit HDBSCAN model from features.
"""

self._all_finite = True
X_m, n_rows, n_cols, self.dtype = \
input_to_cuml_array(X, order='C',
check_dtype=[np.float32],
@@ -1163,7 +1163,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
     def get_attr_names(self):
         attr_names = ['labels_', 'probabilities_', 'cluster_persistence_',
                       'condensed_tree_', 'single_linkage_tree_',
-                      'outlier_scores_']
+                      'outlier_scores_', '_all_finite']
         if self.gen_min_span_tree:
             attr_names = attr_names + ['minimum_spanning_tree_']
         if self.prediction_data:
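Note: `get_attr_names()` drives which fitted attributes are copied during CPU/GPU transfer and (de)serialization, so `_all_finite` has to be listed or it is silently dropped. A minimal sketch of the failure mode this fixes (illustrative only, not part of the PR; assumes a standard cuml install):

```python
import pickle

import numpy as np
from cuml.cluster import HDBSCAN

X = np.random.RandomState(0).rand(200, 2).astype(np.float32)
model = HDBSCAN().fit(X)  # fit() now sets model._all_finite

restored = pickle.loads(pickle.dumps(model))
# Before this change, '_all_finite' was set in fit() but not listed in
# get_attr_names(), so it could be missing after a roundtrip and break
# later interop calls that consult it.
assert restored._all_finite == model._all_finite
```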
2 changes: 1 addition & 1 deletion python/cuml/cuml/experimental/accel/__main__.py
@@ -73,7 +73,7 @@ def main(module, convert_to_sklearn, format, output, args):
         (module,) = module
         # run the module passing the remaining arguments
         # as if it were run with python -m <module> <args>
-        sys.argv[:] = [module] + args  # not thread safe?
+        sys.argv[:] = [module, *args.args]  # not thread safe?
         runpy.run_module(module, run_name="__main__")
     elif len(args) >= 1:
         # Remove ourself from argv and continue
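For context, here is a self-contained sketch of the dispatch pattern this hunk adjusts (`run_as_module` is a hypothetical helper; the real CLI receives `module` and `args` from its argument parser):

```python
import runpy
import sys


def run_as_module(module: str, args: list[str]) -> None:
    # Rewrite sys.argv so the target module sees itself as argv[0] and
    # receives only its own arguments, as if launched with
    # `python -m <module> <args...>`. Mutating sys.argv is process-global,
    # hence the "not thread safe?" remark in the diff.
    sys.argv[:] = [module, *args]
    runpy.run_module(module, run_name="__main__")


# Example: run_as_module("http.server", ["8000"]) serves the cwd on port 8000.
```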
14 changes: 10 additions & 4 deletions python/cuml/cuml/experimental/accel/estimator_proxy.py
@@ -207,15 +207,21 @@ def __init__(self, *args, **kwargs):
         self._cpu_model_class = (
             original_class_a  # Store a reference to the original class
         )
-        kwargs, self._gpuaccel = self._hyperparam_translator(**kwargs)
-        super().__init__(*args, **kwargs)
+
+        translated_kwargs, self._gpuaccel = self._hyperparam_translator(
+            **kwargs
+        )
+        super().__init__(*args, **translated_kwargs)
 
         self._cpu_hyperparams = list(
             inspect.signature(
                 self._cpu_model_class.__init__
             ).parameters.keys()
         )
 
+        self.import_cpu_model()
+        self.build_cpu_model(**kwargs)
 
     def __repr__(self):
         """
         Return a formal string representation of the object.
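The net effect of the `__init__` change: the proxy now keeps two views of the constructor arguments. The translated kwargs configure the GPU estimator, while the raw user kwargs build the CPU model, so `get_params()` and `clone()` reflect exactly what the caller passed. An illustrative sketch (the translator below is a stand-in, not cuml's):

```python
def fake_translator(**kwargs):
    # Stand-in for _hyperparam_translator: map sklearn-only values into
    # something the GPU implementation accepts (here: drop svd_solver).
    translated = {k: v for k, v in kwargs.items() if k != "svd_solver"}
    return translated, True


user_kwargs = {"n_components": 42, "svd_solver": "arpack"}
translated_kwargs, gpuaccel = fake_translator(**user_kwargs)

# The GPU side is configured with the translated values ...
assert "svd_solver" not in translated_kwargs
# ... while the CPU model is built from the untouched user kwargs, so the
# sklearn-visible parameter surface is preserved.
assert user_kwargs["svd_solver"] == "arpack"
```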
@@ -226,7 +232,7 @@ def __repr__(self):
         A string representation indicating that this is a wrapped
         version of the original CPU-based estimator.
         """
-        return f"wrapped {self._cpu_model_class}"
+        return self._cpu_model.__repr__()
 
     def __str__(self):
         """
@@ -238,7 +244,7 @@ def __str__(self):
         A string representation indicating that this is a wrapped
         version of the original CPU-based estimator.
         """
-        return f"ProxyEstimator of {self._cpu_model_class}"
+        return self._cpu_model.__str__()
 
     def __getstate__(self):
         """
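The `__repr__`/`__str__` changes make a proxied estimator print exactly like the sklearn estimator it wraps, rather than as "wrapped <class>". A minimal sketch of the delegation (`Wrapped` is a toy stand-in for ProxyEstimator):

```python
from sklearn.decomposition import PCA


class Wrapped:
    """Toy stand-in demonstrating repr/str delegation."""

    def __init__(self, inner):
        self._cpu_model = inner

    def __repr__(self):
        return self._cpu_model.__repr__()

    def __str__(self):
        return self._cpu_model.__str__()


print(Wrapped(PCA(n_components=3)))  # -> PCA(n_components=3)
```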
70 changes: 61 additions & 9 deletions python/cuml/cuml/internals/base.pyx
@@ -640,17 +640,20 @@ class UniversalBase(Base):
         inspect.signature(self._cpu_model_class.__init__).parameters.keys()
     )
 
-    def build_cpu_model(self):
+    def build_cpu_model(self, **kwargs):
         if hasattr(self, '_cpu_model'):
             return
-        filtered_kwargs = {}
-        for keyword, arg in self._full_kwargs.items():
-            if keyword in self._cpu_hyperparams:
-                filtered_kwargs[keyword] = arg
-            else:
-                logger.info("Unused keyword parameter: {} "
-                            "during CPU estimator "
-                            "initialization".format(keyword))
+        if kwargs:
+            filtered_kwargs = kwargs
+        else:
+            filtered_kwargs = {}
+            for keyword, arg in self._full_kwargs.items():
+                if keyword in self._cpu_hyperparams:
+                    filtered_kwargs[keyword] = arg
+                else:
+                    logger.info("Unused keyword parameter: {} "
+                                "during CPU estimator "
+                                "initialization".format(keyword))
 
         # initialize model
         self._cpu_model = self._cpu_model_class(**filtered_kwargs)
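Restated compactly, the new precedence in `build_cpu_model` is: explicit kwargs (as now passed by the proxy's `__init__`) win outright; otherwise the stored `_full_kwargs` are filtered against the CPU class's signature. A behaviorally equivalent sketch (logging of unused kwargs omitted):

```python
def build_cpu_model(self, **kwargs):
    if hasattr(self, "_cpu_model"):
        return  # already built; keep the existing CPU model
    if kwargs:
        # The caller supplied the exact constructor arguments; use them
        # verbatim so no user-provided parameter is dropped.
        filtered_kwargs = kwargs
    else:
        # Fall back to the old behavior: keep only the kwargs that the
        # CPU class's __init__ actually accepts.
        filtered_kwargs = {
            k: v for k, v in self._full_kwargs.items()
            if k in self._cpu_hyperparams
        }
    self._cpu_model = self._cpu_model_class(**filtered_kwargs)
```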
@@ -889,6 +892,12 @@ class UniversalBase(Base):
             # that are not in the cuML estimator in the host estimator
             if GlobalSettings().accelerator_active or self._experimental_dispatching:
 
+                # don't dispatch the special sklearn cloning hook to the
+                # CPU class, so that clone() treats this class as a regular
+                # estimator without __sklearn_clone__
+                if attr == "__sklearn_clone__":
+                    raise ex
+
                 self.import_cpu_model()
                 if hasattr(self._cpu_model_class, attr):
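Why raising here fixes cloning: `sklearn.base.clone` first probes the estimator for `__sklearn_clone__`; if that attribute lookup raises `AttributeError`, it falls back to the standard reconstruct-from-`get_params()` path, which is what the proxy needs. Roughly (assumes scikit-learn >= 1.3):

```python
from sklearn.base import clone
from sklearn.decomposition import PCA

est = PCA(n_components=3)
# clone() rebuilds the estimator from get_params(). With the guard above,
# a proxy estimator takes this same default path instead of forwarding
# __sklearn_clone__ to the CPU class and cloning the wrong object.
cloned = clone(est)
assert cloned is not est
assert cloned.get_params() == est.get_params()
```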

@@ -978,6 +987,8 @@ class UniversalBase(Base):
         estimator = cls()
         estimator.import_cpu_model()
         estimator._cpu_model = model
+        params, gpuaccel = cls._hyperparam_translator(**model.get_params())
+        estimator.set_params(**params)
         estimator.cpu_to_gpu()
 
         # we need to set an output type here since
@@ -988,3 +999,44 @@ class UniversalBase(Base):
         estimator.output_mem_type = MemoryType.host
 
         return estimator
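These two added lines are what make hyperparameters survive `from_sklearn`; previously the cuml estimator kept its defaults. A usage sketch (assumes a cuml build with this interop API):

```python
from sklearn.cluster import KMeans

import cuml

skl = KMeans(n_clusters=13)
gpu = cuml.KMeans.from_sklearn(skl)
# get_params() from the sklearn model is translated and applied, so the
# setting is no longer lost in conversion.
assert gpu.n_clusters == 13
```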

+    def get_params(self, deep=True):
+        """
+        Get parameters for this estimator.
+
+        Parameters
+        ----------
+        deep : bool, default=True
+            If True, will return the parameters for this estimator and
+            contained subobjects that are estimators.
+
+        Returns
+        -------
+        params : dict
+            Parameter names mapped to their values.
+        """
+        if GlobalSettings().accelerator_active or self._experimental_dispatching:
+            return self._cpu_model.get_params(deep=deep)
+        else:
+            return super().get_params(deep=deep)
+
+    def set_params(self, **params):
+        """
+        Set parameters for this estimator.
+
+        Parameters
+        ----------
+        **params : dict
+            Estimator parameters.
+
+        Returns
+        -------
+        self : estimator instance
+            The estimator instance.
+        """
+        self._cpu_model.set_params(**params)
+        params, gpuaccel = self._hyperparam_translator(**params)
+        params = {key: params[key] for key in self._get_param_names()
+                  if key in params}
+        super().set_params(**params)
+        return self
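With accelerator dispatch active, parameter access now round-trips through the CPU model, so sklearn machinery built on `get_params`/`set_params` (clone, `GridSearchCV`, pipelines) sees the wrapped estimator's full parameter surface. A usage sketch (assumes `cuml.accel` is active so `PCA` resolves to the proxy):

```python
from sklearn.decomposition import PCA  # proxied when cuml.accel is active

est = PCA(n_components=5)
est.set_params(n_components=7)  # applied to the CPU model first, then
                                # translated and forwarded to the GPU side
assert est.get_params()["n_components"] == 7
```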
(test file; name not captured in this view)
@@ -14,7 +14,8 @@

 import pytest
 import numpy as np
-import cupy as cp
+from sklearn import clone, cluster
+import cuml
 from sklearn.datasets import make_classification, make_regression, make_blobs
 from sklearn.linear_model import (
     LinearRegression,
@@ -173,6 +174,57 @@ def test_proxy_facade():
     assert original_value == proxy_value
 

def test_proxy_clone():
# Test that cloning a proxy estimator preserves parameters, even those we
# translate for the cuml class
pca = PCA(n_components=42, svd_solver="arpack")
pca_clone = clone(pca)

assert pca.get_params() == pca_clone.get_params()


def test_proxy_params():
# Test that parameters match between constructor and get_params()
# Mix of default and non-default values
pca = PCA(
n_components=5,
copy=False,
# Pass in an argument and set it to its default value
whiten=False,
)

params = pca.get_params()
assert params["n_components"] == 5
assert params["copy"] is False
assert params["whiten"] is False
# A parameter we never touched, should be the default
assert params["tol"] == 0.0

# Check that get_params doesn't return any unexpected parameters
expected_params = set(
[
"n_components",
"copy",
"whiten",
"tol",
"svd_solver",
"n_oversamples",
"random_state",
"iterated_power",
"power_iteration_normalizer",
]
)
assert set(params.keys()) == expected_params


def test_roundtrip():

km = cluster.KMeans(n_clusters=13)
ckm = cuml.KMeans.from_sklearn(km)

assert ckm.n_clusters == 13

Review comment (Member): Lucky number 13 :D

Review comment (Member): I think we should merge this PR. It improves and fixes several things.

We can keep improving the from_/as_sklearn round tripping. I think the test from https://github.com/rapidsai/cuml/pull/6342/files#r1963552769 still doesn't pass (even if you exclude the raft handle). But let's look at that in a new PR.



 def test_defaults_args_only_methods():
     # Check that estimator methods that take no arguments work
     # These are slightly weird because basically everything else takes
@@ -186,6 +238,8 @@ def test_defaults_args_only_methods():


 def test_kernel_ridge():
+    import cupy as cp

Review comment (Member): Why move this here? Maybe we should leave a comment for people from the future to explain why it can't be imported at the top of the file (or move it back if this was just for debugging)

     rng = np.random.RandomState(42)
 
     X = 5 * rng.rand(10000, 1)