From 45693cdccdbf5af536c5c8bd0738d3b9f0e75544 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 23 Feb 2024 11:26:58 +0800 Subject: [PATCH 1/4] Let cuDF handle input types for label encoder. cuDF handles more types than the label encoder currently does (like torch tensor). This PR delegates the type checking to cuDF. - Let cuDF handle input types for label encoder. - Small cleanups. --- python/cuml/preprocessing/LabelEncoder.py | 59 ++++++++++------------- python/cuml/tests/test_label_encoder.py | 13 +++-- 2 files changed, 33 insertions(+), 39 deletions(-) diff --git a/python/cuml/preprocessing/LabelEncoder.py b/python/cuml/preprocessing/LabelEncoder.py index aceed2766a..291b36015a 100644 --- a/python/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/preprocessing/LabelEncoder.py @@ -14,16 +14,26 @@ # limitations under the License. # -from cuml.common.exceptions import NotFittedError -from cuml.internals.safe_imports import cpu_only_import_from -from cuml import Base -from cuml.internals.safe_imports import cpu_only_import -from cuml.internals.safe_imports import gpu_only_import +from typing import TYPE_CHECKING -cudf = gpu_only_import("cudf") -cp = gpu_only_import("cupy") -np = cpu_only_import("numpy") -pdSeries = cpu_only_import_from("pandas", "Series") +from cuml import Base +from cuml.common.exceptions import NotFittedError +from cuml.internals.safe_imports import ( + cpu_only_import, + cpu_only_import_from, + gpu_only_import, +) + +if TYPE_CHECKING: + import cudf + import cupy as cp + import numpy as np + from pandas import Series as pdSeries +else: + cudf = gpu_only_import("cudf") + cp = gpu_only_import("cupy") + np = cpu_only_import("numpy") + pdSeries = cpu_only_import_from("pandas", "Series") class LabelEncoder(Base): @@ -125,7 +135,7 @@ def __init__( handle=None, verbose=False, output_type=None, - ): + ) -> None: super().__init__( handle=handle, verbose=verbose, output_type=output_type @@ -137,7 +147,7 @@ def __init__( self.handle_unknown = handle_unknown def _check_is_fitted(self): - if not self._fitted: + if self.classes_ is None: msg = ( "This LabelEncoder instance is not fitted yet. Call 'fit' " "with appropriate arguments before using this estimator." @@ -175,7 +185,7 @@ def fit(self, y, _classes=None): if _classes is None: y = ( - self._to_cudf_series(y) + cudf.Series(y) .drop_duplicates() .sort_values(ignore_index=True) ) # dedupe and sort @@ -184,7 +194,6 @@ def fit(self, y, _classes=None): self.classes_ = _classes self.dtype = y.dtype if y.dtype != cp.dtype("O") else str - self._fitted = True return self def transform(self, y) -> cudf.Series: @@ -211,7 +220,7 @@ def transform(self, y) -> cudf.Series: KeyError if a category appears that was not seen in `fit` """ - y = self._to_cudf_series(y) + y = cudf.Series(y) self._check_is_fitted() @@ -233,13 +242,12 @@ def fit_transform(self, y, z=None) -> cudf.Series: `LabelEncoder().fit(y).transform(y)` """ - y = self._to_cudf_series(y) + y = cudf.Series(y) self.dtype = y.dtype if y.dtype != cp.dtype("O") else str y = y.astype("category") self.classes_ = y._column.categories - self._fitted = True return cudf.Series(y._column.codes, index=y.index) def inverse_transform(self, y: cudf.Series) -> cudf.Series: @@ -260,7 +268,7 @@ def inverse_transform(self, y: cudf.Series) -> cudf.Series: # check LabelEncoder is fitted self._check_is_fitted() # check input type is cudf.Series - y = self._to_cudf_series(y) + y = cudf.Series(y) # check if ord_label out of bound ord_label = y.unique() @@ -285,20 +293,3 @@ def get_param_names(self): return super().get_param_names() + [ "handle_unknown", ] - - def _to_cudf_series(self, y): - if isinstance(y, pdSeries): - y = cudf.from_pandas(y) - elif isinstance(y, cp.ndarray): - y = cudf.Series(y) - elif isinstance(y, np.ndarray): - y = cudf.Series(y) - elif not isinstance(y, cudf.Series): - msg = ( - "input should be either 'cupy.ndarray'" - " or 'numpy.ndarray' or 'pandas.Series'," - " or 'cudf.Series'" - "got {0}.".format(type(y)) - ) - raise TypeError(msg) - return y diff --git a/python/cuml/tests/test_label_encoder.py b/python/cuml/tests/test_label_encoder.py index 5c66fb8a64..1a134dcee7 100644 --- a/python/cuml/tests/test_label_encoder.py +++ b/python/cuml/tests/test_label_encoder.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cuml.common.exceptions import NotFittedError import pytest -from cuml.internals.safe_imports import cpu_only_import + +from cuml.common.exceptions import NotFittedError +from cuml.internals.safe_imports import cpu_only_import, gpu_only_import from cuml.preprocessing.LabelEncoder import LabelEncoder -from cuml.internals.safe_imports import gpu_only_import +pd = cpu_only_import("pandas") cudf = gpu_only_import("cudf") np = cpu_only_import("numpy") cp = gpu_only_import("cupy") @@ -187,12 +188,14 @@ def _array_to_similarity_mat(x): @pytest.mark.parametrize("length", [10, 1000]) @pytest.mark.parametrize("cardinality", [5, 10, 50]) -@pytest.mark.parametrize("dtype", ["cupy", "numpy"]) -def test_labelencoder_fit_transform_cupy_numpy(length, cardinality, dtype): +@pytest.mark.parametrize("dtype", ["cupy", "numpy", "pd"]) +def test_labelencoder_fit_transform_cupy_numpy_pd(length, cardinality, dtype): """Try encoding the cupy array""" x = cp.random.choice(cardinality, (length,)) if dtype == "numpy": x = x.get() + elif dtype == "pd": + x = pd.Series(x.get()) encoded = LabelEncoder().fit_transform(x) x_arr = _array_to_similarity_mat(x) From 7538f6972da18d4d4ac28b20be99b28b5ef9edf8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 23 Feb 2024 11:57:30 +0800 Subject: [PATCH 2/4] lint. --- python/cuml/preprocessing/LabelEncoder.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/cuml/preprocessing/LabelEncoder.py b/python/cuml/preprocessing/LabelEncoder.py index 291b36015a..a26ab90465 100644 --- a/python/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/preprocessing/LabelEncoder.py @@ -184,11 +184,8 @@ def fit(self, y, _classes=None): self._validate_keywords() if _classes is None: - y = ( - cudf.Series(y) - .drop_duplicates() - .sort_values(ignore_index=True) - ) # dedupe and sort + # dedupe and sort + y = cudf.Series(y).drop_duplicates().sort_values(ignore_index=True) self.classes_ = y else: self.classes_ = _classes From 31617b70fc2c2f8218b0f51ca5252322e7eed45c Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 23 Feb 2024 11:58:07 +0800 Subject: [PATCH 3/4] lint. --- python/cuml/preprocessing/LabelEncoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/preprocessing/LabelEncoder.py b/python/cuml/preprocessing/LabelEncoder.py index a26ab90465..3c4029b4bf 100644 --- a/python/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/preprocessing/LabelEncoder.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 9400fa977f6a530de94f0a4b7a5e4edf1f1acffe Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 26 Feb 2024 14:57:56 +0800 Subject: [PATCH 4/4] Fix test, cleanup checks. --- .../_thirdparty/sklearn/utils/validation.py | 2 ++ python/cuml/preprocessing/LabelEncoder.py | 18 ++++------- .../tests/dask/test_dask_label_encoder.py | 17 +++++----- python/cuml/tests/test_label_encoder.py | 32 ++++++++++++------- 4 files changed, 38 insertions(+), 31 deletions(-) diff --git a/python/cuml/_thirdparty/sklearn/utils/validation.py b/python/cuml/_thirdparty/sklearn/utils/validation.py index ddb84319fa..b15837d4ea 100644 --- a/python/cuml/_thirdparty/sklearn/utils/validation.py +++ b/python/cuml/_thirdparty/sklearn/utils/validation.py @@ -225,6 +225,8 @@ def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): if not isinstance(attributes, (list, tuple)): attributes = [attributes] attrs = all_or_any([hasattr(estimator, attr) for attr in attributes]) + elif hasattr(estimator, "__sklearn_is_fitted__"): + attrs = estimator.__sklearn_is_fitted__() else: attrs = [v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")] diff --git a/python/cuml/preprocessing/LabelEncoder.py b/python/cuml/preprocessing/LabelEncoder.py index 3c4029b4bf..d1f1c7d736 100644 --- a/python/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/preprocessing/LabelEncoder.py @@ -17,6 +17,7 @@ from typing import TYPE_CHECKING from cuml import Base +from cuml._thirdparty.sklearn.utils.validation import check_is_fitted from cuml.common.exceptions import NotFittedError from cuml.internals.safe_imports import ( cpu_only_import, @@ -146,13 +147,8 @@ def __init__( self._fitted: bool = False self.handle_unknown = handle_unknown - def _check_is_fitted(self): - if self.classes_ is None: - msg = ( - "This LabelEncoder instance is not fitted yet. Call 'fit' " - "with appropriate arguments before using this estimator." - ) - raise NotFittedError(msg) + def __sklearn_is_fitted__(self) -> bool: + return self.classes_ is not None def _validate_keywords(self): if self.handle_unknown not in ("error", "ignore"): @@ -217,11 +213,9 @@ def transform(self, y) -> cudf.Series: KeyError if a category appears that was not seen in `fit` """ - y = cudf.Series(y) + check_is_fitted(self) - self._check_is_fitted() - - y = y.astype("category") + y = cudf.Series(y, dtype="category") encoded = y.cat.set_categories(self.classes_)._column.codes encoded = cudf.Series(encoded, index=y.index) @@ -263,7 +257,7 @@ def inverse_transform(self, y: cudf.Series) -> cudf.Series: Reverted labels """ # check LabelEncoder is fitted - self._check_is_fitted() + check_is_fitted(self) # check input type is cudf.Series y = cudf.Series(y) diff --git a/python/cuml/tests/dask/test_dask_label_encoder.py b/python/cuml/tests/dask/test_dask_label_encoder.py index 7228b70a85..8fd5683fa3 100644 --- a/python/cuml/tests/dask/test_dask_label_encoder.py +++ b/python/cuml/tests/dask/test_dask_label_encoder.py @@ -11,12 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from cuml.common.exceptions import NotFittedError import pytest -from cuml.internals.safe_imports import cpu_only_import + import cuml +from cuml._thirdparty.sklearn.utils.validation import check_is_fitted +from cuml.common.exceptions import NotFittedError from cuml.dask.preprocessing.LabelEncoder import LabelEncoder -from cuml.internals.safe_imports import gpu_only_import +from cuml.internals.safe_imports import cpu_only_import, gpu_only_import cudf = gpu_only_import("cudf") np = cpu_only_import("numpy") @@ -51,7 +52,7 @@ def test_labelencoder_transform(length, cardinality, client): tmp = cudf.Series(np.random.choice(cardinality, (length,))) df = dask_cudf.from_cudf(tmp, npartitions=len(client.has_what())) le = LabelEncoder().fit(df) - assert le._fitted + check_is_fitted(le) encoded = le.transform(df) @@ -69,7 +70,7 @@ def test_labelencoder_unseen(client): npartitions=len(client.has_what()), ) le = LabelEncoder().fit(df) - assert le._fitted + check_is_fitted(le) with pytest.raises(KeyError): tmp = dask_cudf.from_cudf( @@ -141,7 +142,7 @@ def test_inverse_transform( le.fit_transform(orig_label) else: le.fit(orig_label) - assert le._fitted is True + check_is_fitted(le) # test if inverse_transform is correct reverted = le.inverse_transform(ord_label) @@ -175,7 +176,7 @@ def test_empty_input(empty, ord_label, client): ord_label = dask_cudf.from_cudf(ord_label, npartitions=n_workers) le = LabelEncoder() le.fit(empty) - assert le._fitted is True + check_is_fitted(le) # test if correctly raies ValueError with pytest.raises(ValueError, match="y contains previously unseen label"): @@ -184,7 +185,7 @@ def test_empty_input(empty, ord_label, client): # check fit_transform() le = LabelEncoder() transformed = le.fit_transform(empty).compute() - assert le._fitted is True + check_is_fitted(le) assert len(transformed) == 0 diff --git a/python/cuml/tests/test_label_encoder.py b/python/cuml/tests/test_label_encoder.py index 1a134dcee7..dd0b941bfd 100644 --- a/python/cuml/tests/test_label_encoder.py +++ b/python/cuml/tests/test_label_encoder.py @@ -14,6 +14,7 @@ import pytest +from cuml._thirdparty.sklearn.utils.validation import check_is_fitted from cuml.common.exceptions import NotFittedError from cuml.internals.safe_imports import cpu_only_import, gpu_only_import from cuml.preprocessing.LabelEncoder import LabelEncoder @@ -47,7 +48,7 @@ def test_labelencoder_transform(length, cardinality): """Try fitting and then encoding a small subset of the df""" df = cudf.Series(np.random.choice(cardinality, (length,))) le = LabelEncoder().fit(df) - assert le._fitted + check_is_fitted(le) subset = df.iloc[0 : df.shape[0] // 2] encoded = le.transform(subset) @@ -63,7 +64,7 @@ def test_labelencoder_unseen(): """Try encoding a value that was not present during fitting""" df = cudf.Series(np.random.choice(10, (10,))) le = LabelEncoder().fit(df) - assert le._fitted + check_is_fitted(le) with pytest.raises(KeyError): le.transform(cudf.Series([-1])) @@ -73,7 +74,7 @@ def test_labelencoder_unfitted(): """Try calling `.transform()` without fitting first""" df = cudf.Series(np.random.choice(10, (10,))) le = LabelEncoder() - assert not le._fitted + assert not le.__sklearn_is_fitted__() with pytest.raises(NotFittedError): le.transform(df) @@ -118,7 +119,7 @@ def test_inverse_transform( le.fit_transform(orig_label) else: le.fit(orig_label) - assert le._fitted is True + check_is_fitted(le) # test if inverse_transform is correct reverted = le.inverse_transform(ord_label) @@ -133,7 +134,7 @@ def test_unfitted_inverse_transform(): """Try calling `.inverse_transform()` without fitting first""" df = cudf.Series(np.random.choice(10, (10,))) le = LabelEncoder() - assert not le._fitted + assert not le.__sklearn_is_fitted__() with pytest.raises(NotFittedError): le.transform(df) @@ -146,7 +147,7 @@ def test_empty_input(empty, ord_label): # prepare LabelEncoder le = LabelEncoder() le.fit(empty) - assert le._fitted is True + check_is_fitted(le) # test if correctly raies ValueError with pytest.raises(ValueError, match="y contains previously unseen label"): @@ -155,7 +156,7 @@ def test_empty_input(empty, ord_label): # check fit_transform() le = LabelEncoder() transformed = le.fit_transform(empty) - assert le._fitted is True + check_is_fitted(le) assert len(transformed) == 0 @@ -190,18 +191,27 @@ def _array_to_similarity_mat(x): @pytest.mark.parametrize("cardinality", [5, 10, 50]) @pytest.mark.parametrize("dtype", ["cupy", "numpy", "pd"]) def test_labelencoder_fit_transform_cupy_numpy_pd(length, cardinality, dtype): - """Try encoding the cupy array""" + """Try encoding with various types""" x = cp.random.choice(cardinality, (length,)) + # to series if dtype == "numpy": x = x.get() elif dtype == "pd": x = pd.Series(x.get()) encoded = LabelEncoder().fit_transform(x) - x_arr = _array_to_similarity_mat(x) + if dtype == "pd": + x_arr = _df_to_similarity_mat(x) + else: + x_arr = _array_to_similarity_mat(x) + encoded_arr = _array_to_similarity_mat(encoded.values) - if dtype == "numpy": + + # to array + if dtype == "numpy" or dtype == "pd": encoded_arr = encoded_arr.get() + if dtype == "pd": + x = x.to_numpy() assert ((encoded_arr == encoded_arr.T) == (x == x_arr.T)).all() @@ -232,7 +242,7 @@ def test_inverse_transform_cupy_numpy( le.fit_transform(orig_label) else: le.fit(orig_label) - assert le._fitted is True + check_is_fitted(le) # test if inverse_transform is correct reverted = le.inverse_transform(ord_label)