From ac55047ab4da66c87eb7f812d13caad41cb2981b Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Sat, 27 Jul 2024 23:27:48 -0500
Subject: [PATCH 01/10] FIX small type corrections in labelencoder and tests
 for cudf.pandas

---
 python/cuml/cuml/preprocessing/LabelEncoder.py | 5 ++++-
 python/cuml/cuml/tests/test_ordinal_encoder.py | 4 +++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/cuml/cuml/preprocessing/LabelEncoder.py b/python/cuml/cuml/preprocessing/LabelEncoder.py
index d1f1c7d736..00e62855e1 100644
--- a/python/cuml/cuml/preprocessing/LabelEncoder.py
+++ b/python/cuml/cuml/preprocessing/LabelEncoder.py
@@ -223,6 +223,7 @@ def transform(self, y) -> cudf.Series:
         if encoded.has_nulls and self.handle_unknown == "error":
             raise KeyError("Attempted to encode unseen key")
 
+        # print(type(encoded))
         return encoded
 
     def fit_transform(self, y, z=None) -> cudf.Series:
@@ -265,7 +266,9 @@ def inverse_transform(self, y: cudf.Series) -> cudf.Series:
         ord_label = y.unique()
         category_num = len(self.classes_)
         if self.handle_unknown == "error":
-            for ordi in ord_label.values_host:
+            if not isinstance(ord_label, (cp.ndarray, np.ndarray)):
+                ord_label = ord_label.values_host
+            for ordi in ord_label:
                 if ordi < 0 or ordi >= category_num:
                     raise ValueError(
                         "y contains previously unseen label {}".format(ordi)
diff --git a/python/cuml/cuml/tests/test_ordinal_encoder.py b/python/cuml/cuml/tests/test_ordinal_encoder.py
index c9379a43be..2fc80d5d76 100644
--- a/python/cuml/cuml/tests/test_ordinal_encoder.py
+++ b/python/cuml/cuml/tests/test_ordinal_encoder.py
@@ -20,6 +20,7 @@
 from cuml.internals.safe_imports import gpu_only_import_from
 from cuml.preprocessing import OrdinalEncoder
 
+cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")
 DataFrame = gpu_only_import_from("cudf", "DataFrame")
 
 
@@ -97,7 +98,8 @@ def test_output_type(test_sample) -> None:
     enc = OrdinalEncoder(output_type="cudf").fit(X)
     assert isinstance(enc.transform(X), DataFrame)
     enc = OrdinalEncoder(output_type="pandas").fit(X)
-    assert isinstance(enc.transform(X), pd.DataFrame)
+    if not cudf_pandas_active:
+        assert isinstance(enc.transform(X), pd.DataFrame)
     enc = OrdinalEncoder(output_type="numpy").fit(X)
     assert isinstance(enc.transform(X), np.ndarray)
     # output_type == "input"

From 1a7bee575a0d2daba02020f094126e296e8ca182 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Sat, 27 Jul 2024 23:46:46 -0500
Subject: [PATCH 02/10] FIX small column transformer fix

---
 .../_thirdparty/sklearn/preprocessing/_column_transformer.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py b/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py
index d928f9b218..a83f3d4bfa 100644
--- a/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py
+++ b/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py
@@ -274,7 +274,7 @@ def _pandas_indexing(X, key, key_dtype, axis):
     if hasattr(key, 'shape'):
         # Work-around for indexing with read-only key in pandas
         # FIXME: solved in pandas 0.25
-        key = np.asarray(key)
+        key = key.to_numpy()
         key = key if key.flags.writeable else key.copy()
     elif isinstance(key, tuple):
         key = list(key)

From cd393ab9069ee058d5ef16095d2a8bf77ebfd851 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Sun, 28 Jul 2024 00:09:53 -0500
Subject: [PATCH 03/10] FIX style fixes

---
 python/cuml/cuml/tests/test_ordinal_encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/cuml/tests/test_ordinal_encoder.py b/python/cuml/cuml/tests/test_ordinal_encoder.py
index 2fc80d5d76..6b7db9b837 100644
--- a/python/cuml/cuml/tests/test_ordinal_encoder.py
+++ b/python/cuml/cuml/tests/test_ordinal_encoder.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 88e4cc7dd845e9f4baa725b6228fc8733aba112d Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Sun, 28 Jul 2024 00:16:39 -0500
Subject: [PATCH 04/10] FIX skip thirdparty/sklearn on copyright checks

---
 .pre-commit-config.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 67ef2d6ad3..cbf1ee1aa3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -73,7 +73,8 @@ repos:
                 setup[.]cfg$
           exclude: |
             (?x)
-                cpp/src/tsne/cannylab/bh[.]cu$
+                cpp/src/tsne/cannylab/bh[.]cu$|
+                python/cuml/cuml/_thirdparty/sklearn/
         - id: verify-alpha-spec
     - repo: https://github.com/rapidsai/dependency-file-generator
       rev: v1.13.11

From bf1ec2f7308127f902bbeda4be660d10e98db4f6 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Sun, 28 Jul 2024 01:20:51 -0500
Subject: [PATCH 05/10] FIX small test fixes

---
 python/cuml/cuml/testing/utils.py            | 17 ++++++++++++++---
 python/cuml/cuml/tests/test_module_config.py |  8 +++++---
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/python/cuml/cuml/testing/utils.py b/python/cuml/cuml/testing/utils.py
index 41de2e1f37..af85b23a35 100644
--- a/python/cuml/cuml/testing/utils.py
+++ b/python/cuml/cuml/testing/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -38,6 +38,7 @@
 pd = cpu_only_import("pandas")
 
 cuda = gpu_only_import_from("numba", "cuda")
+cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")
 
 
 cudf = gpu_only_import("cudf")
@@ -599,7 +600,10 @@ def generate_inputs_from_categories(
         inp_ary = cp.array(ary)
         return inp_ary, ary
     else:
-        df = cudf.DataFrame.from_pandas(pandas_df)
+        if cudf_pandas_active:
+            df = pandas_df
+        else:
+            df = cudf.DataFrame.from_pandas(pandas_df)
         return df, ary
 
 
@@ -607,7 +611,14 @@ def assert_inverse_equal(ours, ref):
     if isinstance(ours, cp.ndarray):
         cp.testing.assert_array_equal(ours, ref)
     else:
-        pd.testing.assert_frame_equal(ours.to_pandas(), ref.to_pandas())
+        if cudf_pandas_active:
+            if hasattr(ours, "to_pandas"):
+                ours = ours.to_pandas()
+            if hasattr(ref, "to_pandas"):
+                ref = ref.to_pandas()
+            pd.testing.assert_frame_equal(ours, ref)
+        else:
+            pd.testing.assert_frame_equal(ours.to_pandas(), ref.to_pandas())
 
 
 def from_df_to_numpy(df):
diff --git a/python/cuml/cuml/tests/test_module_config.py b/python/cuml/cuml/tests/test_module_config.py
index 4d595a9e55..124ca9c380 100644
--- a/python/cuml/cuml/tests/test_module_config.py
+++ b/python/cuml/cuml/tests/test_module_config.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,13 +19,15 @@
 import cuml
 import pytest
 
-from cuml.internals.safe_imports import gpu_only_import
+from cuml.internals.safe_imports import gpu_only_import, gpu_only_import_from
 
 cudf = gpu_only_import("cudf")
 cp = gpu_only_import("cupy")
 np = cpu_only_import("numpy")
 pd = cpu_only_import("pandas")
 
+cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")
+
 
 ###############################################################################
 #                                    Parameters                               #
@@ -71,7 +73,7 @@ def test_default_global_output_type(input_type):
 
     if input_type == "numba":
         assert is_cuda_array(res)
-    else:
+    elif not (input_type == "pandas" and cudf_pandas_active):
         assert isinstance(res, test_output_types[input_type])
 
 

From aa4a8c7d562eca3790d68e9b5219abe1e3bd780e Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Sun, 28 Jul 2024 02:03:40 -0500
Subject: [PATCH 06/10] FIX skip rf tests that use sklearn when cudf.pandas is
 active

---
 python/cuml/cuml/tests/test_random_forest.py | 41 ++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/python/cuml/cuml/tests/test_random_forest.py b/python/cuml/cuml/tests/test_random_forest.py
index 640c22fd67..653d1a7cf5 100644
--- a/python/cuml/cuml/tests/test_random_forest.py
+++ b/python/cuml/cuml/tests/test_random_forest.py
@@ -54,6 +54,7 @@
 np = cpu_only_import("numpy")
 
 cuda = gpu_only_import_from("numba", "cuda")
+cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")
 
 
 pytestmark = pytest.mark.filterwarnings(
@@ -276,6 +277,11 @@ def test_tweedie_convergence(max_depth, split_criterion):
 )
 @pytest.mark.parametrize("datatype", [np.float32, np.float64])
 @pytest.mark.parametrize("max_features", [1.0, "log2", "sqrt"])
+@pytest.mark.skipif(
+    cudf_pandas_active,
+    reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
+    "Issue: https://github.com/rapidsai/cuml/issues/5991",
+)
 def test_rf_classification(small_clf, datatype, max_samples, max_features):
     use_handle = True
 
@@ -405,6 +411,11 @@ def test_rf_classification_unorder(
         (1.0, 32),
     ],
 )
+@pytest.mark.skipif(
+    cudf_pandas_active,
+    reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
+    "Issue: https://github.com/rapidsai/cuml/issues/5991",
+)
 def test_rf_regression(
     special_reg, datatype, max_features, max_samples, n_bins
 ):
@@ -510,6 +521,11 @@ def test_rf_classification_seed(small_clf, datatype):
 )
 @pytest.mark.parametrize("convert_dtype", [True, False])
 @pytest.mark.filterwarnings("ignore:To use pickling(.*)::cuml[.*]")
+@pytest.mark.skipif(
+    cudf_pandas_active,
+    reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
+    "Issue: https://github.com/rapidsai/cuml/issues/5991",
+)
 def test_rf_classification_float64(small_clf, datatype, convert_dtype):
 
     X, y = small_clf
@@ -552,6 +568,11 @@ def test_rf_classification_float64(small_clf, datatype, convert_dtype):
     "datatype", [(np.float64, np.float32), (np.float32, np.float64)]
 )
 @pytest.mark.filterwarnings("ignore:To use pickling(.*)::cuml[.*]")
+@pytest.mark.skipif(
+    cudf_pandas_active,
+    reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
+    "Issue: https://github.com/rapidsai/cuml/issues/5991",
+)
 def test_rf_regression_float64(large_reg, datatype):
 
     X, y = large_reg
@@ -675,6 +696,11 @@ def rf_classification(
 
 @pytest.mark.parametrize("datatype", [(np.float32, np.float64)])
 @pytest.mark.parametrize("array_type", ["dataframe", "numpy"])
+@pytest.mark.skipif(
+    cudf_pandas_active,
+    reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
+    "Issue: https://github.com/rapidsai/cuml/issues/5991",
+)
 def test_rf_classification_multi_class(mclass_clf, datatype, array_type):
     rf_classification(datatype, array_type, 1.0, 1.0, mclass_clf)
 
@@ -682,6 +708,11 @@ def test_rf_classification_multi_class(mclass_clf, datatype, array_type):
 @pytest.mark.parametrize("datatype", [(np.float32, np.float64)])
 @pytest.mark.parametrize("max_samples", [unit_param(1.0), stress_param(0.95)])
 @pytest.mark.parametrize("max_features", [1.0, "log2", "sqrt"])
+@pytest.mark.skipif(
+    cudf_pandas_active,
+    reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
+    "Issue: https://github.com/rapidsai/cuml/issues/5991",
+)
 def test_rf_classification_proba(
     small_clf, datatype, max_samples, max_features
 ):
@@ -695,6 +726,11 @@ def test_rf_classification_proba(
 @pytest.mark.parametrize(
     "algo", ["auto", "naive", "tree_reorg", "batch_tree_reorg"]
 )
+@pytest.mark.skipif(
+    cudf_pandas_active,
+    reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
+    "Issue: https://github.com/rapidsai/cuml/issues/5991",
+)
 def test_rf_classification_sparse(
     small_clf, datatype, fil_sparse_format, algo
 ):
@@ -783,6 +819,11 @@ def test_rf_classification_sparse(
 @pytest.mark.parametrize(
     "algo", ["auto", "naive", "tree_reorg", "batch_tree_reorg"]
 )
+@pytest.mark.skipif(
+    cudf_pandas_active,
+    reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
+    "Issue: https://github.com/rapidsai/cuml/issues/5991",
+)
 def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
     use_handle = True
     num_treees = 50

From ea21773414c8b652040ca8d9ba1056d862bf0f26 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Sun, 28 Jul 2024 03:02:17 -0500
Subject: [PATCH 07/10] FIX few more small fixes

---
 python/cuml/cuml/preprocessing/TargetEncoder.py   | 9 ++++++---
 python/cuml/cuml/tests/explainer/test_sampling.py | 3 ++-
 python/cuml/cuml/tests/test_svm.py                | 8 +++++++-
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/python/cuml/cuml/preprocessing/TargetEncoder.py b/python/cuml/cuml/preprocessing/TargetEncoder.py
index 3b4a93ab7e..43162b8805 100644
--- a/python/cuml/cuml/preprocessing/TargetEncoder.py
+++ b/python/cuml/cuml/preprocessing/TargetEncoder.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -315,8 +315,11 @@ def _rename_col(df, col):
             return df.reset_index()
 
         res = []
-        for f in train[self.fold_col].unique().values_host:
-            mask = train[self.fold_col] == f
+        unq_vals = train[self.fold_col].unique()
+        if not isinstance(unq_vals, (cp.ndarray, np.ndarray)):
+            unq_vals = unq_vals.values_host
+        for f in unq_vals:
+            mask = train[self.fold_col].values == f
             dg = train.loc[~mask].groupby(x_cols).agg({self.y_col: self.stat})
             dg = _rename_col(dg, self.out_col)
             res.append(train.loc[mask].merge(dg, on=x_cols, how="left"))
diff --git a/python/cuml/cuml/tests/explainer/test_sampling.py b/python/cuml/cuml/tests/explainer/test_sampling.py
index c2cedb1ee0..93d436a459 100644
--- a/python/cuml/cuml/tests/explainer/test_sampling.py
+++ b/python/cuml/cuml/tests/explainer/test_sampling.py
@@ -23,6 +23,7 @@
 np = cpu_only_import("numpy")
 pd = cpu_only_import("pandas")
 cuda = gpu_only_import_from("numba", "cuda")
+cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")
 
 
 @pytest.mark.parametrize(
@@ -64,7 +65,7 @@ def test_kmeans_input(input_type):
     elif input_type == "cudf-series":
         cp.testing.assert_array_equal(summary[0].values.tolist(), [23.0, 52.0])
         assert isinstance(summary[0], cudf.Series)
-    elif input_type == "pandas-series":
+    elif input_type == "pandas-series" and not cudf_pandas_active:
         cp.testing.assert_array_equal(
             summary[0].to_numpy().flatten(), [23.0, 52.0]
         )
diff --git a/python/cuml/cuml/tests/test_svm.py b/python/cuml/cuml/tests/test_svm.py
index 5ae8895be1..c2e4501cd3 100644
--- a/python/cuml/cuml/tests/test_svm.py
+++ b/python/cuml/cuml/tests/test_svm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -44,6 +44,8 @@
 cudf = gpu_only_import("cudf")
 scipy_sparse = cpu_only_import("scipy.sparse")
 
+cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")
+
 IS_ARM = platform.processor() == "aarch64"
 
 
@@ -666,6 +668,10 @@ def test_svm_predict_convert_dtype(train_dtype, test_dtype, classifier):
     reason="Test fails unexpectedly on ARM. "
     "github.com/rapidsai/cuml/issues/5100",
 )
+@pytest.mark.skipif(
+    cudf_pandas_active,
+    reason="cudf.pandas causes small numeric issues in this test only ",
+)
 def test_svm_no_support_vectors():
     n_rows = 10
     n_cols = 3

From 50895476b5db804146dd677c349d9c16cc01f9c6 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Sun, 28 Jul 2024 03:24:28 -0500
Subject: [PATCH 08/10] FIX copyright

---
 python/cuml/cuml/tests/explainer/test_sampling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/cuml/tests/explainer/test_sampling.py b/python/cuml/cuml/tests/explainer/test_sampling.py
index 93d436a459..f4843ca7fe 100644
--- a/python/cuml/cuml/tests/explainer/test_sampling.py
+++ b/python/cuml/cuml/tests/explainer/test_sampling.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 317f23443bc1d3c9721671a3a847beeab6f3cd5a Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Sun, 28 Jul 2024 17:02:57 -0500
Subject: [PATCH 09/10] FIX remove commented line of code

---
 python/cuml/cuml/preprocessing/LabelEncoder.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cuml/cuml/preprocessing/LabelEncoder.py b/python/cuml/cuml/preprocessing/LabelEncoder.py
index 00e62855e1..152650bbed 100644
--- a/python/cuml/cuml/preprocessing/LabelEncoder.py
+++ b/python/cuml/cuml/preprocessing/LabelEncoder.py
@@ -223,7 +223,6 @@ def transform(self, y) -> cudf.Series:
         if encoded.has_nulls and self.handle_unknown == "error":
             raise KeyError("Attempted to encode unseen key")
 
-        # print(type(encoded))
         return encoded
 
     def fit_transform(self, y, z=None) -> cudf.Series:

From 70f1f2c6ba0789809d6e34ced0417a89ee53df45 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <dante.gamadessavre@gmail.com>
Date: Mon, 29 Jul 2024 12:32:57 -0500
Subject: [PATCH 10/10] Update python/cuml/cuml/testing/utils.py

Co-authored-by: Divye Gala <divyegala@gmail.com>
---
 python/cuml/cuml/testing/utils.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/python/cuml/cuml/testing/utils.py b/python/cuml/cuml/testing/utils.py
index af85b23a35..e7a5ce76d9 100644
--- a/python/cuml/cuml/testing/utils.py
+++ b/python/cuml/cuml/testing/utils.py
@@ -611,14 +611,11 @@ def assert_inverse_equal(ours, ref):
     if isinstance(ours, cp.ndarray):
         cp.testing.assert_array_equal(ours, ref)
     else:
-        if cudf_pandas_active:
-            if hasattr(ours, "to_pandas"):
-                ours = ours.to_pandas()
-            if hasattr(ref, "to_pandas"):
-                ref = ref.to_pandas()
-            pd.testing.assert_frame_equal(ours, ref)
-        else:
-            pd.testing.assert_frame_equal(ours.to_pandas(), ref.to_pandas())
+        if hasattr(ours, "to_pandas"):
+            ours = ours.to_pandas()
+        if hasattr(ref, "to_pandas"):
+            ref = ref.to_pandas()
+        pd.testing.assert_frame_equal(ours, ref)
 
 
 def from_df_to_numpy(df):