From ac55047ab4da66c87eb7f812d13caad41cb2981b Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 27 Jul 2024 23:27:48 -0500 Subject: [PATCH 01/10] FIX small type corrections in labelencoder and tests for cudf.pandas --- python/cuml/cuml/preprocessing/LabelEncoder.py | 5 ++++- python/cuml/cuml/tests/test_ordinal_encoder.py | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cuml/cuml/preprocessing/LabelEncoder.py b/python/cuml/cuml/preprocessing/LabelEncoder.py index d1f1c7d736..00e62855e1 100644 --- a/python/cuml/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/cuml/preprocessing/LabelEncoder.py @@ -223,6 +223,7 @@ def transform(self, y) -> cudf.Series: if encoded.has_nulls and self.handle_unknown == "error": raise KeyError("Attempted to encode unseen key") + # print(type(encoded)) return encoded def fit_transform(self, y, z=None) -> cudf.Series: @@ -265,7 +266,9 @@ def inverse_transform(self, y: cudf.Series) -> cudf.Series: ord_label = y.unique() category_num = len(self.classes_) if self.handle_unknown == "error": - for ordi in ord_label.values_host: + if not isinstance(ord_label, (cp.ndarray, np.ndarray)): + ord_label = ord_label.values_host + for ordi in ord_label: if ordi < 0 or ordi >= category_num: raise ValueError( "y contains previously unseen label {}".format(ordi) diff --git a/python/cuml/cuml/tests/test_ordinal_encoder.py b/python/cuml/cuml/tests/test_ordinal_encoder.py index c9379a43be..2fc80d5d76 100644 --- a/python/cuml/cuml/tests/test_ordinal_encoder.py +++ b/python/cuml/cuml/tests/test_ordinal_encoder.py @@ -20,6 +20,7 @@ from cuml.internals.safe_imports import gpu_only_import_from from cuml.preprocessing import OrdinalEncoder +cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED") DataFrame = gpu_only_import_from("cudf", "DataFrame") @@ -97,7 +98,8 @@ def test_output_type(test_sample) -> None: enc = OrdinalEncoder(output_type="cudf").fit(X) assert isinstance(enc.transform(X), DataFrame) enc = OrdinalEncoder(output_type="pandas").fit(X) - assert isinstance(enc.transform(X), pd.DataFrame) + if not cudf_pandas_active: + assert isinstance(enc.transform(X), pd.DataFrame) enc = OrdinalEncoder(output_type="numpy").fit(X) assert isinstance(enc.transform(X), np.ndarray) # output_type == "input" From 1a7bee575a0d2daba02020f094126e296e8ca182 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 27 Jul 2024 23:46:46 -0500 Subject: [PATCH 02/10] FIX small column transformer fix --- .../_thirdparty/sklearn/preprocessing/_column_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py b/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py index d928f9b218..a83f3d4bfa 100644 --- a/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py +++ b/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_column_transformer.py @@ -274,7 +274,7 @@ def _pandas_indexing(X, key, key_dtype, axis): if hasattr(key, 'shape'): # Work-around for indexing with read-only key in pandas # FIXME: solved in pandas 0.25 - key = np.asarray(key) + key = key.to_numpy() key = key if key.flags.writeable else key.copy() elif isinstance(key, tuple): key = list(key) From cd393ab9069ee058d5ef16095d2a8bf77ebfd851 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 28 Jul 2024 00:09:53 -0500 Subject: [PATCH 03/10] FIX style fixes --- python/cuml/cuml/tests/test_ordinal_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/cuml/tests/test_ordinal_encoder.py b/python/cuml/cuml/tests/test_ordinal_encoder.py index 2fc80d5d76..6b7db9b837 100644 --- a/python/cuml/cuml/tests/test_ordinal_encoder.py +++ b/python/cuml/cuml/tests/test_ordinal_encoder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 88e4cc7dd845e9f4baa725b6228fc8733aba112d Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 28 Jul 2024 00:16:39 -0500 Subject: [PATCH 04/10] FIX skip thirdparty/sklearn on copyright checks --- .pre-commit-config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 67ef2d6ad3..cbf1ee1aa3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -73,7 +73,8 @@ repos: setup[.]cfg$ exclude: | (?x) - cpp/src/tsne/cannylab/bh[.]cu$ + cpp/src/tsne/cannylab/bh[.]cu$| + python/cuml/cuml/_thirdparty/sklearn/ - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator rev: v1.13.11 From bf1ec2f7308127f902bbeda4be660d10e98db4f6 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 28 Jul 2024 01:20:51 -0500 Subject: [PATCH 05/10] FIX small test fixes --- python/cuml/cuml/testing/utils.py | 17 ++++++++++++++--- python/cuml/cuml/tests/test_module_config.py | 8 +++++--- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/python/cuml/cuml/testing/utils.py b/python/cuml/cuml/testing/utils.py index 41de2e1f37..af85b23a35 100644 --- a/python/cuml/cuml/testing/utils.py +++ b/python/cuml/cuml/testing/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -38,6 +38,7 @@ pd = cpu_only_import("pandas") cuda = gpu_only_import_from("numba", "cuda") +cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED") cudf = gpu_only_import("cudf") @@ -599,7 +600,10 @@ def generate_inputs_from_categories( inp_ary = cp.array(ary) return inp_ary, ary else: - df = cudf.DataFrame.from_pandas(pandas_df) + if cudf_pandas_active: + df = pandas_df + else: + df = cudf.DataFrame.from_pandas(pandas_df) return df, ary @@ -607,7 +611,14 @@ def assert_inverse_equal(ours, ref): if isinstance(ours, cp.ndarray): cp.testing.assert_array_equal(ours, ref) else: - pd.testing.assert_frame_equal(ours.to_pandas(), ref.to_pandas()) + if cudf_pandas_active: + if hasattr(ours, "to_pandas"): + ours = ours.to_pandas() + if hasattr(ref, "to_pandas"): + ref = ref.to_pandas() + pd.testing.assert_frame_equal(ours, ref) + else: + pd.testing.assert_frame_equal(ours.to_pandas(), ref.to_pandas()) def from_df_to_numpy(df): diff --git a/python/cuml/cuml/tests/test_module_config.py b/python/cuml/cuml/tests/test_module_config.py index 4d595a9e55..124ca9c380 100644 --- a/python/cuml/cuml/tests/test_module_config.py +++ b/python/cuml/cuml/tests/test_module_config.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,13 +19,15 @@ import cuml import pytest -from cuml.internals.safe_imports import gpu_only_import +from cuml.internals.safe_imports import gpu_only_import, gpu_only_import_from cudf = gpu_only_import("cudf") cp = gpu_only_import("cupy") np = cpu_only_import("numpy") pd = cpu_only_import("pandas") +cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED") + ############################################################################### # Parameters # @@ -71,7 +73,7 @@ def test_default_global_output_type(input_type): if input_type == "numba": assert is_cuda_array(res) - else: + elif not (input_type == "pandas" and cudf_pandas_active): assert isinstance(res, test_output_types[input_type]) From aa4a8c7d562eca3790d68e9b5219abe1e3bd780e Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 28 Jul 2024 02:03:40 -0500 Subject: [PATCH 06/10] FIX skip rf tests that use sklearn when cudf.pandas is active --- python/cuml/cuml/tests/test_random_forest.py | 41 ++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/python/cuml/cuml/tests/test_random_forest.py b/python/cuml/cuml/tests/test_random_forest.py index 640c22fd67..653d1a7cf5 100644 --- a/python/cuml/cuml/tests/test_random_forest.py +++ b/python/cuml/cuml/tests/test_random_forest.py @@ -54,6 +54,7 @@ np = cpu_only_import("numpy") cuda = gpu_only_import_from("numba", "cuda") +cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED") pytestmark = pytest.mark.filterwarnings( @@ -276,6 +277,11 @@ def test_tweedie_convergence(max_depth, split_criterion): ) @pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize("max_features", [1.0, "log2", "sqrt"]) +@pytest.mark.skipif( + cudf_pandas_active, + reason="cudf.pandas causes sklearn RF estimators crashes sometimes. " + "Issue: https://github.com/rapidsai/cuml/issues/5991", +) def test_rf_classification(small_clf, datatype, max_samples, max_features): use_handle = True @@ -405,6 +411,11 @@ def test_rf_classification_unorder( (1.0, 32), ], ) +@pytest.mark.skipif( + cudf_pandas_active, + reason="cudf.pandas causes sklearn RF estimators crashes sometimes. " + "Issue: https://github.com/rapidsai/cuml/issues/5991", +) def test_rf_regression( special_reg, datatype, max_features, max_samples, n_bins ): @@ -510,6 +521,11 @@ def test_rf_classification_seed(small_clf, datatype): ) @pytest.mark.parametrize("convert_dtype", [True, False]) @pytest.mark.filterwarnings("ignore:To use pickling(.*)::cuml[.*]") +@pytest.mark.skipif( + cudf_pandas_active, + reason="cudf.pandas causes sklearn RF estimators crashes sometimes. " + "Issue: https://github.com/rapidsai/cuml/issues/5991", +) def test_rf_classification_float64(small_clf, datatype, convert_dtype): X, y = small_clf @@ -552,6 +568,11 @@ def test_rf_classification_float64(small_clf, datatype, convert_dtype): "datatype", [(np.float64, np.float32), (np.float32, np.float64)] ) @pytest.mark.filterwarnings("ignore:To use pickling(.*)::cuml[.*]") +@pytest.mark.skipif( + cudf_pandas_active, + reason="cudf.pandas causes sklearn RF estimators crashes sometimes. " + "Issue: https://github.com/rapidsai/cuml/issues/5991", +) def test_rf_regression_float64(large_reg, datatype): X, y = large_reg @@ -675,6 +696,11 @@ def rf_classification( @pytest.mark.parametrize("datatype", [(np.float32, np.float64)]) @pytest.mark.parametrize("array_type", ["dataframe", "numpy"]) +@pytest.mark.skipif( + cudf_pandas_active, + reason="cudf.pandas causes sklearn RF estimators crashes sometimes. " + "Issue: https://github.com/rapidsai/cuml/issues/5991", +) def test_rf_classification_multi_class(mclass_clf, datatype, array_type): rf_classification(datatype, array_type, 1.0, 1.0, mclass_clf) @@ -682,6 +708,11 @@ def test_rf_classification_multi_class(mclass_clf, datatype, array_type): @pytest.mark.parametrize("datatype", [(np.float32, np.float64)]) @pytest.mark.parametrize("max_samples", [unit_param(1.0), stress_param(0.95)]) @pytest.mark.parametrize("max_features", [1.0, "log2", "sqrt"]) +@pytest.mark.skipif( + cudf_pandas_active, + reason="cudf.pandas causes sklearn RF estimators crashes sometimes. " + "Issue: https://github.com/rapidsai/cuml/issues/5991", +) def test_rf_classification_proba( small_clf, datatype, max_samples, max_features ): @@ -695,6 +726,11 @@ def test_rf_classification_proba( @pytest.mark.parametrize( "algo", ["auto", "naive", "tree_reorg", "batch_tree_reorg"] ) +@pytest.mark.skipif( + cudf_pandas_active, + reason="cudf.pandas causes sklearn RF estimators crashes sometimes. " + "Issue: https://github.com/rapidsai/cuml/issues/5991", +) def test_rf_classification_sparse( small_clf, datatype, fil_sparse_format, algo ): @@ -783,6 +819,11 @@ def test_rf_classification_sparse( @pytest.mark.parametrize( "algo", ["auto", "naive", "tree_reorg", "batch_tree_reorg"] ) +@pytest.mark.skipif( + cudf_pandas_active, + reason="cudf.pandas causes sklearn RF estimators crashes sometimes. " + "Issue: https://github.com/rapidsai/cuml/issues/5991", +) def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo): use_handle = True num_treees = 50 From ea21773414c8b652040ca8d9ba1056d862bf0f26 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 28 Jul 2024 03:02:17 -0500 Subject: [PATCH 07/10] FIX few more small fixes --- python/cuml/cuml/preprocessing/TargetEncoder.py | 9 ++++++--- python/cuml/cuml/tests/explainer/test_sampling.py | 3 ++- python/cuml/cuml/tests/test_svm.py | 8 +++++++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/python/cuml/cuml/preprocessing/TargetEncoder.py b/python/cuml/cuml/preprocessing/TargetEncoder.py index 3b4a93ab7e..43162b8805 100644 --- a/python/cuml/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/cuml/preprocessing/TargetEncoder.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -315,8 +315,11 @@ def _rename_col(df, col): return df.reset_index() res = [] - for f in train[self.fold_col].unique().values_host: - mask = train[self.fold_col] == f + unq_vals = train[self.fold_col].unique() + if not isinstance(unq_vals, (cp.ndarray, np.ndarray)): + unq_vals = unq_vals.values_host + for f in unq_vals: + mask = train[self.fold_col].values == f dg = train.loc[~mask].groupby(x_cols).agg({self.y_col: self.stat}) dg = _rename_col(dg, self.out_col) res.append(train.loc[mask].merge(dg, on=x_cols, how="left")) diff --git a/python/cuml/cuml/tests/explainer/test_sampling.py b/python/cuml/cuml/tests/explainer/test_sampling.py index c2cedb1ee0..93d436a459 100644 --- a/python/cuml/cuml/tests/explainer/test_sampling.py +++ b/python/cuml/cuml/tests/explainer/test_sampling.py @@ -23,6 +23,7 @@ np = cpu_only_import("numpy") pd = cpu_only_import("pandas") cuda = gpu_only_import_from("numba", "cuda") +cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED") @pytest.mark.parametrize( @@ -64,7 +65,7 @@ def test_kmeans_input(input_type): elif input_type == "cudf-series": cp.testing.assert_array_equal(summary[0].values.tolist(), [23.0, 52.0]) assert isinstance(summary[0], cudf.Series) - elif input_type == "pandas-series": + elif input_type == "pandas-series" and not cudf_pandas_active: cp.testing.assert_array_equal( summary[0].to_numpy().flatten(), [23.0, 52.0] ) diff --git a/python/cuml/cuml/tests/test_svm.py b/python/cuml/cuml/tests/test_svm.py index 5ae8895be1..c2e4501cd3 100644 --- a/python/cuml/cuml/tests/test_svm.py +++ b/python/cuml/cuml/tests/test_svm.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -44,6 +44,8 @@ cudf = gpu_only_import("cudf") scipy_sparse = cpu_only_import("scipy.sparse") +cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED") + IS_ARM = platform.processor() == "aarch64" @@ -666,6 +668,10 @@ def test_svm_predict_convert_dtype(train_dtype, test_dtype, classifier): reason="Test fails unexpectedly on ARM. " "github.com/rapidsai/cuml/issues/5100", ) +@pytest.mark.skipif( + cudf_pandas_active, + reason="cudf.pandas causes small numeric issues in this test only ", +) def test_svm_no_support_vectors(): n_rows = 10 n_cols = 3 From 50895476b5db804146dd677c349d9c16cc01f9c6 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 28 Jul 2024 03:24:28 -0500 Subject: [PATCH 08/10] FIX copyright --- python/cuml/cuml/tests/explainer/test_sampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/cuml/tests/explainer/test_sampling.py b/python/cuml/cuml/tests/explainer/test_sampling.py index 93d436a459..f4843ca7fe 100644 --- a/python/cuml/cuml/tests/explainer/test_sampling.py +++ b/python/cuml/cuml/tests/explainer/test_sampling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 317f23443bc1d3c9721671a3a847beeab6f3cd5a Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sun, 28 Jul 2024 17:02:57 -0500 Subject: [PATCH 09/10] FIX remove commented line of code --- python/cuml/cuml/preprocessing/LabelEncoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cuml/cuml/preprocessing/LabelEncoder.py b/python/cuml/cuml/preprocessing/LabelEncoder.py index 00e62855e1..152650bbed 100644 --- a/python/cuml/cuml/preprocessing/LabelEncoder.py +++ b/python/cuml/cuml/preprocessing/LabelEncoder.py @@ -223,7 +223,6 @@ def transform(self, y) -> cudf.Series: if encoded.has_nulls and self.handle_unknown == "error": raise KeyError("Attempted to encode unseen key") - # print(type(encoded)) return encoded def fit_transform(self, y, z=None) -> cudf.Series: From 70f1f2c6ba0789809d6e34ced0417a89ee53df45 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 29 Jul 2024 12:32:57 -0500 Subject: [PATCH 10/10] Update python/cuml/cuml/testing/utils.py Co-authored-by: Divye Gala --- python/cuml/cuml/testing/utils.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/python/cuml/cuml/testing/utils.py b/python/cuml/cuml/testing/utils.py index af85b23a35..e7a5ce76d9 100644 --- a/python/cuml/cuml/testing/utils.py +++ b/python/cuml/cuml/testing/utils.py @@ -611,14 +611,11 @@ def assert_inverse_equal(ours, ref): if isinstance(ours, cp.ndarray): cp.testing.assert_array_equal(ours, ref) else: - if cudf_pandas_active: - if hasattr(ours, "to_pandas"): - ours = ours.to_pandas() - if hasattr(ref, "to_pandas"): - ref = ref.to_pandas() - pd.testing.assert_frame_equal(ours, ref) - else: - pd.testing.assert_frame_equal(ours.to_pandas(), ref.to_pandas()) + if hasattr(ours, "to_pandas"): + ours = ours.to_pandas() + if hasattr(ref, "to_pandas"): + ref = ref.to_pandas() + pd.testing.assert_frame_equal(ours, ref) def from_df_to_numpy(df):