Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes for encoders/transformers for cudf.pandas #5990

Merged
merged 11 commits on Jul 29, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def _pandas_indexing(X, key, key_dtype, axis):
if hasattr(key, 'shape'):
# Work-around for indexing with read-only key in pandas
# FIXME: solved in pandas 0.25
key = np.asarray(key)
key = key.to_numpy()
key = key if key.flags.writeable else key.copy()
elif isinstance(key, tuple):
key = list(key)
Expand Down
4 changes: 3 additions & 1 deletion python/cuml/cuml/preprocessing/LabelEncoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,9 @@ def inverse_transform(self, y: cudf.Series) -> cudf.Series:
ord_label = y.unique()
category_num = len(self.classes_)
if self.handle_unknown == "error":
for ordi in ord_label.values_host:
if not isinstance(ord_label, (cp.ndarray, np.ndarray)):
ord_label = ord_label.values_host
for ordi in ord_label:
if ordi < 0 or ordi >= category_num:
raise ValueError(
"y contains previously unseen label {}".format(ordi)
Expand Down
9 changes: 6 additions & 3 deletions python/cuml/cuml/preprocessing/TargetEncoder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -315,8 +315,11 @@ def _rename_col(df, col):
return df.reset_index()

res = []
for f in train[self.fold_col].unique().values_host:
mask = train[self.fold_col] == f
unq_vals = train[self.fold_col].unique()
if not isinstance(unq_vals, (cp.ndarray, np.ndarray)):
unq_vals = unq_vals.values_host
for f in unq_vals:
mask = train[self.fold_col].values == f
dg = train.loc[~mask].groupby(x_cols).agg({self.y_col: self.stat})
dg = _rename_col(dg, self.out_col)
res.append(train.loc[mask].merge(dg, on=x_cols, how="left"))
Expand Down
17 changes: 14 additions & 3 deletions python/cuml/cuml/testing/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -38,6 +38,7 @@
pd = cpu_only_import("pandas")

cuda = gpu_only_import_from("numba", "cuda")
cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")


cudf = gpu_only_import("cudf")
Expand Down Expand Up @@ -599,15 +600,25 @@ def generate_inputs_from_categories(
inp_ary = cp.array(ary)
return inp_ary, ary
else:
df = cudf.DataFrame.from_pandas(pandas_df)
if cudf_pandas_active:
df = pandas_df
else:
df = cudf.DataFrame.from_pandas(pandas_df)
return df, ary


def assert_inverse_equal(ours, ref):
if isinstance(ours, cp.ndarray):
cp.testing.assert_array_equal(ours, ref)
else:
pd.testing.assert_frame_equal(ours.to_pandas(), ref.to_pandas())
if cudf_pandas_active:
if hasattr(ours, "to_pandas"):
ours = ours.to_pandas()
if hasattr(ref, "to_pandas"):
ref = ref.to_pandas()
pd.testing.assert_frame_equal(ours, ref)
else:
pd.testing.assert_frame_equal(ours.to_pandas(), ref.to_pandas())

dantegd marked this conversation as resolved.
Show resolved Hide resolved

def from_df_to_numpy(df):
Expand Down
5 changes: 3 additions & 2 deletions python/cuml/cuml/tests/explainer/test_sampling.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -23,6 +23,7 @@
np = cpu_only_import("numpy")
pd = cpu_only_import("pandas")
cuda = gpu_only_import_from("numba", "cuda")
cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")


@pytest.mark.parametrize(
Expand Down Expand Up @@ -64,7 +65,7 @@ def test_kmeans_input(input_type):
elif input_type == "cudf-series":
cp.testing.assert_array_equal(summary[0].values.tolist(), [23.0, 52.0])
assert isinstance(summary[0], cudf.Series)
elif input_type == "pandas-series":
elif input_type == "pandas-series" and not cudf_pandas_active:
cp.testing.assert_array_equal(
summary[0].to_numpy().flatten(), [23.0, 52.0]
)
Expand Down
8 changes: 5 additions & 3 deletions python/cuml/cuml/tests/test_module_config.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -19,13 +19,15 @@
import cuml
import pytest

from cuml.internals.safe_imports import gpu_only_import
from cuml.internals.safe_imports import gpu_only_import, gpu_only_import_from

cudf = gpu_only_import("cudf")
cp = gpu_only_import("cupy")
np = cpu_only_import("numpy")
pd = cpu_only_import("pandas")

cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")


###############################################################################
# Parameters #
Expand Down Expand Up @@ -71,7 +73,7 @@ def test_default_global_output_type(input_type):

if input_type == "numba":
assert is_cuda_array(res)
else:
elif not (input_type == "pandas" and cudf_pandas_active):
assert isinstance(res, test_output_types[input_type])


Expand Down
6 changes: 4 additions & 2 deletions python/cuml/cuml/tests/test_ordinal_encoder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -20,6 +20,7 @@
from cuml.internals.safe_imports import gpu_only_import_from
from cuml.preprocessing import OrdinalEncoder

cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")
DataFrame = gpu_only_import_from("cudf", "DataFrame")


Expand Down Expand Up @@ -97,7 +98,8 @@ def test_output_type(test_sample) -> None:
enc = OrdinalEncoder(output_type="cudf").fit(X)
assert isinstance(enc.transform(X), DataFrame)
enc = OrdinalEncoder(output_type="pandas").fit(X)
assert isinstance(enc.transform(X), pd.DataFrame)
if not cudf_pandas_active:
assert isinstance(enc.transform(X), pd.DataFrame)
enc = OrdinalEncoder(output_type="numpy").fit(X)
assert isinstance(enc.transform(X), np.ndarray)
# output_type == "input"
Expand Down
41 changes: 41 additions & 0 deletions python/cuml/cuml/tests/test_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
np = cpu_only_import("numpy")

cuda = gpu_only_import_from("numba", "cuda")
cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")


pytestmark = pytest.mark.filterwarnings(
Expand Down Expand Up @@ -276,6 +277,11 @@ def test_tweedie_convergence(max_depth, split_criterion):
)
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize("max_features", [1.0, "log2", "sqrt"])
@pytest.mark.skipif(
cudf_pandas_active,
reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
"Issue: https://github.com/rapidsai/cuml/issues/5991",
)
def test_rf_classification(small_clf, datatype, max_samples, max_features):
use_handle = True

Expand Down Expand Up @@ -405,6 +411,11 @@ def test_rf_classification_unorder(
(1.0, 32),
],
)
@pytest.mark.skipif(
cudf_pandas_active,
reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
"Issue: https://github.com/rapidsai/cuml/issues/5991",
)
def test_rf_regression(
special_reg, datatype, max_features, max_samples, n_bins
):
Expand Down Expand Up @@ -510,6 +521,11 @@ def test_rf_classification_seed(small_clf, datatype):
)
@pytest.mark.parametrize("convert_dtype", [True, False])
@pytest.mark.filterwarnings("ignore:To use pickling(.*)::cuml[.*]")
@pytest.mark.skipif(
cudf_pandas_active,
reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
"Issue: https://github.com/rapidsai/cuml/issues/5991",
)
def test_rf_classification_float64(small_clf, datatype, convert_dtype):

X, y = small_clf
Expand Down Expand Up @@ -552,6 +568,11 @@ def test_rf_classification_float64(small_clf, datatype, convert_dtype):
"datatype", [(np.float64, np.float32), (np.float32, np.float64)]
)
@pytest.mark.filterwarnings("ignore:To use pickling(.*)::cuml[.*]")
@pytest.mark.skipif(
cudf_pandas_active,
reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
"Issue: https://github.com/rapidsai/cuml/issues/5991",
)
def test_rf_regression_float64(large_reg, datatype):

X, y = large_reg
Expand Down Expand Up @@ -675,13 +696,23 @@ def rf_classification(

@pytest.mark.parametrize("datatype", [(np.float32, np.float64)])
@pytest.mark.parametrize("array_type", ["dataframe", "numpy"])
@pytest.mark.skipif(
cudf_pandas_active,
reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
"Issue: https://github.com/rapidsai/cuml/issues/5991",
)
def test_rf_classification_multi_class(mclass_clf, datatype, array_type):
rf_classification(datatype, array_type, 1.0, 1.0, mclass_clf)


@pytest.mark.parametrize("datatype", [(np.float32, np.float64)])
@pytest.mark.parametrize("max_samples", [unit_param(1.0), stress_param(0.95)])
@pytest.mark.parametrize("max_features", [1.0, "log2", "sqrt"])
@pytest.mark.skipif(
cudf_pandas_active,
reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
"Issue: https://github.com/rapidsai/cuml/issues/5991",
)
def test_rf_classification_proba(
small_clf, datatype, max_samples, max_features
):
Expand All @@ -695,6 +726,11 @@ def test_rf_classification_proba(
@pytest.mark.parametrize(
"algo", ["auto", "naive", "tree_reorg", "batch_tree_reorg"]
)
@pytest.mark.skipif(
cudf_pandas_active,
reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
"Issue: https://github.com/rapidsai/cuml/issues/5991",
)
def test_rf_classification_sparse(
small_clf, datatype, fil_sparse_format, algo
):
Expand Down Expand Up @@ -783,6 +819,11 @@ def test_rf_classification_sparse(
@pytest.mark.parametrize(
"algo", ["auto", "naive", "tree_reorg", "batch_tree_reorg"]
)
@pytest.mark.skipif(
cudf_pandas_active,
reason="cudf.pandas causes sklearn RF estimators crashes sometimes. "
"Issue: https://github.com/rapidsai/cuml/issues/5991",
)
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
use_handle = True
num_treees = 50
Expand Down
8 changes: 7 additions & 1 deletion python/cuml/cuml/tests/test_svm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -44,6 +44,8 @@
cudf = gpu_only_import("cudf")
scipy_sparse = cpu_only_import("scipy.sparse")

cudf_pandas_active = gpu_only_import_from("cudf.pandas", "LOADED")

IS_ARM = platform.processor() == "aarch64"


Expand Down Expand Up @@ -666,6 +668,10 @@ def test_svm_predict_convert_dtype(train_dtype, test_dtype, classifier):
reason="Test fails unexpectedly on ARM. "
"github.com/rapidsai/cuml/issues/5100",
)
@pytest.mark.skipif(
cudf_pandas_active,
reason="cudf.pandas causes small numeric issues in this test only ",
)
def test_svm_no_support_vectors():
n_rows = 10
n_cols = 3
Expand Down
Loading