diff --git a/python/cuml/preprocessing/label.py b/python/cuml/preprocessing/label.py index 78daee603e..c34a9d9249 100644 --- a/python/cuml/preprocessing/label.py +++ b/python/cuml/preprocessing/label.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -64,14 +64,19 @@ def label_binarize( cp.cuda.Stream.null.synchronize() + is_binary = classes.shape[0] == 2 + if sparse_output: sp = sp.tocsr() + if is_binary: + sp = sp.getcol(1) # getcol does not support -1 indexing return sp else: arr = sp.toarray().astype(y.dtype) arr[arr == 0] = neg_label - + if is_binary: + arr = arr[:, -1].reshape((-1, 1)) return arr diff --git a/python/cuml/tests/test_preprocessing.py b/python/cuml/tests/test_preprocessing.py index 332ef4be93..c341fa2a63 100644 --- a/python/cuml/tests/test_preprocessing.py +++ b/python/cuml/tests/test_preprocessing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -43,6 +43,7 @@ quantile_transform as cu_quantile_transform, robust_scale as cu_robust_scale, scale as cu_scale, + label_binarize as cu_label_binarize, ) from sklearn.preprocessing import ( Binarizer as skBinarizer, @@ -68,6 +69,7 @@ quantile_transform as sk_quantile_transform, robust_scale as sk_robust_scale, scale as sk_scale, + label_binarize as sk_label_binarize, ) from sklearn.impute import ( MissingIndicator as skMissingIndicator, @@ -1135,6 +1137,36 @@ def test_kernel_centerer(): assert_allclose(sk_t_X, t_X) +def test_label_binarize(): + cu_bin = cu_label_binarize( + cp.array([1, 0, 1, 1]), classes=cp.array([0, 1]) + ) + sk_bin = sk_label_binarize([1, 0, 1, 1], classes=[0, 1]) + assert_allclose(cu_bin, sk_bin) + + cu_bin_sparse = cu_label_binarize( + cp.array([1, 0, 1, 1]), classes=cp.array([0, 1]), sparse_output=True + ) + sk_bin_sparse = sk_label_binarize( + [1, 0, 1, 1], classes=[0, 1], sparse_output=True + ) + assert_allclose(cu_bin_sparse, sk_bin_sparse) + + cu_multi = cu_label_binarize( + cp.array([1, 6, 3]), classes=cp.array([1, 3, 4, 6]) + ) + sk_multi = sk_label_binarize([1, 6, 3], classes=[1, 3, 4, 6]) + assert_allclose(cu_multi, sk_multi) + + cu_multi_sparse = cu_label_binarize( + cp.array([1, 6, 3]), classes=cp.array([1, 3, 4, 6]), sparse_output=True + ) + sk_multi_sparse = sk_label_binarize( + [1, 6, 3], classes=[1, 3, 4, 6], sparse_output=True + ) + assert_allclose(cu_multi_sparse, sk_multi_sparse) + + def test__repr__(): assert cuBinarizer().__repr__() == "Binarizer()" assert cuFunctionTransformer().__repr__() == "FunctionTransformer()"