rapidsai · AyodeAwe · Oct 3, 2024 · Sep 30, 2024 · Oct 2, 2024 · Oct 2, 2024
@@ -20,6 +20,7 @@ from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
 from cuml.internals.safe_imports import gpu_only_import
 rmm = gpu_only_import('rmm')
+from cuml.internals.safe_imports import safe_import_from, return_false
 import typing
 
 IF GPUBUILD == 1:
@@ -46,7 +47,10 @@ from cuml.common import input_to_cuml_array
 from cuml.internals.api_decorators import device_interop_preparation
 from cuml.internals.api_decorators import enable_device_interop
 
-from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
+# sklearn is optional: when the import is unavailable, `alt` (return_false) is used instead
+_openmp_effective_n_threads = safe_import_from(
+    "sklearn.utils._openmp_helpers", "_openmp_effective_n_threads", alt=return_false
+)
 
 
 class KMeans(UniversalBase,
@@ -235,7 +239,10 @@ class KMeans(UniversalBase,
         self.cluster_centers_ = None
 
         # For sklearn interoperability
-        self._n_threads = _openmp_effective_n_threads()
+        if _openmp_effective_n_threads():
+            self._n_threads = _openmp_effective_n_threads()
+        else:
+            self._n_threads = 1
 
         # cuPy does not allow comparing with string. See issue #2372
         init_str = init if isinstance(init, str) else None

@@ -1251,13 +1251,14 @@ def array_to_memory_order(arr, default="C"):
         return arr.order
     except AttributeError:
         pass
-    try:
-        array_interface = arr.__cuda_array_interface__
-    except AttributeError:
-        try:
-            array_interface = arr.__array_interface__
-        except AttributeError:
-            return array_to_memory_order(CumlArray.from_input(arr, order="K"))
+    array_interface = getattr(
+        arr,
+        "__cuda_array_interface__",
+        getattr(arr, "__array_interface__", False),
+    )
+    if not array_interface:
+        return array_to_memory_order(CumlArray.from_input(arr, order="K"))
+
     strides = array_interface.get("strides", None)
     if strides is None:
         try:

@@ -265,8 +265,18 @@ def train_test_split(
                              string"
             )
 
-    x_order = array_to_memory_order(X)
-    X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order)
+    all_numeric = True
+    if isinstance(X, cudf.DataFrame):
+        all_numeric = all(
+            cudf.api.types.is_numeric_dtype(X[col]) for col in X.columns
+        )
+
+    if all_numeric:
+        x_order = array_to_memory_order(X)
+        X_arr, X_row, *_ = input_to_cuml_array(X, order=x_order)
+    else:
+        x_order = "F"
+        X_arr, X_row = X, X.shape[0]
     if y is not None:
         y_order = array_to_memory_order(y)
         y_arr, y_row, *_ = input_to_cuml_array(y, order=y_order)
@@ -363,55 +373,53 @@ def train_test_split(
         train_indices = range(0, train_size)
         test_indices = range(-1 * test_size, 0)
 
-    # Gather from indices
-    X_train = X_arr[train_indices]
-    X_test = X_arr[test_indices]
-    if y is not None:
-        y_train = y_arr[train_indices]
-        y_test = y_arr[test_indices]
-
-    # Coerce output to original input type
-    if ty := determine_df_obj_type(X):
-        x_type = ty
-    else:
-        x_type = determine_array_type(X)
-
-    if ty := determine_df_obj_type(y):
-        y_type = ty
-    else:
-        y_type = determine_array_type(y)
-
-    if x_type in ("series", "dataframe"):
-        X_train = output_to_df_obj_like(X_train, X, x_type)
-        X_test = output_to_df_obj_like(X_test, X, x_type)
-
-        if determine_array_type(X.index) == "pandas":
-            if isinstance(train_indices, cp.ndarray):
-                train_indices = train_indices.get()
-            if isinstance(test_indices, cp.ndarray):
-                test_indices = test_indices.get()
+    if all_numeric:
+        # Gather from indices
+        X_train = X_arr[train_indices]
+        X_test = X_arr[test_indices]
+        if y is not None:
+            y_train = y_arr[train_indices]
+            y_test = y_arr[test_indices]
+
+        # Coerce output to original input type
+        x_type = determine_df_obj_type(X) or determine_array_type(X)
+        if y is not None:
+            y_type = determine_df_obj_type(y) or determine_array_type(y)
+
+        def _process_df_objs(
+            df, df_type, df_train, df_test, train_indices, test_indices
+        ):
+            if df_type in {"series", "dataframe"}:
+                df_train = output_to_df_obj_like(df_train, df, df_type)
+                df_test = output_to_df_obj_like(df_test, df, df_type)
+
+                if determine_array_type(df.index) == "pandas":
+                    if isinstance(train_indices, cp.ndarray):
+                        train_indices = train_indices.get()
+                    if isinstance(test_indices, cp.ndarray):
+                        test_indices = test_indices.get()
+
+                df_train.index = df.index[train_indices]
+                df_test.index = df.index[test_indices]
+            else:
+                df_train = df_train.to_output(df_type)
+                df_test = df_test.to_output(df_type)
+            return df_train, df_test
+
+        X_train, X_test = _process_df_objs(
+            X, x_type, X_train, X_test, train_indices, test_indices
+        )
+        if y is not None:
+            y_train, y_test = _process_df_objs(
+                y, y_type, y_train, y_test, train_indices, test_indices
+            )
 
-        X_train.index = X.index[train_indices]
-        X_test.index = X.index[test_indices]
     else:
-        X_train = X_train.to_output(x_type)
-        X_test = X_test.to_output(x_type)
-
-    if y_type in ("series", "dataframe"):
-        y_train = output_to_df_obj_like(y_train, y, y_type)
-        y_test = output_to_df_obj_like(y_test, y, y_type)
-
-        if determine_array_type(y.index) == "pandas":
-            if isinstance(train_indices, cp.ndarray):
-                train_indices = train_indices.get()
-            if isinstance(test_indices, cp.ndarray):
-                test_indices = test_indices.get()
-
-        y_train.index = y.index[train_indices]
-        y_test.index = y.index[test_indices]
-    elif y_type is not None:
-        y_train = y_train.to_output(y_type)
-        y_test = y_test.to_output(y_type)
+        X_train = X_arr.iloc[train_indices]
+        X_test = X_arr.iloc[test_indices]
+        if y is not None:
+            y_train = y_arr[train_indices]
+            y_test = y_arr[test_indices]
 
     if y is not None:
         return X_train, X_test, y_train, y_test