rapidsai · rapids-bot · Jul 5, 2023 · May 24, 2023 · May 24, 2023 · May 24, 2023
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,6 +18,10 @@ repos:
             types_or: [python, cython]
             exclude: thirdparty
             additional_dependencies: [flake8-force]
+    - repo: https://github.com/MarcoGorelli/cython-lint
+      rev: v0.15.0
+      hooks:
+          - id: cython-lint
     - repo: https://github.com/pre-commit/mirrors-clang-format
       rev: v16.0.1
       hooks:

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -107,6 +107,7 @@ please see the `.pre-commit-config.yaml` file.
 - `clang-format`: Formats C++ and CUDA code for consistency and readability.
 - `black`: Auto-formats Python code to conform to the PEP 8 style guide.
 - `flake8`: Lints Python code for syntax errors and common code style issues.
+- `cython-lint`: Lints Cython code for syntax errors and common code style issues.
 - _`DeprecationWarning` checker_: Flags any new `DeprecationWarning` introduced
   in Python code; `FutureWarning` should be used instead.
 - _`#include` syntax checker_: Ensures consistent syntax for C++ `#include` statements.

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,3 +9,8 @@ ignore-words-list = "inout,numer,startd,couldn,referr"
 builtin = "clear"
 # disable warnings about binary files and wrong encoding
 quiet-level = 3
+
+[tool.cython-lint]
+# TODO: Re-enable E501 with a reasonable line length
+max-line-length = 999
+ignore = ['E501']
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -167,4 +167,4 @@ cdef extern from "cuml/cluster/kmeans.hpp" namespace "ML::kmeans":
                         const double *X,
                         int64_t n_samples,
                         int64_t n_features,
-                        double *X_new) except +
+                        double *X_new) except +
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,15 +16,13 @@
 
 # distutils: language = c++
 
-import ctypes
 from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
 from cuml.internals.safe_imports import gpu_only_import
 cp = gpu_only_import('cupy')
 
 from libcpp cimport bool
 from libc.stdint cimport uintptr_t, int64_t
-from libc.stdlib cimport calloc, malloc, free
 
 from cuml.internals.array import CumlArray
 from cuml.internals.base import Base
@@ -37,8 +35,6 @@ from cuml.internals.mixins import ClusterMixin
 from cuml.internals.mixins import CMajorInputTagMixin
 from cuml.metrics.distance_type cimport DistanceType
 
-from collections import defaultdict
-
 cdef extern from "cuml/cluster/dbscan.hpp" \
         namespace "ML::Dbscan":
 
@@ -345,7 +341,7 @@ class DBSCAN(Base,
         # make sure that the `fit` is complete before the following
         # delete call happens
         self.handle.sync()
-        del(X_m)
+        del X_m
 
         # Finally, resize the core_sample_indices array if necessary
         if self.calc_core_sample_indices:

@@ -40,7 +40,6 @@ from cuml.internals.api_decorators import device_interop_preparation
 from cuml.internals.api_decorators import enable_device_interop
 from cuml.internals.mixins import ClusterMixin
 from cuml.internals.mixins import CMajorInputTagMixin
-from cuml.internals import logger
 from cuml.internals.import_utils import has_hdbscan
 
 import cuml
@@ -257,7 +256,7 @@ def condense_hierarchy(dendrogram,
         new CondensedHierarchy[int, float](
             handle_[0], <size_t>n_leaves)
 
-    children, n_rows, _, _ = \
+    children, _, _, _ = \
         input_to_cuml_array(dendrogram[:, 0:2].astype('int32'), order='C',
                             check_dtype=[np.int32],
                             convert_to_dtype=(np.int32))
@@ -457,7 +456,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
         A score of how persistent each cluster is. A score of 1.0 represents
         a perfectly stable cluster that persists over all distance scales,
         while a score of 0.0 represents a perfectly ephemeral cluster. These
-        scores can be used to gauge the relative coherence of the 
+        scores can be used to gauge the relative coherence of the
         clusters output by the algorithm.
 
     condensed_tree_ : CondensedTree object
@@ -1026,7 +1025,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
 
         cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()
 
-        self.X_m, self.n_rows, self.n_cols, dtype = \
+        self.X_m, self.n_rows, self.n_cols, _ = \
             input_to_cuml_array(self._cpu_model._raw_data, order='C',
                                 check_dtype=[np.float32],
                                 convert_to_dtype=(np.float32

@@ -25,20 +25,13 @@ from cuml.internals.safe_imports import gpu_only_import
 cp = gpu_only_import('cupy')
 
 from cuml.internals.array import CumlArray
-from cuml.internals.base import Base
-from cuml.common.doc_utils import generate_docstring
 from pylibraft.common.handle cimport handle_t
 
-from pylibraft.common.handle import Handle
 from cuml.common import (
     input_to_cuml_array,
     input_to_host_array
 )
-from cuml.common.array_descriptor import CumlArrayDescriptor
-from cuml.internals.available_devices import is_cuda_available
 from cuml.internals.device_type import DeviceType
-from cuml.internals.mixins import ClusterMixin
-from cuml.internals.mixins import CMajorInputTagMixin
 from cuml.internals import logger
 from cuml.internals.import_utils import has_hdbscan
 
@@ -96,7 +89,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML":
         DistanceType metric,
         float* membership_vec,
         size_t batch_size)
-    
+
     void compute_membership_vector(
         const handle_t& handle,
         CondensedHierarchy[int, float] &condensed_tree,
@@ -107,7 +100,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML":
         int min_samples,
         DistanceType metric,
         float* membership_vec,
-        size_t batch_size);
+        size_t batch_size)
 
     void out_of_sample_predict(const handle_t &handle,
                                CondensedHierarchy[int, float] &condensed_tree,
@@ -250,7 +243,7 @@ def membership_vector(clusterer, points_to_predict, batch_size=4096, convert_dty
         The new data points to predict cluster labels for. They should
         have the same dimensionality as the original dataset over which
         clusterer was fit.
-    
+
     batch_size : int, optional, default=min(4096, n_points_to_predict)
         Lowers memory requirement by computing distance-based membership
         in smaller batches of points in the prediction data. For example, a
@@ -308,7 +301,7 @@ def membership_vector(clusterer, points_to_predict, batch_size=4096, convert_dty
                             convert_to_dtype=(np.float32
                                               if convert_dtype
                                               else None))
-    
+
     if clusterer.n_clusters_ == 0:
         return np.zeros(n_prediction_points, dtype=np.float32)
 
@@ -317,7 +310,7 @@ def membership_vector(clusterer, points_to_predict, batch_size=4096, convert_dty
 
     cdef uintptr_t prediction_ptr = points_to_predict_m.ptr
     cdef uintptr_t input_ptr = clusterer.X_m.ptr
-    
+
     membership_vec = CumlArray.empty(
         (n_prediction_points * clusterer.n_clusters_,),
         dtype="float32")

@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,24 +16,21 @@
 
 # distutils: language = c++
 
-import ctypes
 from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
 from cuml.internals.safe_imports import gpu_only_import
 rmm = gpu_only_import('rmm')
-import warnings
 import typing
 
 from cython.operator cimport dereference as deref
 from libcpp cimport bool
 from libc.stdint cimport uintptr_t, int64_t
-from libc.stdlib cimport calloc, malloc, free
+from libc.stdlib cimport calloc, free
 
 from cuml.cluster.cpp.kmeans cimport fit_predict as cpp_fit_predict
 from cuml.cluster.cpp.kmeans cimport predict as cpp_predict
 from cuml.cluster.cpp.kmeans cimport transform as cpp_transform
 from cuml.cluster.cpp.kmeans cimport KMeansParams
-from cuml.cluster.cpp.kmeans cimport InitMethod
 
 from cuml.internals.array import CumlArray
 from cuml.common.array_descriptor import CumlArrayDescriptor
@@ -246,7 +243,7 @@ class KMeans(Base,
         else:
             self.init = 'preset'
             self._params_init = Array
-            self.cluster_centers_, n_rows, self.n_cols, self.dtype = \
+            self.cluster_centers_, _n_rows, self.n_cols, self.dtype = \
                 input_to_cuml_array(init, order='C',
                                     check_dtype=[np.float32, np.float64])
 
@@ -369,8 +366,8 @@ class KMeans(Base,
                             ' passed.')
 
         self.handle.sync()
-        del(X_m)
-        del(sample_weight_m)
+        del X_m
+        del sample_weight_m
         free(params)
         return self
 
@@ -417,7 +414,7 @@ class KMeans(Base,
         Sum of squared distances of samples to their closest cluster center.
         """
 
-        X_m, n_rows, n_cols, dtype = \
+        X_m, n_rows, n_cols, _ = \
             input_to_cuml_array(X, order='C', check_dtype=self.dtype,
                                 convert_to_dtype=(self.dtype if convert_dtype
                                                   else None),
@@ -452,8 +449,6 @@ class KMeans(Base,
         cdef KMeansParams* params = \
             <KMeansParams*><size_t>self._get_kmeans_params()
 
-        cur_int_dtype = labels_.dtype
-
         if self.dtype == np.float32:
             if int_dtype == np.int32:
                 cpp_predict(
@@ -515,8 +510,8 @@ class KMeans(Base,
                             ' passed.')
 
         self.handle.sync()
-        del(X_m)
-        del(sample_weight_m)
+        del X_m
+        del sample_weight_m
         free(params)
         return labels_, inertia
 
@@ -548,7 +543,7 @@ class KMeans(Base,
 
         """
 
-        X_m, n_rows, n_cols, dtype = \
+        X_m, n_rows, _n_cols, _dtype = \
             input_to_cuml_array(X, order='C', check_dtype=self.dtype,
                                 convert_to_dtype=(self.dtype if convert_dtype
                                                   else None),
@@ -620,7 +615,7 @@ class KMeans(Base,
 
         self.handle.sync()
 
-        del(X_m)
+        del X_m
         free(params)
         return preds
 

@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,21 +16,17 @@
 
 # distutils: language = c++
 
-import ctypes
 from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
-import warnings
 
 from cuml.internals.safe_imports import gpu_only_import
 rmm = gpu_only_import('rmm')
 
 from cython.operator cimport dereference as deref
-from libcpp cimport bool
 from libc.stdint cimport uintptr_t, int64_t
-from libc.stdlib cimport calloc, malloc, free
+from libc.stdlib cimport free
 
 from cuml.internals.array import CumlArray
-from cuml.internals.base import Base
 from pylibraft.common.handle cimport handle_t
 from cuml.common import input_to_cuml_array
 
@@ -79,10 +75,10 @@ cdef extern from "cuml/cluster/kmeans_mg.hpp" \
                   const double *sample_weight,
                   double *centroids,
                   double &inertia,
-                  int64_t &n_iter) except +                  
+                  int64_t &n_iter) except +
 
-class KMeansMG(KMeans):
 
+class KMeansMG(KMeans):
     """
     A Multi-Node Multi-GPU implementation of KMeans
 
@@ -141,16 +137,10 @@ class KMeansMG(KMeans):
 
         cdef uintptr_t cluster_centers_ptr = self.cluster_centers_.ptr
 
-
         int_dtype = np.int32 if np.int64(n_rows) * np.int64(n_cols) < 2**31-1 else np.int64
 
         print(str(n_rows * n_cols))
 
-        labels_ = CumlArray.zeros(shape=n_rows, dtype=int_dtype,
-                                  index=X_m.index)
-
-        cdef uintptr_t labels_ptr = labels_.ptr
-
         cdef float inertiaf = 0
         cdef double inertiad = 0
 
@@ -224,11 +214,11 @@ class KMeansMG(KMeans):
 
         self.handle.sync()
 
-        self.labels_, _, _, _ =  input_to_cuml_array(self.predict(X,
-                                                     sample_weight=sample_weight), order='C',
-                                                     convert_to_dtype=self.dtype)
+        self.labels_, _, _, _ = input_to_cuml_array(self.predict(X,
+                                                    sample_weight=sample_weight), order='C',
+                                                    convert_to_dtype=self.dtype)
 
-        del(X_m)
+        del X_m
         free(params)
 
         return self
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@ np = cpu_only_import('numpy')
 
 from cuml.common.opg_data_utils_mg cimport *
 from libc.stdlib cimport malloc, free
-from libc.stdint cimport uintptr_t, uint32_t, uint64_t
+from libc.stdint cimport uintptr_t
 from cuml.common import input_to_cuml_array
 from cython.operator cimport dereference as deref
 from cuml.internals.array import CumlArray
@@ -213,7 +213,7 @@ def _build_part_inputs(cuda_arr_ifaces,
 
     cuml_arr_ifaces = []
     for arr in cuda_arr_ifaces:
-        X_m, n_rows, n_cols, dtype = \
+        X_m, _, _, _ = \
             input_to_cuml_array(arr, order="F",
                                 convert_to_dtype=(np.float32
                                                   if convert_dtype