Merge branch 'branch-24.06' into fea/devcontainer-run-args

rapidsai · Apr 22, 2024 · 8e04198
2 parents b48ac58 + b5309dc
commit 8e04198
Show file tree

Hide file tree

Showing 13 changed files with 160 additions and 73 deletions.
diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh
@@ -27,7 +27,7 @@ rapids-logger "pytest cuml single GPU"
 ./ci/run_cuml_singlegpu_pytests.sh \
   --numprocesses=8 \
   --dist=worksteal \
-  -k 'not test_sparse_pca_inputs and not test_fil_skl_classification' \
+  -k 'not test_sparse_pca_inputs' \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml.xml"
 
 # Run test_sparse_pca_inputs separately

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -21,9 +21,7 @@ dependencies:
 - dask-ml
 - doxygen=1.9.1
 - gcc_linux-64=11.*
-- gmock>=1.13.0
 - graphviz
-- gtest>=1.13.0
 - hdbscan<=0.8.30
 - hypothesis>=6.0,<7
 - ipykernel

diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -23,9 +23,7 @@ dependencies:
 - dask-ml
 - doxygen=1.9.1
 - gcc_linux-64=11.*
-- gmock>=1.13.0
 - graphviz
-- gtest>=1.13.0
 - hdbscan<=0.8.30
 - hypothesis>=6.0,<7
 - ipykernel

diff --git a/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml b/conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml
@@ -15,8 +15,6 @@ dependencies:
 - cudatoolkit
 - cxx-compiler
 - gcc_linux-64=11.*
-- gmock>=1.13.0
-- gtest>=1.13.0
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcufft-dev=10.9.0.58

diff --git a/conda/environments/cpp_all_cuda-118_arch-x86_64.yaml b/conda/environments/cpp_all_cuda-118_arch-x86_64.yaml
@@ -13,8 +13,6 @@ dependencies:
 - cudatoolkit
 - cxx-compiler
 - gcc_linux-64=11.*
-- gmock>=1.13.0
-- gtest>=1.13.0
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcufft-dev=10.9.0.58

diff --git a/conda/environments/cpp_all_cuda-122_arch-x86_64.yaml b/conda/environments/cpp_all_cuda-122_arch-x86_64.yaml
@@ -15,8 +15,6 @@ dependencies:
 - cuda-version=12.2
 - cxx-compiler
 - gcc_linux-64=11.*
-- gmock>=1.13.0
-- gtest>=1.13.0
 - libcublas-dev
 - libcufft-dev
 - libcumlprims==24.6.*

diff --git a/cpp/include/cuml/tsa/arima_common.h b/cpp/include/cuml/tsa/arima_common.h
@@ -18,7 +18,9 @@
 
 #include <raft/util/cudart_utils.hpp>
 
+#include <rmm/aligned.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda_runtime.h>
 #include <thrust/execution_policy.h>
@@ -79,15 +81,27 @@ struct ARIMAParams {
    */
   void allocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
   {
-    rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource();
-    if (order.k && !tr) mu = (DataT*)rmm_alloc->allocate(batch_size * sizeof(DataT), stream);
+    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
+    if (order.k && !tr)
+      mu = (DataT*)rmm_alloc.allocate_async(
+        batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
     if (order.n_exog && !tr)
-      beta = (DataT*)rmm_alloc->allocate(order.n_exog * batch_size * sizeof(DataT), stream);
-    if (order.p) ar = (DataT*)rmm_alloc->allocate(order.p * batch_size * sizeof(DataT), stream);
-    if (order.q) ma = (DataT*)rmm_alloc->allocate(order.q * batch_size * sizeof(DataT), stream);
-    if (order.P) sar = (DataT*)rmm_alloc->allocate(order.P * batch_size * sizeof(DataT), stream);
-    if (order.Q) sma = (DataT*)rmm_alloc->allocate(order.Q * batch_size * sizeof(DataT), stream);
-    sigma2 = (DataT*)rmm_alloc->allocate(batch_size * sizeof(DataT), stream);
+      beta = (DataT*)rmm_alloc.allocate_async(
+        order.n_exog * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+    if (order.p)
+      ar = (DataT*)rmm_alloc.allocate_async(
+        order.p * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+    if (order.q)
+      ma = (DataT*)rmm_alloc.allocate_async(
+        order.q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+    if (order.P)
+      sar = (DataT*)rmm_alloc.allocate_async(
+        order.P * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+    if (order.Q)
+      sma = (DataT*)rmm_alloc.allocate_async(
+        order.Q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+    sigma2 = (DataT*)rmm_alloc.allocate_async(
+      batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
   }
 
   /**
@@ -101,15 +115,27 @@ struct ARIMAParams {
    */
   void deallocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
   {
-    rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource();
-    if (order.k && !tr) rmm_alloc->deallocate(mu, batch_size * sizeof(DataT), stream);
+    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
+    if (order.k && !tr)
+      rmm_alloc.deallocate_async(
+        mu, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
     if (order.n_exog && !tr)
-      rmm_alloc->deallocate(beta, order.n_exog * batch_size * sizeof(DataT), stream);
-    if (order.p) rmm_alloc->deallocate(ar, order.p * batch_size * sizeof(DataT), stream);
-    if (order.q) rmm_alloc->deallocate(ma, order.q * batch_size * sizeof(DataT), stream);
-    if (order.P) rmm_alloc->deallocate(sar, order.P * batch_size * sizeof(DataT), stream);
-    if (order.Q) rmm_alloc->deallocate(sma, order.Q * batch_size * sizeof(DataT), stream);
-    rmm_alloc->deallocate(sigma2, batch_size * sizeof(DataT), stream);
+      rmm_alloc.deallocate_async(
+        beta, order.n_exog * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+    if (order.p)
+      rmm_alloc.deallocate_async(
+        ar, order.p * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+    if (order.q)
+      rmm_alloc.deallocate_async(
+        ma, order.q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+    if (order.P)
+      rmm_alloc.deallocate_async(
+        sar, order.P * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+    if (order.Q)
+      rmm_alloc.deallocate_async(
+        sma, order.Q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+    rmm_alloc.deallocate_async(
+      sigma2, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
   }
 
   /**

diff --git a/cpp/src/svm/results.cuh b/cpp/src/svm/results.cuh
@@ -29,8 +29,10 @@
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
+#include <rmm/aligned.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/device/device_select.cuh>
 
@@ -150,8 +152,8 @@ class Results {
     // allow ~1GB dense support matrix
     if (isDenseType<MatrixViewType>() ||
         ((size_t)n_support * n_cols * sizeof(math_t) < (1 << 30))) {
-      support_matrix.data =
-        (math_t*)rmm_alloc->allocate(n_support * n_cols * sizeof(math_t), stream);
+      support_matrix.data = (math_t*)rmm_alloc.allocate_async(
+        n_support * n_cols * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
       ML::SVM::extractRows<math_t>(matrix, support_matrix.data, idx, n_support, handle);
     } else {
       ML::SVM::extractRows<math_t>(matrix,
@@ -208,7 +210,8 @@ class Results {
     // Return only the non-zero coefficients
     auto select_op = [] __device__(math_t a) { return 0 != a; };
     *n_support     = SelectByCoef(val_tmp, n_rows, val_tmp, select_op, val_selected.data());
-    *dual_coefs    = (math_t*)rmm_alloc->allocate(*n_support * sizeof(math_t), stream);
+    *dual_coefs    = (math_t*)rmm_alloc.allocate_async(
+      *n_support * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
     raft::copy(*dual_coefs, val_selected.data(), *n_support, stream);
     handle.sync_stream(stream);
   }
@@ -225,7 +228,8 @@ class Results {
   {
     auto select_op = [] __device__(math_t a) -> bool { return 0 != a; };
     SelectByCoef(coef, n_rows, f_idx.data(), select_op, idx_selected.data());
-    int* idx = (int*)rmm_alloc->allocate(n_support * sizeof(int), stream);
+    int* idx = (int*)rmm_alloc.allocate_async(
+      n_support * sizeof(int), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
     raft::copy(idx, idx_selected.data(), n_support, stream);
     return idx;
   }
@@ -297,7 +301,7 @@ class Results {
     return n_selected;
   }
 
-  rmm::mr::device_memory_resource* rmm_alloc;
+  rmm::device_async_resource_ref rmm_alloc;
 
  private:
   const raft::handle_t& handle;

diff --git a/cpp/src/svm/svc_impl.cuh b/cpp/src/svm/svc_impl.cuh
@@ -32,8 +32,10 @@
 #include <raft/label/classlabels.cuh>
 #include <raft/linalg/gemv.cuh>
 
+#include <rmm/aligned.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
@@ -70,8 +72,9 @@ void svcFitX(const raft::handle_t& handle,
   {
     rmm::device_uvector<math_t> unique_labels(0, stream);
     model.n_classes = raft::label::getUniquelabels(unique_labels, labels, n_rows, stream);
-    rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource();
-    model.unique_labels = (math_t*)rmm_alloc->allocate(model.n_classes * sizeof(math_t), stream);
+    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
+    model.unique_labels                      = (math_t*)rmm_alloc.allocate_async(
+      model.n_classes * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
     raft::copy(model.unique_labels, unique_labels.data(), model.n_classes, stream);
     handle_impl.sync_stream(stream);
   }
@@ -352,27 +355,45 @@ void svcPredictSparse(const raft::handle_t& handle,
 template <typename math_t>
 void svmFreeBuffers(const raft::handle_t& handle, SvmModel<math_t>& m)
 {
-  cudaStream_t stream                        = handle.get_stream();
-  rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource();
-  if (m.dual_coefs) rmm_alloc->deallocate(m.dual_coefs, m.n_support * sizeof(math_t), stream);
-  if (m.support_idx) rmm_alloc->deallocate(m.support_idx, m.n_support * sizeof(int), stream);
+  cudaStream_t stream                      = handle.get_stream();
+  rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
+  if (m.dual_coefs)
+    rmm_alloc.deallocate_async(
+      m.dual_coefs, m.n_support * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
+  if (m.support_idx)
+    rmm_alloc.deallocate_async(
+      m.support_idx, m.n_support * sizeof(int), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
   if (m.support_matrix.indptr) {
-    rmm_alloc->deallocate(m.support_matrix.indptr, (m.n_support + 1) * sizeof(int), stream);
+    rmm_alloc.deallocate_async(m.support_matrix.indptr,
+                               (m.n_support + 1) * sizeof(int),
+                               rmm::CUDA_ALLOCATION_ALIGNMENT,
+                               stream);
     m.support_matrix.indptr = nullptr;
   }
   if (m.support_matrix.indices) {
-    rmm_alloc->deallocate(m.support_matrix.indices, m.support_matrix.nnz * sizeof(int), stream);
+    rmm_alloc.deallocate_async(m.support_matrix.indices,
+                               m.support_matrix.nnz * sizeof(int),
+                               rmm::CUDA_ALLOCATION_ALIGNMENT,
+                               stream);
     m.support_matrix.indices = nullptr;
   }
   if (m.support_matrix.data) {
     if (m.support_matrix.nnz == -1) {
-      rmm_alloc->deallocate(m.support_matrix.data, m.n_support * m.n_cols * sizeof(math_t), stream);
+      rmm_alloc.deallocate_async(m.support_matrix.data,
+                                 m.n_support * m.n_cols * sizeof(math_t),
+                                 rmm::CUDA_ALLOCATION_ALIGNMENT,
+                                 stream);
     } else {
-      rmm_alloc->deallocate(m.support_matrix.data, m.support_matrix.nnz * sizeof(math_t), stream);
+      rmm_alloc.deallocate_async(m.support_matrix.data,
+                                 m.support_matrix.nnz * sizeof(math_t),
+                                 rmm::CUDA_ALLOCATION_ALIGNMENT,
+                                 stream);
     }
   }
   m.support_matrix.nnz = -1;
-  if (m.unique_labels) rmm_alloc->deallocate(m.unique_labels, m.n_classes * sizeof(math_t), stream);
+  if (m.unique_labels)
+    rmm_alloc.deallocate_async(
+      m.unique_labels, m.n_classes * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
   m.dual_coefs    = nullptr;
   m.support_idx   = nullptr;
   m.unique_labels = nullptr;

diff --git a/cpp/test/sg/svc_test.cu b/cpp/test/sg/svc_test.cu
@@ -31,6 +31,7 @@
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <thrust/device_ptr.h>
@@ -501,9 +502,12 @@ class GetResultsTest : public ::testing::Test {
  protected:
   void FreeDenseSupport()
   {
-    rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource();
-    auto stream                                = this->handle.get_stream();
-    rmm_alloc->deallocate(support_matrix.data, n_coefs * n_cols * sizeof(math_t), stream);
+    rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
+    auto stream                              = this->handle.get_stream();
+    rmm_alloc.deallocate_async(support_matrix.data,
+                               n_coefs * n_cols * sizeof(math_t),
+                               rmm::CUDA_ALLOCATION_ALIGNMENT,
+                               stream);
     support_matrix.data = nullptr;
   }
 

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -116,8 +116,6 @@ dependencies:
         packages:
           - c-compiler
           - cxx-compiler
-          - gmock>=1.13.0
-          - gtest>=1.13.0
           - libcumlprims==24.6.*
           - libraft==24.6.*
           - libraft-headers==24.6.*

diff --git a/python/cuml/experimental/fil/fil.pyx b/python/cuml/experimental/fil/fil.pyx
@@ -55,6 +55,10 @@ nvtx_annotate = gpu_only_import_from("nvtx", "annotate", alt=null_decorator)
 
 cdef extern from "treelite/c_api.h":
     ctypedef void* TreeliteModelHandle
+    cdef int TreeliteDeserializeModelFromBytes(const char* bytes_seq, size_t len,
+                                               TreeliteModelHandle* out) except +
+    cdef int TreeliteFreeModel(TreeliteModelHandle handle) except +
+    cdef const char* TreeliteGetLastError()
 
 
 cdef raft_proto_device_t get_device_type(arr):
@@ -137,16 +141,19 @@ cdef class ForestInference_impl():
             use_double_precision_bool = use_double_precision
             use_double_precision_c = use_double_precision_bool
 
-        try:
-            model_handle = tl_model.handle.value
-        except AttributeError:
-            try:
-                model_handle = tl_model.handle
-            except AttributeError:
-                try:
-                    model_handle = tl_model.value
-                except AttributeError:
-                    model_handle = tl_model
+        if not isinstance(tl_model, treelite.Model):
+            raise ValueError("tl_model must be a treelite.Model object")
+        # Serialize Treelite model object and de-serialize again,
+        # to get around C++ ABI incompatibilities (due to different compilers
+        # being used to build cuML pip wheel vs. Treelite pip wheel)
+        bytes_seq = tl_model.serialize_bytes()
+        cdef TreeliteModelHandle model_handle = NULL
+        cdef int res = TreeliteDeserializeModelFromBytes(bytes_seq, len(bytes_seq),
+                                                         &model_handle)
+        cdef str err_msg
+        if res < 0:
+            err_msg = TreeliteGetLastError().decode("UTF-8")
+            raise RuntimeError(f"Failed to load Treelite model from bytes ({err_msg})")
 
         cdef raft_proto_device_t dev_type
         if mem_type.is_device_accessible:
@@ -169,6 +176,8 @@ cdef class ForestInference_impl():
             self.raft_proto_handle.get_next_usable_stream()
         )
 
+        TreeliteFreeModel(model_handle)
+
     def get_dtype(self):
         return [np.float32, np.float64][self.model.is_double_precision()]