rapidsai · rapids-bot · Dec 12, 2023 · Dec 11, 2023
@@ -220,7 +220,10 @@ if(BUILD_CUML_TESTS OR BUILD_PRIMS_TESTS)
   find_package(Threads)
 endif()
 
+# Include order matters: thrust before rmm, and rmm before raft, so we get the right version of thrust/rmm
+include(cmake/thirdparty/get_thrust.cmake)
 include(cmake/thirdparty/get_libcudacxx.cmake)
+include(cmake/thirdparty/get_rmm.cmake)
 include(cmake/thirdparty/get_raft.cmake)
 
 if(LINK_TREELITE)

@@ -0,0 +1,23 @@
+#=============================================================================
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_rmm)  # Pull in RMM through rapids-cmake's rapids_cpm_rmm() helper
+    include(${rapids-cmake-dir}/cpm/rmm.cmake)  # module expected to define rapids_cpm_rmm()
+    rapids_cpm_rmm(BUILD_EXPORT_SET cuml-exports  # register RMM with the cuml-exports build export set...
+                   INSTALL_EXPORT_SET cuml-exports)  # ...and with the same set for installation
+endfunction()
+
+find_and_configure_rmm()
@@ -0,0 +1,23 @@
+# =============================================================================
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# Use CPM (through rapids-cmake's rapids_cpm_thrust helper) to find or clone thrust
+function(find_and_configure_thrust)
+    include(${rapids-cmake-dir}/cpm/thrust.cmake)  # module expected to define rapids_cpm_thrust()
+    rapids_cpm_thrust(NAMESPACE cuml  # NOTE(review): presumably prefixes the Thrust target with cuml:: - confirm
+                      BUILD_EXPORT_SET cuml-exports  # register thrust with the cuml-exports build export set...
+                      INSTALL_EXPORT_SET cuml-exports)  # ...and with the same set for installation
+endfunction()
+
+find_and_configure_thrust()
@@ -26,6 +26,7 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <cuda/functional>
 #include <ml_cuda_utils.h>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
@@ -241,7 +242,8 @@ void initKMeansPlusPlus(const raft::handle_t& handle,
     minClusterDistance.view(),
     workspace,
     clusterCost.view(),
-    [] __device__(const DataT& a, const DataT& b) { return a + b; });
+    cuda::proclaim_return_type<DataT>(
+      [] __device__(const DataT& a, const DataT& b) { return a + b; }));
 
   // compute total cluster cost by accumulating the partial cost from all the
   // ranks
@@ -291,7 +293,8 @@ void initKMeansPlusPlus(const raft::handle_t& handle,
       minClusterDistance.view(),
       workspace,
       clusterCost.view(),
-      [] __device__(const DataT& a, const DataT& b) { return a + b; });
+      cuda::proclaim_return_type<DataT>(
+        [] __device__(const DataT& a, const DataT& b) { return a + b; }));
     comm.allreduce(
       clusterCost.data_handle(), clusterCost.data_handle(), 1, raft::comms::op_t::SUM, stream);
     raft::copy(&psi, clusterCost.data_handle(), 1, stream);
@@ -481,7 +484,7 @@ void checkWeights(const raft::handle_t& handle,
       weight.data_handle(),
       weight.data_handle(),
       weight.size(),
-      [=] __device__(const DataT& wt) { return wt * scale; },
+      cuda::proclaim_return_type<DataT>([=] __device__(const DataT& wt) { return wt * scale; }),
       stream);
   }
 }
@@ -621,12 +624,12 @@ void fit(const raft::handle_t& handle,
       newCentroids.extent(0),
       true,
       false,
-      [=] __device__(DataT mat, DataT vec) {
+      cuda::proclaim_return_type<DataT>([=] __device__(DataT mat, DataT vec) {
         if (vec == 0)
           return DataT(0);
         else
           return mat / vec;
-      },
+      }),
       stream);
 
     // copy the centroids[i] to newCentroids[i] when wtInCluster[i] is 0
@@ -639,16 +642,18 @@ void fit(const raft::handle_t& handle,
       itr_wt,
       wtInCluster.extent(0),
       newCentroids.data_handle(),
-      [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // predicate
-        // copy when the # of samples in the cluster is 0
-        if (map.value == 0)
-          return true;
-        else
-          return false;
-      },
-      [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // map
-        return map.key;
-      },
+      cuda::proclaim_return_type<bool>(
+        [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // predicate
+          // copy when the # of samples in the cluster is 0
+          if (map.value == 0)
+            return true;
+          else
+            return false;
+        }),
+      cuda::proclaim_return_type<ptrdiff_t>(
+        [=] __device__(raft::KeyValuePair<ptrdiff_t, DataT> map) {  // map
+          return map.key;
+        }),
       stream);
 
     // compute the squared norm between the newCentroids and the original
@@ -657,10 +662,10 @@ void fit(const raft::handle_t& handle,
     raft::linalg::mapThenSumReduce(
       sqrdNorm.data_handle(),
       newCentroids.size(),
-      [=] __device__(const DataT a, const DataT b) {
+      cuda::proclaim_return_type<DataT>([=] __device__(const DataT a, const DataT b) {
         DataT diff = a - b;
         return diff * diff;
-      },
+      }),
       stream,
       centroids.data_handle(),
       newCentroids.data_handle());
@@ -680,13 +685,14 @@ void fit(const raft::handle_t& handle,
         minClusterAndDistance.view(),
         workspace,
         raft::make_device_scalar_view(clusterCostD.data()),
-        [] __device__(const raft::KeyValuePair<IndexT, DataT>& a,
-                      const raft::KeyValuePair<IndexT, DataT>& b) {
-          raft::KeyValuePair<IndexT, DataT> res;
-          res.key   = 0;
-          res.value = a.value + b.value;
-          return res;
-        });
+        cuda::proclaim_return_type<raft::KeyValuePair<IndexT, DataT>>(
+          [] __device__(const raft::KeyValuePair<IndexT, DataT>& a,
+                        const raft::KeyValuePair<IndexT, DataT>& b) {
+            raft::KeyValuePair<IndexT, DataT> res;
+            res.key   = 0;
+            res.value = a.value + b.value;
+            return res;
+          }));
 
       // Cluster cost phi_x(C) from all ranks
       comm.allreduce(&(clusterCostD.data()->value),