Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into fea/devcontainer-run-args
Browse files Browse the repository at this point in the history
  • Loading branch information
dantegd authored Apr 22, 2024
2 parents b48ac58 + b5309dc commit 8e04198
Show file tree
Hide file tree
Showing 13 changed files with 160 additions and 73 deletions.
2 changes: 1 addition & 1 deletion ci/test_wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ rapids-logger "pytest cuml single GPU"
./ci/run_cuml_singlegpu_pytests.sh \
--numprocesses=8 \
--dist=worksteal \
-k 'not test_sparse_pca_inputs and not test_fil_skl_classification' \
-k 'not test_sparse_pca_inputs' \
--junitxml="${RAPIDS_TESTS_DIR}/junit-cuml.xml"

# Run test_sparse_pca_inputs separately
Expand Down
2 changes: 0 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ dependencies:
- dask-ml
- doxygen=1.9.1
- gcc_linux-64=11.*
- gmock>=1.13.0
- graphviz
- gtest>=1.13.0
- hdbscan<=0.8.30
- hypothesis>=6.0,<7
- ipykernel
Expand Down
2 changes: 0 additions & 2 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@ dependencies:
- dask-ml
- doxygen=1.9.1
- gcc_linux-64=11.*
- gmock>=1.13.0
- graphviz
- gtest>=1.13.0
- hdbscan<=0.8.30
- hypothesis>=6.0,<7
- ipykernel
Expand Down
2 changes: 0 additions & 2 deletions conda/environments/clang_tidy_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ dependencies:
- cudatoolkit
- cxx-compiler
- gcc_linux-64=11.*
- gmock>=1.13.0
- gtest>=1.13.0
- libcublas-dev=11.11.3.6
- libcublas=11.11.3.6
- libcufft-dev=10.9.0.58
Expand Down
2 changes: 0 additions & 2 deletions conda/environments/cpp_all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ dependencies:
- cudatoolkit
- cxx-compiler
- gcc_linux-64=11.*
- gmock>=1.13.0
- gtest>=1.13.0
- libcublas-dev=11.11.3.6
- libcublas=11.11.3.6
- libcufft-dev=10.9.0.58
Expand Down
2 changes: 0 additions & 2 deletions conda/environments/cpp_all_cuda-122_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@ dependencies:
- cuda-version=12.2
- cxx-compiler
- gcc_linux-64=11.*
- gmock>=1.13.0
- gtest>=1.13.0
- libcublas-dev
- libcufft-dev
- libcumlprims==24.6.*
Expand Down
58 changes: 42 additions & 16 deletions cpp/include/cuml/tsa/arima_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@

#include <raft/util/cudart_utils.hpp>

#include <rmm/aligned.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
Expand Down Expand Up @@ -79,15 +81,27 @@ struct ARIMAParams {
*/
// Requests device memory for the ARIMA parameter arrays. A buffer is requested
// for each of mu, beta, ar, ma, sar and sma only when the matching field of
// `order` (k, n_exog, p, q, P, Q) is non-zero; mu and beta are additionally
// skipped when `tr` is true. sigma2 is always requested. Every request goes to
// the resource returned by rmm::mr::get_current_device_resource() and is
// issued on `stream`.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The
// `rmm_alloc->allocate(...)` statements look like the pre-change side and the
// `rmm_alloc.allocate_async(...)` statements like the post-change side; the
// real header presumably contains only one of the two sets -- confirm against
// the repository before reusing this text as code.
void allocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
{
// Pointer-based resource handle, used by the `->allocate(bytes, stream)` calls.
rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource();
if (order.k && !tr) mu = (DataT*)rmm_alloc->allocate(batch_size * sizeof(DataT), stream);
// resource_ref-based handle, used by the `.allocate_async(bytes, alignment, stream)`
// calls; each of them passes rmm::CUDA_ALLOCATION_ALIGNMENT as the alignment.
rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
// mu: batch_size elements.
if (order.k && !tr)
mu = (DataT*)rmm_alloc.allocate_async(
batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
// beta: n_exog * batch_size elements.
if (order.n_exog && !tr)
beta = (DataT*)rmm_alloc->allocate(order.n_exog * batch_size * sizeof(DataT), stream);
if (order.p) ar = (DataT*)rmm_alloc->allocate(order.p * batch_size * sizeof(DataT), stream);
if (order.q) ma = (DataT*)rmm_alloc->allocate(order.q * batch_size * sizeof(DataT), stream);
if (order.P) sar = (DataT*)rmm_alloc->allocate(order.P * batch_size * sizeof(DataT), stream);
if (order.Q) sma = (DataT*)rmm_alloc->allocate(order.Q * batch_size * sizeof(DataT), stream);
sigma2 = (DataT*)rmm_alloc->allocate(batch_size * sizeof(DataT), stream);
beta = (DataT*)rmm_alloc.allocate_async(
order.n_exog * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
// ar / ma / sar / sma: p, q, P and Q times batch_size elements respectively.
if (order.p)
ar = (DataT*)rmm_alloc.allocate_async(
order.p * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
if (order.q)
ma = (DataT*)rmm_alloc.allocate_async(
order.q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
if (order.P)
sar = (DataT*)rmm_alloc.allocate_async(
order.P * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
if (order.Q)
sma = (DataT*)rmm_alloc.allocate_async(
order.Q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
// sigma2: batch_size elements, requested regardless of `order` and `tr`.
sigma2 = (DataT*)rmm_alloc.allocate_async(
batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
}

/**
Expand All @@ -101,15 +115,27 @@ struct ARIMAParams {
*/
// Returns the ARIMA parameter buffers to the resource obtained from
// rmm::mr::get_current_device_resource(), on `stream`. The conditions
// (order.k / n_exog / p / q / P / Q, and `!tr` for mu and beta) and the byte
// counts are the same expressions that allocate() uses, so `order`,
// `batch_size` and `tr` presumably have to match the values given to
// allocate() -- TODO confirm with callers. The member pointers are not reset
// here.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The
// `rmm_alloc->deallocate(...)` statements look like the pre-change side and
// the `rmm_alloc.deallocate_async(...)` statements like the post-change side;
// the real header presumably contains only one of the two sets.
void deallocate(const ARIMAOrder& order, int batch_size, cudaStream_t stream, bool tr = false)
{
// Pointer-based resource handle, used by the `->deallocate(ptr, bytes, stream)` calls.
rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource();
if (order.k && !tr) rmm_alloc->deallocate(mu, batch_size * sizeof(DataT), stream);
// resource_ref-based handle, used by the
// `.deallocate_async(ptr, bytes, alignment, stream)` calls; each of them passes
// rmm::CUDA_ALLOCATION_ALIGNMENT as the alignment.
rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
if (order.k && !tr)
rmm_alloc.deallocate_async(
mu, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
if (order.n_exog && !tr)
rmm_alloc->deallocate(beta, order.n_exog * batch_size * sizeof(DataT), stream);
if (order.p) rmm_alloc->deallocate(ar, order.p * batch_size * sizeof(DataT), stream);
if (order.q) rmm_alloc->deallocate(ma, order.q * batch_size * sizeof(DataT), stream);
if (order.P) rmm_alloc->deallocate(sar, order.P * batch_size * sizeof(DataT), stream);
if (order.Q) rmm_alloc->deallocate(sma, order.Q * batch_size * sizeof(DataT), stream);
rmm_alloc->deallocate(sigma2, batch_size * sizeof(DataT), stream);
rmm_alloc.deallocate_async(
beta, order.n_exog * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
if (order.p)
rmm_alloc.deallocate_async(
ar, order.p * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
if (order.q)
rmm_alloc.deallocate_async(
ma, order.q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
if (order.P)
rmm_alloc.deallocate_async(
sar, order.P * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
if (order.Q)
rmm_alloc.deallocate_async(
sma, order.Q * batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
// sigma2 is released regardless of `order` and `tr`.
rmm_alloc.deallocate_async(
sigma2, batch_size * sizeof(DataT), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
}

/**
Expand Down
14 changes: 9 additions & 5 deletions cpp/src/svm/results.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@
#include <raft/util/cuda_utils.cuh>
#include <raft/util/cudart_utils.hpp>

#include <rmm/aligned.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <cub/device/device_select.cuh>

Expand Down Expand Up @@ -150,8 +152,8 @@ class Results {
// allow ~1GB dense support matrix
if (isDenseType<MatrixViewType>() ||
((size_t)n_support * n_cols * sizeof(math_t) < (1 << 30))) {
support_matrix.data =
(math_t*)rmm_alloc->allocate(n_support * n_cols * sizeof(math_t), stream);
support_matrix.data = (math_t*)rmm_alloc.allocate_async(
n_support * n_cols * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
ML::SVM::extractRows<math_t>(matrix, support_matrix.data, idx, n_support, handle);
} else {
ML::SVM::extractRows<math_t>(matrix,
Expand Down Expand Up @@ -208,7 +210,8 @@ class Results {
// Return only the non-zero coefficients
auto select_op = [] __device__(math_t a) { return 0 != a; };
*n_support = SelectByCoef(val_tmp, n_rows, val_tmp, select_op, val_selected.data());
*dual_coefs = (math_t*)rmm_alloc->allocate(*n_support * sizeof(math_t), stream);
*dual_coefs = (math_t*)rmm_alloc.allocate_async(
*n_support * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
raft::copy(*dual_coefs, val_selected.data(), *n_support, stream);
handle.sync_stream(stream);
}
Expand All @@ -225,7 +228,8 @@ class Results {
{
auto select_op = [] __device__(math_t a) -> bool { return 0 != a; };
SelectByCoef(coef, n_rows, f_idx.data(), select_op, idx_selected.data());
int* idx = (int*)rmm_alloc->allocate(n_support * sizeof(int), stream);
int* idx = (int*)rmm_alloc.allocate_async(
n_support * sizeof(int), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
raft::copy(idx, idx_selected.data(), n_support, stream);
return idx;
}
Expand Down Expand Up @@ -297,7 +301,7 @@ class Results {
return n_selected;
}

rmm::mr::device_memory_resource* rmm_alloc;
rmm::device_async_resource_ref rmm_alloc;

private:
const raft::handle_t& handle;
Expand Down
43 changes: 32 additions & 11 deletions cpp/src/svm/svc_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@
#include <raft/label/classlabels.cuh>
#include <raft/linalg/gemv.cuh>

#include <rmm/aligned.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <thrust/copy.h>
#include <thrust/device_ptr.h>
Expand Down Expand Up @@ -70,8 +72,9 @@ void svcFitX(const raft::handle_t& handle,
{
rmm::device_uvector<math_t> unique_labels(0, stream);
model.n_classes = raft::label::getUniquelabels(unique_labels, labels, n_rows, stream);
rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource();
model.unique_labels = (math_t*)rmm_alloc->allocate(model.n_classes * sizeof(math_t), stream);
rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
model.unique_labels = (math_t*)rmm_alloc.allocate_async(
model.n_classes * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
raft::copy(model.unique_labels, unique_labels.data(), model.n_classes, stream);
handle_impl.sync_stream(stream);
}
Expand Down Expand Up @@ -352,27 +355,45 @@ void svcPredictSparse(const raft::handle_t& handle,
template <typename math_t>
void svmFreeBuffers(const raft::handle_t& handle, SvmModel<math_t>& m)
{
cudaStream_t stream = handle.get_stream();
rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource();
if (m.dual_coefs) rmm_alloc->deallocate(m.dual_coefs, m.n_support * sizeof(math_t), stream);
if (m.support_idx) rmm_alloc->deallocate(m.support_idx, m.n_support * sizeof(int), stream);
cudaStream_t stream = handle.get_stream();
rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
if (m.dual_coefs)
rmm_alloc.deallocate_async(
m.dual_coefs, m.n_support * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
if (m.support_idx)
rmm_alloc.deallocate_async(
m.support_idx, m.n_support * sizeof(int), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
if (m.support_matrix.indptr) {
rmm_alloc->deallocate(m.support_matrix.indptr, (m.n_support + 1) * sizeof(int), stream);
rmm_alloc.deallocate_async(m.support_matrix.indptr,
(m.n_support + 1) * sizeof(int),
rmm::CUDA_ALLOCATION_ALIGNMENT,
stream);
m.support_matrix.indptr = nullptr;
}
if (m.support_matrix.indices) {
rmm_alloc->deallocate(m.support_matrix.indices, m.support_matrix.nnz * sizeof(int), stream);
rmm_alloc.deallocate_async(m.support_matrix.indices,
m.support_matrix.nnz * sizeof(int),
rmm::CUDA_ALLOCATION_ALIGNMENT,
stream);
m.support_matrix.indices = nullptr;
}
if (m.support_matrix.data) {
if (m.support_matrix.nnz == -1) {
rmm_alloc->deallocate(m.support_matrix.data, m.n_support * m.n_cols * sizeof(math_t), stream);
rmm_alloc.deallocate_async(m.support_matrix.data,
m.n_support * m.n_cols * sizeof(math_t),
rmm::CUDA_ALLOCATION_ALIGNMENT,
stream);
} else {
rmm_alloc->deallocate(m.support_matrix.data, m.support_matrix.nnz * sizeof(math_t), stream);
rmm_alloc.deallocate_async(m.support_matrix.data,
m.support_matrix.nnz * sizeof(math_t),
rmm::CUDA_ALLOCATION_ALIGNMENT,
stream);
}
}
m.support_matrix.nnz = -1;
if (m.unique_labels) rmm_alloc->deallocate(m.unique_labels, m.n_classes * sizeof(math_t), stream);
if (m.unique_labels)
rmm_alloc.deallocate_async(
m.unique_labels, m.n_classes * sizeof(math_t), rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
m.dual_coefs = nullptr;
m.support_idx = nullptr;
m.unique_labels = nullptr;
Expand Down
10 changes: 7 additions & 3 deletions cpp/test/sg/svc_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <raft/util/cudart_utils.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/resource_ref.hpp>

#include <cub/cub.cuh>
#include <thrust/device_ptr.h>
Expand Down Expand Up @@ -501,9 +502,12 @@ class GetResultsTest : public ::testing::Test {
protected:
// Test helper: hands `support_matrix.data` back to the resource returned by
// rmm::mr::get_current_device_resource(), using the test handle's stream, and
// then nulls the pointer. The byte count passed is
// n_coefs * n_cols * sizeof(math_t).
//
// NOTE(review): this listing is a rendered diff without +/- markers. The
// first declaration/stream/deallocate triple looks like the pre-change side
// and the second triple (deallocate_async with
// rmm::CUDA_ALLOCATION_ALIGNMENT) like the post-change side; the real test
// file presumably contains only one of them.
void FreeDenseSupport()
{
rmm::mr::device_memory_resource* rmm_alloc = rmm::mr::get_current_device_resource();
auto stream = this->handle.get_stream();
rmm_alloc->deallocate(support_matrix.data, n_coefs * n_cols * sizeof(math_t), stream);
rmm::device_async_resource_ref rmm_alloc = rmm::mr::get_current_device_resource();
auto stream = this->handle.get_stream();
rmm_alloc.deallocate_async(support_matrix.data,
n_coefs * n_cols * sizeof(math_t),
rmm::CUDA_ALLOCATION_ALIGNMENT,
stream);
// Clear the pointer after the buffer has been handed back.
support_matrix.data = nullptr;
}

Expand Down
2 changes: 0 additions & 2 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,6 @@ dependencies:
packages:
- c-compiler
- cxx-compiler
- gmock>=1.13.0
- gtest>=1.13.0
- libcumlprims==24.6.*
- libraft==24.6.*
- libraft-headers==24.6.*
Expand Down
29 changes: 19 additions & 10 deletions python/cuml/experimental/fil/fil.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ nvtx_annotate = gpu_only_import_from("nvtx", "annotate", alt=null_decorator)

# Declarations pulled from the Treelite C API header. Each function is declared
# `except +`, which asks Cython to convert a C++ exception raised by the call
# into a Python exception.
# NOTE(review): the scraped page has dropped the indentation that a
# `cdef extern` body needs; restore it before using this as Cython source.
cdef extern from "treelite/c_api.h":
# Opaque handle type for a Treelite model.
ctypedef void* TreeliteModelHandle
# Takes a byte buffer and its length and writes a model handle through `out`.
# The caller in this file treats a negative return value as failure.
cdef int TreeliteDeserializeModelFromBytes(const char* bytes_seq, size_t len,
TreeliteModelHandle* out) except +
# Called in this file on the deserialized handle once the forest model has
# been built from it.
cdef int TreeliteFreeModel(TreeliteModelHandle handle) except +
# Returns a C string; the caller decodes it as UTF-8 for its error message.
cdef const char* TreeliteGetLastError()


cdef raft_proto_device_t get_device_type(arr):
Expand Down Expand Up @@ -137,16 +141,19 @@ cdef class ForestInference_impl():
use_double_precision_bool = use_double_precision
use_double_precision_c = use_double_precision_bool

try:
model_handle = tl_model.handle.value
except AttributeError:
try:
model_handle = tl_model.handle
except AttributeError:
try:
model_handle = tl_model.value
except AttributeError:
model_handle = tl_model
if not isinstance(tl_model, treelite.Model):
raise ValueError("tl_model must be a treelite.Model object")
# Serialize Treelite model object and de-serialize again,
# to get around C++ ABI incompatibilities (due to different compilers
# being used to build cuML pip wheel vs. Treelite pip wheel)
bytes_seq = tl_model.serialize_bytes()
cdef TreeliteModelHandle model_handle = NULL
cdef int res = TreeliteDeserializeModelFromBytes(bytes_seq, len(bytes_seq),
&model_handle)
cdef str err_msg
if res < 0:
err_msg = TreeliteGetLastError().decode("UTF-8")
raise RuntimeError(f"Failed to load Treelite model from bytes ({err_msg})")

cdef raft_proto_device_t dev_type
if mem_type.is_device_accessible:
Expand All @@ -169,6 +176,8 @@ cdef class ForestInference_impl():
self.raft_proto_handle.get_next_usable_stream()
)

TreeliteFreeModel(model_handle)

def get_dtype(self):
return [np.float32, np.float64][self.model.is_double_precision()]

Expand Down
Loading

0 comments on commit 8e04198

Please sign in to comment.