diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f67853597..241d214f49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## New Features - Initial RAFT version +- PR #3: defining raft::handle_t, device_buffer, host_buffer, allocator classes ## Improvements diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 160c3a2c69..cc5bb08907 100644 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -1,7 +1,7 @@ #!/bin/bash # Copyright (c) 2020, NVIDIA CORPORATION. ##################### -# cuML Style Tester # +# RAFT Style Tester # ##################### # Ignore errors and set path @@ -41,15 +41,9 @@ else fi # Check for a consistent #include syntax -# TODO: keep adding more dirs as and when we update the syntax HASH_INCLUDE=`python cpp/scripts/include_checker.py \ - cpp/bench \ - cpp/comms/mpi/include \ - cpp/comms/mpi/src \ - cpp/comms/std/include \ - cpp/comms/std/src \ cpp/include \ - cpp/examples \ + cpp/test \ 2>&1` HASH_RETVAL=$? if [ "$RETVAL" = "0" ]; then @@ -66,7 +60,6 @@ else fi # Check for a consistent code format -# TODO: keep adding more dirs when we add more source folders in cuml FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` FORMAT_RETVAL=$? if [ "$RETVAL" = "0" ]; then diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 62a511eaf0..6c3fc0dc7b 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -108,9 +108,7 @@ nvidia-smi logger "GoogleTest for raft..." cd $WORKSPACE/cpp/build -# Googletests haven't been moved over/integrated yet -# GTEST_OUTPUT="xml:${WORKSPACE}/test-results/raft_cpp/" ./test_raft -# running simple tests meanwhile +GTEST_OUTPUT="xml:${WORKSPACE}/test-results/raft_cpp/" ./test_raft logger "Python pytest for cuml..." 
cd $WORKSPACE/python diff --git a/cpp/.clang-format b/cpp/.clang-format new file mode 100644 index 0000000000..779ca0033a --- /dev/null +++ b/cpp/.clang-format @@ -0,0 +1,157 @@ +--- +# Refer to the following link for the explanation of each params: +# http://releases.llvm.org/8.0.1/tools/clang/docs/ClangFormatStyleOptions.html +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +# This is deprecated +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + # disabling the below splits, else, they'll just add to the vertical length of source files! 
+ SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform +ConstructorInitializerIndentWidth: 2 +ContinuationIndentWidth: 2 +Cpp11BracedListStyle: true +DerivePointerAlignment: true +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^' + Priority: 2 + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Never +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +RawStringFormats: + - Language: Cpp + Delimiters: + - cc + - CC + - cpp + - Cpp + - CPP + - 'c++' + - 'C++' + CanonicalDelimiter: '' + - Language: TextProto + Delimiters: + - pb + - PB + - proto 
+ - PROTO + EnclosingFunctions: + - EqualsProto + - EquivToProto + - PARSE_PARTIAL_TEXT_PROTO + - PARSE_TEST_PROTO + - PARSE_TEXT_PROTO + - ParseTextOrDie + - ParseTextProtoOrDie + CanonicalDelimiter: '' + BasedOnStyle: google +# Enabling comment reflow causes doxygen comments to be messed up in their formats! +ReflowComments: false +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +# We are C++14, but clang-format puts this under `Cpp11` itself +Standard: Cpp11 +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +# Be consistent with indent-width, even for people who use tab for indentation! +TabWidth: 2 +UseTab: Never +... 
diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy new file mode 100644 index 0000000000..30d96069b0 --- /dev/null +++ b/cpp/.clang-tidy @@ -0,0 +1,230 @@ +--- +# Refer to the following link for the explanation of each params: +# https://releases.llvm.org/8.0.1/tools/clang/tools/extra/docs/clang-tidy/checks/list.html +Checks: 'clang-diagnostic-*,clang-analyzer-*,modernize-*,-modernize-make-*,-modernize-raw-string-literal,google-*,-google-default-arguments,-clang-diagnostic-#pragma-messages,readability-identifier-naming,-*,modernize-*,-modernize-make-*,-modernize-raw-string-literal,google-*,-google-default-arguments,-clang-diagnostic-#pragma-messages,readability-identifier-naming' +WarningsAsErrors: '' +HeaderFilterRegex: '' +AnalyzeTemporaryDtors: false +FormatStyle: none +User: snanditale +CheckOptions: + - key: google-build-namespaces.HeaderFileExtensions + value: ',h,hh,hpp,hxx' + - key: google-global-names-in-headers.HeaderFileExtensions + value: ',h,hh,hpp,hxx' + - key: google-readability-braces-around-statements.ShortStatementLines + value: '1' + - key: google-readability-function-size.BranchThreshold + value: '4294967295' + - key: google-readability-function-size.LineThreshold + value: '4294967295' + - key: google-readability-function-size.NestingThreshold + value: '4294967295' + - key: google-readability-function-size.ParameterThreshold + value: '4294967295' + - key: google-readability-function-size.StatementThreshold + value: '800' + - key: google-readability-function-size.VariableThreshold + value: '4294967295' + - key: google-readability-namespace-comments.ShortNamespaceLines + value: '10' + - key: google-readability-namespace-comments.SpacesBeforeComments + value: '2' + - key: google-runtime-int.SignedTypePrefix + value: int + - key: google-runtime-int.TypeSuffix + value: '' + - key: google-runtime-int.UnsignedTypePrefix + value: uint + - key: google-runtime-references.WhiteListTypes + value: '' + - key: modernize-loop-convert.MaxCopySize + value: '16' + - 
key: modernize-loop-convert.MinConfidence + value: reasonable + - key: modernize-loop-convert.NamingStyle + value: CamelCase + - key: modernize-pass-by-value.IncludeStyle + value: llvm + - key: modernize-pass-by-value.ValuesOnly + value: '0' + - key: modernize-replace-auto-ptr.IncludeStyle + value: llvm + - key: modernize-replace-random-shuffle.IncludeStyle + value: llvm + - key: modernize-use-auto.MinTypeNameLength + value: '5' + - key: modernize-use-auto.RemoveStars + value: '0' + - key: modernize-use-default-member-init.IgnoreMacros + value: '1' + - key: modernize-use-default-member-init.UseAssignment + value: '0' + - key: modernize-use-emplace.ContainersWithPushBack + value: '::std::vector;::std::list;::std::deque' + - key: modernize-use-emplace.SmartPointers + value: '::std::shared_ptr;::std::unique_ptr;::std::auto_ptr;::std::weak_ptr' + - key: modernize-use-emplace.TupleMakeFunctions + value: '::std::make_pair;::std::make_tuple' + - key: modernize-use-emplace.TupleTypes + value: '::std::pair;::std::tuple' + - key: modernize-use-equals-default.IgnoreMacros + value: '1' + - key: modernize-use-noexcept.ReplacementString + value: '' + - key: modernize-use-noexcept.UseNoexceptFalse + value: '1' + - key: modernize-use-nullptr.NullMacros + value: 'NULL' + - key: modernize-use-transparent-functors.SafeMode + value: '0' + - key: modernize-use-using.IgnoreMacros + value: '1' + - key: readability-identifier-naming.AbstractClassCase + value: lower_case + - key: readability-identifier-naming.AbstractClassPrefix + value: '' + - key: readability-identifier-naming.AbstractClassSuffix + value: '' + - key: readability-identifier-naming.ClassCase + value: lower_case + - key: readability-identifier-naming.ClassPrefix + value: '' + - key: readability-identifier-naming.ClassSuffix + value: '' + - key: readability-identifier-naming.ClassConstantCase + value: CamelCase + - key: readability-identifier-naming.ClassConstantPrefix + value: 'k' + - key: 
readability-identifier-naming.ClassConstantSuffix + value: '' + - key: readability-identifier-naming.ClassMemberCase + value: lower_case + - key: readability-identifier-naming.ClassMemberPrefix + value: '' + - key: readability-identifier-naming.ClassMemberSuffix + value: '_' + - key: readability-identifier-naming.ClassMethodCase + value: lower_case + - key: readability-identifier-naming.ClassMethodPrefix + value: '' + - key: readability-identifier-naming.ClassMethodSuffix + value: '' + - key: readability-identifier-naming.ConstexprFunctionCase + value: lower_case + - key: readability-identifier-naming.ConstexprFunctionPrefix + value: '' + - key: readability-identifier-naming.ConstexprFunctionSuffix + value: '' + - key: readability-identifier-naming.ConstexprMethodCase + value: lower_case + - key: readability-identifier-naming.ConstexprMethodPrefix + value: '' + - key: readability-identifier-naming.ConstexprMethodSuffix + value: '' + - key: readability-identifier-naming.ConstexprVariableCase + value: CamelCase + - key: readability-identifier-naming.ConstexprVariablePrefix + value: 'k' + - key: readability-identifier-naming.ConstexprVariableSuffix + value: '' + - key: readability-identifier-naming.EnumCase + value: CamelCase + - key: readability-identifier-naming.EnumPrefix + value: '' + - key: readability-identifier-naming.EnumSuffix + value: '' + - key: readability-identifier-naming.EnumConstantCase + value: CamelCase + - key: readability-identifier-naming.EnumConstantPrefix + value: 'k' + - key: readability-identifier-naming.EnumConstantSuffix + value: '' + - key: readability-identifier-naming.FunctionCase + value: lower_case + - key: readability-identifier-naming.FunctionPrefix + value: '' + - key: readability-identifier-naming.FunctionSuffix + value: '' + - key: readability-identifier-naming.GlobalConstantCase + value: CamelCase + - key: readability-identifier-naming.GlobalConstantPrefix + value: 'k' + - key: readability-identifier-naming.GlobalConstantSuffix + 
value: '' + - key: readability-identifier-naming.IgnoreFailedSplit + value: '0' + - key: readability-identifier-naming.LocalVariableCase + value: 'lower_case' + - key: readability-identifier-naming.LocalVariablePrefix + value: '' + - key: readability-identifier-naming.LocalVariableSuffix + value: '' + - key: readability-identifier-naming.ConstExprVariableCase + value: 'CamelCase' + - key: readability-identifier-naming.ConstExprVariablePrefix + value: 'k' + - key: readability-identifier-naming.ConstExprVariableSuffix + value: '' + - key: readability-identifier-naming.MemberCase + value: lower_case + - key: readability-identifier-naming.MemberPrefix + value: '' + - key: readability-identifier-naming.MemberSuffix + value: '' + - key: readability-identifier-naming.NamespaceCase + value: lower_case + - key: readability-identifier-naming.NamespacePrefix + value: '' + - key: readability-identifier-naming.NamespaceSuffix + value: '' + - key: readability-identifier-naming.PrivateMemberCase + value: lower_case + - key: readability-identifier-naming.PrivateMemberPrefix + value: '' + - key: readability-identifier-naming.PrivateMemberSuffix + value: '_' + - key: readability-identifier-naming.ProtectedMemberCase + value: lower_case + - key: readability-identifier-naming.ProtectedMemberPrefix + value: '' + - key: readability-identifier-naming.ProtectedMemberSuffix + value: '_' + - key: readability-identifier-naming.StaticConstantCase + value: CamelCase + - key: readability-identifier-naming.StaticConstantPrefix + value: 'k' + - key: readability-identifier-naming.StaticConstantSuffix + value: '' + - key: readability-identifier-naming.StructCase + value: lower_case + - key: readability-identifier-naming.StructPrefix + value: '' + - key: readability-identifier-naming.StructSuffix + value: '' + - key: readability-identifier-naming.TypeAliasCase + value: lower_case + - key: readability-identifier-naming.TypeAliasPrefix + value: '' + - key: readability-identifier-naming.TypeAliasSuffix 
+ value: '' + - key: readability-identifier-naming.TypeTemplateParameterCase + value: CamelCase + - key: readability-identifier-naming.TypeTemplateParameterPrefix + value: '' + - key: readability-identifier-naming.TypeTemplateParameterSuffix + value: '' + - key: readability-identifier-naming.TypedefCase + value: lower_case + - key: readability-identifier-naming.TypedefPrefix + value: '' + - key: readability-identifier-naming.TypedefSuffix + value: '' + - key: readability-identifier-naming.VariableCase + value: lower_case + - key: readability-identifier-naming.VariablePrefix + value: '' + - key: readability-identifier-naming.VariableSuffix + value: '' +... + diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7163b0dd77..947d0318cb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -17,7 +17,7 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) -project(CUML VERSION 0.14.0 LANGUAGES CXX CUDA) +project(RAFT VERSION 0.14.0 LANGUAGES CXX CUDA) ############################################################################## # - build type --------------------------------------------------------------- @@ -34,9 +34,14 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) "Debug" "Release") endif() +# this is needed for clang-tidy runs +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + ############################################################################## # - User Options ------------------------------------------------------------ +option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) + option(EMPTY_MARKER_KERNEL "Enable empty marker kernel after nvtxRangePop" ON) option(KERNEL_INFO "Enable kernel resource usage info" OFF) @@ -45,6 +50,8 @@ option(LINE_INFO "Enable lineinfo in nvcc" OFF) option(NVTX "Enable nvtx markers" OFF) +option(BUILD_RAFT_TESTS "Build raft unit-tests" ON) + set(PARALLEL_LEVEL "" CACHE STRING "Sub-projects parallel level for compilation. 
Currently only affects FAISS" ) @@ -53,12 +60,13 @@ set(GPU_ARCHS "" CACHE STRING Pass 'ALL' if you want to compile for all supported GPU architectures. Empty string means to auto-detect the GPUs on the current system") - ############################################################################## # - Requirements ------------------------------------------------------------- find_package(CUDA 10.0 REQUIRED) +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) + ############################################################################## # - Compiler Options -------------------------------------------------------- @@ -85,7 +93,7 @@ endif(OPENMP_FOUND) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") if(${CMAKE_VERSION} VERSION_LESS "3.17.0") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++11") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++14") endif(${CMAKE_VERSION} VERSION_LESS "3.17.0") if(LINE_INFO) @@ -149,8 +157,10 @@ endif(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe --diag_suppress=unrecognized_gcc_pragma") +############################################################################## +# - dependencies ------------------------------------------------------------- - +include(cmake/Dependencies.cmake) ############################################################################## # - include paths ------------------------------------------------------------ @@ -159,17 +169,61 @@ set(RAFT_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include CACHE STRING "Path to RAFT include directories") set(RAFT_INCLUDE_DIRECTORIES - ${RAFT_INCLUDE_DIR} -) + ${RAFT_INCLUDE_DIR} + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + +if(DEFINED ENV{CONDA_PREFIX}) + message(STATUS "Using RMM installation from $ENV{CONDA_PREFIX}") + list(APPEND RAFT_INCLUDE_DIRECTORIES $ENV{CONDA_PREFIX}/include) +endif(DEFINED ENV{CONDA_PREFIX}) ############################################################################## -# - build test
executables --------------------------------------------------- +# - libraries ---------------------------------------------------------------- + +set(RAFT_LINK_LIBRARIES + ${CUDA_cublas_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_CUDART_LIBRARY} + ${CUDA_cusparse_LIBRARY} + rmm) -add_executable(test_raft - test/test.cpp) +set(RAFT_LINK_DIRECTORIES "") -target_include_directories(test_raft PRIVATE - ${RAFT_INCLUDE_DIRECTORIES}) +if(DEFINED ENV{CONDA_PREFIX}) + list(APPEND RAFT_LINK_DIRECTORIES $ENV{CONDA_PREFIX}/lib) +endif(DEFINED ENV{CONDA_PREFIX}) + +############################################################################## +# - build test executable ---------------------------------------------------- + +if(BUILD_RAFT_TESTS) + find_package(OpenMP REQUIRED) + + # keep the files in alphabetical order! + add_executable(test_raft + test/cudart_utils.cpp + test/handle.cpp + test/mr/device/buffer.cpp + test/mr/host/buffer.cpp + test/test.cpp) + + target_include_directories(test_raft + PRIVATE + ${RAFT_INCLUDE_DIRECTORIES} + ${GTEST_DIR}/include) + + target_link_directories(test_raft + PRIVATE + ${RAFT_LINK_DIRECTORIES}) + + target_link_libraries(test_raft + PRIVATE + ${RAFT_LINK_LIBRARIES} + gtestlib + gtest_mainlib + OpenMP::OpenMP_CXX + Threads::Threads) +endif(BUILD_RAFT_TESTS) ############################################################################## # - doxygen targets ---------------------------------------------------------- diff --git a/cpp/cmake/Dependencies.cmake b/cpp/cmake/Dependencies.cmake new file mode 100644 index 0000000000..d9d15a0ea8 --- /dev/null +++ b/cpp/cmake/Dependencies.cmake @@ -0,0 +1,42 @@ +#============================================================================= +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +include(ExternalProject) + +############################################################################## +# - googletest --------------------------------------------------------------- + +set(GTEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/googletest CACHE STRING + "Path to googletest repo") +include(ExternalProject) +ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG 6ce9b98f541b8bcd84c5c5b3483f29a933c4aefb + PREFIX ${GTEST_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_INSTALL_LIBDIR=lib + BUILD_BYPRODUCTS ${GTEST_DIR}/lib/libgtest.a + ${GTEST_DIR}/lib/libgtest_main.a + UPDATE_COMMAND "") +add_library(gtestlib STATIC IMPORTED) +add_library(gtest_mainlib STATIC IMPORTED) +set_property(TARGET gtestlib PROPERTY + IMPORTED_LOCATION ${GTEST_DIR}/lib/libgtest.a) +set_property(TARGET gtest_mainlib PROPERTY + IMPORTED_LOCATION ${GTEST_DIR}/lib/libgtest_main.a) +add_dependencies(gtestlib googletest) +add_dependencies(gtest_mainlib googletest) diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp index 0c12fb09dc..f380d276b2 100644 --- a/cpp/include/raft.hpp +++ b/cpp/include/raft.hpp @@ -26,4 +26,4 @@ inline std::string test_raft() { return status; } -} // namespace raft +} // namespace raft diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h new file mode 100644 index 0000000000..8bd4caf121 --- /dev/null +++ b/cpp/include/raft/cudart_utils.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 
2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +///@todo: enable once logging has been enabled in raft +//#include "logger.hpp" + +namespace raft { + +/** base exception class for the whole of raft */ +class exception : public std::exception { + public: + /** default ctor */ + explicit exception() noexcept : std::exception(), msg_() {} + + /** copy ctor */ + exception(const exception& src) noexcept + : std::exception(), msg_(src.what()) { + collect_call_stack(); + } + + /** ctor from an input message */ + explicit exception(const std::string _msg) noexcept + : std::exception(), msg_(std::move(_msg)) { + collect_call_stack(); + } + + /** get the message associated with this exception */ + const char* what() const noexcept override { return msg_.c_str(); } + + private: + /** message associated with this exception */ + std::string msg_; + + /** append call stack info to this exception's message for ease of debug */ + // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html + void collect_call_stack() noexcept { +#ifdef __GNUC__ + constexpr int kMaxStackDepth = 64; + void* stack[kMaxStackDepth]; // NOLINT + auto depth = backtrace(stack, kMaxStackDepth); + std::ostringstream oss; + oss << std::endl << "Obtained " << depth << " stack frames" << std::endl; + char** strings = backtrace_symbols(stack, 
depth); + if (strings == nullptr) { + oss << "But no stack trace could be found!" << std::endl; + msg_ += oss.str(); + return; + } + ///@todo: support for demangling of C++ symbol names + for (int i = 0; i < depth; ++i) { + oss << "#" << i << " in " << strings[i] << std::endl; + } + free(strings); + msg_ += oss.str(); +#endif // __GNUC__ + } +}; + +/** macro to throw a runtime error */ +#define THROW(fmt, ...) \ + do { \ + std::string msg; \ + char errMsg[2048]; /* NOLINT */ \ + std::snprintf(errMsg, sizeof(errMsg), \ + "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \ + msg += errMsg; \ + std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__); \ + msg += errMsg; \ + throw raft::exception(msg); \ + } while (0) + +/** macro to check for a conditional and assert on failure */ +#define ASSERT(check, fmt, ...) \ + do { \ + if (!(check)) THROW(fmt, ##__VA_ARGS__); \ + } while (0) + +/** check for cuda runtime API errors and assert accordingly */ +#define CUDA_CHECK(call) \ + do { \ + cudaError_t status = call; \ + ASSERT(status == cudaSuccess, "FAIL: call='%s'. Reason:%s", #call, \ + cudaGetErrorString(status)); \ + } while (0) + +///@todo: enable this only after we have added logging support in raft +// /** +// * @brief check for cuda runtime API errors but log error instead of raising +// * exception. 
+// */ +// #define CUDA_CHECK_NO_THROW(call) \ +// do { \ +// cudaError_t status = call; \ +// if (status != cudaSuccess) { \ +// RAFT_LOG_ERROR("CUDA call='%s' at file=%s line=%d failed with %s ", \ +// #call, __FILE__, __LINE__, cudaGetErrorString(status)); \ +// } \ +// } while (0) + +/** helper method to get max usable shared mem per block parameter */ +inline int get_shared_memory_per_block() { + int dev_id; + CUDA_CHECK(cudaGetDevice(&dev_id)); + int smem_per_blk; + CUDA_CHECK(cudaDeviceGetAttribute( + &smem_per_blk, cudaDevAttrMaxSharedMemoryPerBlock, dev_id)); + return smem_per_blk; +} +/** helper method to get multi-processor count parameter */ +inline int get_multi_processor_count() { + int dev_id; + CUDA_CHECK(cudaGetDevice(&dev_id)); + int mp_count; + CUDA_CHECK( + cudaDeviceGetAttribute(&mp_count, cudaDevAttrMultiProcessorCount, dev_id)); + return mp_count; +} + +/** Helper method to get to know warp size in device code */ +constexpr inline int warp_size() { return 32; } + +/** + * @brief Generic copy method for all kinds of transfers + * @tparam Type data type + * @param dst destination pointer + * @param src source pointer + * @param len lenth of the src/dst buffers in terms of number of elements + * @param stream cuda stream + */ +template +void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) { + CUDA_CHECK( + cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream)); +} + +/** + * @defgroup Copy Copy methods + * These are here along with the generic 'copy' method in order to improve + * code readability using explicitly specified function names + * @{ + */ +/** performs a host to device copy */ +template +void update_device(Type* d_ptr, const Type* h_ptr, size_t len, + cudaStream_t stream) { + copy(d_ptr, h_ptr, len, stream); +} + +/** performs a device to host copy */ +template +void update_host(Type* h_ptr, const Type* d_ptr, size_t len, + cudaStream_t stream) { + copy(h_ptr, d_ptr, len, stream); +} + +template 
+void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, + cudaStream_t stream) { + CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), + cudaMemcpyDeviceToDevice, stream)); +} +/** @} */ + +/** + * @defgroup Debug Utils for debugging host/device buffers + * @{ + */ +template +void print_host_vector(const char* variable_name, const T* host_mem, + size_t componentsCount, OutStream& out) { + out << variable_name << "=["; + for (size_t i = 0; i < componentsCount; ++i) { + if (i != 0) out << ","; + out << host_mem[i]; + } + out << "];\n"; +} + +template +void print_device_vector(const char* variable_name, const T* devMem, + size_t componentsCount, OutStream& out) { + T* host_mem = new T[componentsCount]; + CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), + cudaMemcpyDeviceToHost)); + print_host_vector(variable_name, host_mem, componentsCount, out); + delete[] host_mem; +} +/** @} */ + +}; // namespace raft diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp new file mode 100644 index 0000000000..81e63342ce --- /dev/null +++ b/cpp/include/raft/handle.hpp @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +///@todo: enable once we have migrated cuml-comms layer too +//#include + +#include +#include +#include +#include +#include +#include "cudart_utils.h" + +namespace raft { + +/** + * @brief Main handle object that stores all necessary context used for calling + * necessary cuda kernels and/or libraries + */ +class handle_t { + private: + static constexpr int kNumDefaultWorkerStreams = 0; + + public: + /** + * @brief Construct a handle with the specified number of worker streams + * + * @param[in] n_streams number worker streams to be created + */ + explicit handle_t(int n_streams = kNumDefaultWorkerStreams) + : dev_id_([]() -> int { + int cur_dev = -1; + CUDA_CHECK(cudaGetDevice(&cur_dev)); + return cur_dev; + }()), + num_streams_(n_streams), + device_allocator_(std::make_shared()), + host_allocator_(std::make_shared()) { + create_resources(); + } + + /** Destroys all held-up resources */ + ~handle_t() { destroy_resources(); } + + int get_device() const { return dev_id_; } + + void set_stream(cudaStream_t stream) { user_stream_ = stream; } + cudaStream_t get_stream() const { return user_stream_; } + + void set_device_allocator(std::shared_ptr allocator) { + device_allocator_ = allocator; + } + std::shared_ptr get_device_allocator() const { + return device_allocator_; + } + + void set_host_allocator(std::shared_ptr allocator) { + host_allocator_ = allocator; + } + std::shared_ptr get_host_allocator() const { + return host_allocator_; + } + + cublasHandle_t get_cublas_handle() const { + std::lock_guard _(mutex_); + if (!cublas_initialized_) { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + cublas_initialized_ = true; + } + return cublas_handle_; + } + + cusolverDnHandle_t get_cusolver_dn_handle() const { + std::lock_guard _(mutex_); + if (!cusolver_dn_initialized_) { + CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_)); + 
cusolver_dn_initialized_ = true; + } + return cusolver_dn_handle_; + } + + cusolverSpHandle_t get_cusolver_sp_handle() const { + std::lock_guard _(mutex_); + if (!cusolver_sp_initialized_) { + CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_)); + cusolver_sp_initialized_ = true; + } + return cusolver_sp_handle_; + } + + cusparseHandle_t get_cusparse_handle() const { + std::lock_guard _(mutex_); + if (!cusparse_initialized_) { + CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); + cusparse_initialized_ = true; + } + return cusparse_handle_; + } + + cudaStream_t get_internal_stream(int sid) const { return streams_[sid]; } + int get_num_internal_streams() const { return num_streams_; } + std::vector get_internal_streams() const { + std::vector int_streams_vec(num_streams_); + for (auto s : streams_) { + int_streams_vec.push_back(s); + } + return int_streams_vec; + } + + void wait_on_user_stream() const { + CUDA_CHECK(cudaEventRecord(event_, user_stream_)); + for (auto s : streams_) { + CUDA_CHECK(cudaStreamWaitEvent(s, event_, 0)); + } + } + + void wait_on_internal_streams() const { + for (auto s : streams_) { + CUDA_CHECK(cudaEventRecord(event_, s)); + CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0)); + } + } + + ///@todo: enable this once we have cuml-comms migrated + // void setCommunicator( + // std::shared_ptr communicator); + // const MLCommon::cumlCommunicator& getCommunicator() const; + // bool commsInitialized() const; + + const cudaDeviceProp& get_device_properties() const { + std::lock_guard _(mutex_); + if (!device_prop_initialized_) { + CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_)); + device_prop_initialized_ = true; + } + return prop_; + } + + private: + const int dev_id_; + const int num_streams_; + std::vector streams_; + mutable cublasHandle_t cublas_handle_; + mutable bool cublas_initialized_{false}; + mutable cusolverDnHandle_t cusolver_dn_handle_; + mutable bool cusolver_dn_initialized_{false}; + mutable cusolverSpHandle_t 
cusolver_sp_handle_; + mutable bool cusolver_sp_initialized_{false}; + mutable cusparseHandle_t cusparse_handle_; + mutable bool cusparse_initialized_{false}; + std::shared_ptr device_allocator_; + std::shared_ptr host_allocator_; + cudaStream_t user_stream_{nullptr}; + cudaEvent_t event_; + mutable cudaDeviceProp prop_; + mutable bool device_prop_initialized_{false}; + mutable std::mutex mutex_; + + ///@todo: enable this once we have migrated cuml-comms + //std::shared_ptr _communicator; + + void create_resources() { + for (int i = 0; i < num_streams_; ++i) { + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + streams_.push_back(stream); + } + CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } + + void destroy_resources() { + ///@todo: enable *_NO_THROW variants once we have enabled logging + if (cusparse_initialized_) { + //CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_)); + CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); + } + if (cusolver_dn_initialized_) { + //CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_)); + CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_)); + } + if (cusolver_sp_initialized_) { + //CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_)); + CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_)); + } + if (cublas_initialized_) { + //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_)); + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + } + while (!streams_.empty()) { + //CUDA_CHECK_NO_THROW(cudaStreamDestroy(streams_.back())); + CUDA_CHECK(cudaStreamDestroy(streams_.back())); + streams_.pop_back(); + } + //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_)); + CUDA_CHECK(cudaEventDestroy(event_)); + } +}; // class handle_t + +/** + * @brief RAII approach to synchronizing across all streams in the handle + */ +class stream_syncer { + public: + explicit stream_syncer(const handle_t& handle) : handle_(handle) { + handle_.wait_on_user_stream(); 
+ } + ~stream_syncer() { handle_.wait_on_internal_streams(); } + + stream_syncer(const stream_syncer& other) = delete; + stream_syncer& operator=(const stream_syncer& other) = delete; + + private: + const handle_t& handle_; +}; // class stream_syncer + +} // namespace raft diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h new file mode 100644 index 0000000000..cd8a508a84 --- /dev/null +++ b/cpp/include/raft/linalg/cublas_wrappers.h @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +///@todo: enable this once we have logger enabled +//#include +#include +#include + +namespace raft { +namespace linalg { + +#define _CUBLAS_ERR_TO_STR(err) \ + case err: \ + return #err +inline const char *cublas_error_to_string(cublasStatus_t err) { + switch (err) { + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ALLOC_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INVALID_VALUE); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_ARCH_MISMATCH); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_MAPPING_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_EXECUTION_FAILED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED); + _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR); + default: + return "CUBLAS_STATUS_UNKNOWN"; + }; +} +#undef _CUBLAS_ERR_TO_STR + +/** check for cublas runtime API errors and assert accordingly */ +#define CUBLAS_CHECK(call) \ + do { \ + cublasStatus_t err = call; \ + ASSERT(err == CUBLAS_STATUS_SUCCESS, \ + "CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \ + raft::linalg::cublas_error_to_string(err)); \ + } while (0) + +///@todo: enable this once we have logging enabled +// /** check for cublas runtime API errors but do not assert */ +// #define CUBLAS_CHECK_NO_THROW(call) \ +// do { \ +// cublasStatus_t err = call; \ +// if (err != CUBLAS_STATUS_SUCCESS) { \ +// CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \ +// raft::linalg::cublas_error_to_string(err)); \ +// } \ +// } while (0) + +/** + * @defgroup Axpy cublas ax+y operations + * @{ + */ +template +cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, const T *alpha, + const T *x, int incx, T *y, int incy, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, + const float *alpha, const float *x, int incx, + float *y, int incy, cudaStream_t stream) { + 
CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSaxpy(handle, n, alpha, x, incx, y, incy); +} + +template <> +inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, + const double *alpha, const double *x, int incx, + double *y, int incy, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDaxpy(handle, n, alpha, x, incx, y, incy); +} +/** @} */ + +/** + * @defgroup gemv cublas gemv calls + * @{ + */ +template +cublasStatus_t cublasgemv(cublasHandle_t handle, cublasOperation_t transA, + int m, int n, const T *alfa, const T *A, int lda, + const T *x, int incx, const T *beta, T *y, int incy, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgemv(cublasHandle_t handle, + cublasOperation_t transA, int m, int n, + const float *alfa, const float *A, int lda, + const float *x, int incx, const float *beta, + float *y, int incy, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, + incy); +} + +template <> +inline cublasStatus_t cublasgemv(cublasHandle_t handle, + cublasOperation_t transA, int m, int n, + const double *alfa, const double *A, int lda, + const double *x, int incx, const double *beta, + double *y, int incy, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, + incy); +} +/** @} */ + +/** + * @defgroup ger cublas a(x*y.T) + A calls + * @{ + */ +template +cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, const T *alpha, + const T *x, int incx, const T *y, int incy, T *A, + int lda, cudaStream_t stream); +template <> +inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, + const float *alpha, const float *x, int incx, + const float *y, int incy, float *A, int lda, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSger(handle, m, n, alpha, 
x, incx, y, incy, A, lda); +} + +template <> +inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, + const double *alpha, const double *x, int incx, + const double *y, int incy, double *A, int lda, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda); +} +/** @} */ + +/** + * @defgroup gemm cublas gemm calls + * @{ + */ +template +cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA, + cublasOperation_t transB, int m, int n, int k, + const T *alfa, const T *A, int lda, const T *B, + int ldb, const T *beta, T *C, int ldc, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgemm(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, int m, int n, int k, + const float *alfa, const float *A, int lda, + const float *B, int ldb, const float *beta, + float *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, + beta, C, ldc); +} + +template <> +inline cublasStatus_t cublasgemm(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, int m, int n, int k, + const double *alfa, const double *A, int lda, + const double *B, int ldb, const double *beta, + double *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, + beta, C, ldc); +} +/** @} */ + +/** + * @defgroup gemmbatched cublas gemmbatched calls + * @{ + */ +template +cublasStatus_t cublasgemmBatched(cublasHandle_t handle, // NOLINT + cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const T *alpha, + const T *const Aarray[], // NOLINT + int lda, const T *const Barray[], // NOLINT + int ldb, const T *beta, + T *Carray[], // NOLINT + int ldc, int batchCount, cudaStream_t stream); + +template <> +inline 
cublasStatus_t cublasgemmBatched( // NOLINT + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, + const float *const Aarray[], // NOLINT + int lda, const float *const Barray[], // NOLINT + int ldb, const float *beta, float *Carray[], // NOLINT + int ldc, int batchCount, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, + Barray, ldb, beta, Carray, ldc, batchCount); +} + +template <> +inline cublasStatus_t cublasgemmBatched( // NOLINT + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, + const double *const Aarray[], // NOLINT + int lda, const double *const Barray[], // NOLINT + int ldb, const double *beta, double *Carray[], // NOLINT + int ldc, int batchCount, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda, + Barray, ldb, beta, Carray, ldc, batchCount); +} +/** @} */ + +/** + * @defgroup gemmbatched cublas gemmbatched calls + * @{ + */ +template +cublasStatus_t cublasgemmStridedBatched( // NOLINT + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const T *alpha, const T *const Aarray, int lda, + int64_t strideA, const T *const Barray, int ldb, int64_t strideB, + const T *beta, T *Carray, int ldc, int64_t strideC, int batchCount, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgemmStridedBatched( // NOLINT + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, const float *const Aarray, int lda, + int64_t strideA, const float *const Barray, int ldb, int64_t strideB, + const float *beta, float *Carray, int ldc, int64_t strideC, int batchCount, + cudaStream_t stream) { + 
CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha, + Aarray, lda, strideA, Barray, ldb, strideB, + beta, Carray, ldc, strideC, batchCount); +} + +template <> +inline cublasStatus_t cublasgemmStridedBatched( // NOLINT + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, const double *const Aarray, int lda, + int64_t strideA, const double *const Barray, int ldb, int64_t strideB, + const double *beta, double *Carray, int ldc, int64_t strideC, int batchCount, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, + Aarray, lda, strideA, Barray, ldb, strideB, + beta, Carray, ldc, strideC, batchCount); +} +/** @} */ + +/** + * @defgroup solverbatched cublas getrf/gettribatched calls + * @{ + */ + +template +cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, int n, // NOLINT + T *const A[], // NOLINT + int lda, int *P, int *info, int batchSize, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, float *const A[], // NOLINT + int lda, int *P, int *info, + int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize); +} + +template <> +inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, // NOLINT + int n, double *const A[], // NOLINT + int lda, int *P, int *info, + int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize); +} + +template +cublasStatus_t cublasgetriBatched(cublasHandle_t handle, int n, // NOLINT + const T *const A[], // NOLINT + int lda, const int *P, + T *const C[], // NOLINT + int ldc, int *info, int batchSize, + cudaStream_t stream); + +template <> 
+inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, int n, const float *const A[], // NOLINT + int lda, const int *P, float *const C[], // NOLINT + int ldc, int *info, int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +template <> +inline cublasStatus_t cublasgetriBatched( // NOLINT + cublasHandle_t handle, int n, const double *const A[], // NOLINT + int lda, const int *P, double *const C[], // NOLINT + int ldc, int *info, int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +/** @} */ + +/** + * @defgroup gelsbatched cublas gelsbatched calls + * @{ + */ + +template +inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT + cublasOperation_t trans, int m, int n, + int nrhs, T *Aarray[], // NOLINT + int lda, T *Carray[], // NOLINT + int ldc, int *info, int *devInfoArray, + int batchSize, cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT + cublasOperation_t trans, int m, int n, + int nrhs, float *Aarray[], // NOLINT + int lda, float *Carray[], // NOLINT + int ldc, int *info, int *devInfoArray, + int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + info, devInfoArray, batchSize); +} + +template <> +inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle, // NOLINT + cublasOperation_t trans, int m, int n, + int nrhs, double *Aarray[], // NOLINT + int lda, double *Carray[], // NOLINT + int ldc, int *info, int *devInfoArray, + int batchSize, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, + info, devInfoArray, 
batchSize); +} + +/** @} */ + +/** + * @defgroup geam cublas geam calls + * @{ + */ +template +cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA, + cublasOperation_t transB, int m, int n, const T *alfa, + const T *A, int lda, const T *beta, const T *B, + int ldb, T *C, int ldc, cudaStream_t stream); + +template <> +inline cublasStatus_t cublasgeam(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, int m, int n, + const float *alfa, const float *A, int lda, + const float *beta, const float *B, int ldb, + float *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, + C, ldc); +} + +template <> +inline cublasStatus_t cublasgeam(cublasHandle_t handle, + cublasOperation_t transA, + cublasOperation_t transB, int m, int n, + const double *alfa, const double *A, int lda, + const double *beta, const double *B, int ldb, + double *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, + C, ldc); +} +/** @} */ + +/** + * @defgroup symm cublas symm calls + * @{ + */ +template +cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, int m, int n, const T *alpha, + const T *A, int lda, const T *B, int ldb, + const T *beta, T *C, int ldc, cudaStream_t stream); + +template <> +inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, int m, int n, + const float *alpha, const float *A, int lda, + const float *B, int ldb, const float *beta, + float *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +template <> +inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t 
uplo, int m, int n, + const double *alpha, const double *A, int lda, + const double *B, int ldb, const double *beta, + double *C, int ldc, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); +} +/** @} */ + +/** + * @defgroup syrk cublas syrk calls + * @{ + */ +template +cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, const T *alpha, + const T *A, int lda, const T *beta, T *C, int ldc, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, + const float *alpha, const float *A, int lda, + const float *beta, float *C, int ldc, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +template <> +inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo, + cublasOperation_t trans, int n, int k, + const double *alpha, const double *A, int lda, + const double *beta, double *C, int ldc, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} +/** @} */ + +/** + * @defgroup nrm2 cublas nrm2 calls + * @{ + */ +template +cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const T *x, int incx, + T *result, cudaStream_t stream); + +template <> +inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const float *x, + int incx, float *result, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSnrm2(handle, n, x, incx, result); +} + +template <> +inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const double *x, + int incx, double *result, + cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return 
cublasDnrm2(handle, n, x, incx, result); +} +/** @} */ + +template +cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, const T *alpha, + const T *A, int lda, T *B, int ldb, + cudaStream_t stream); + +template <> +inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, + const float *alpha, const float *A, int lda, + float *B, int ldb, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, + ldb); +} + +template <> +inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int m, int n, + const double *alpha, const double *A, int lda, + double *B, int ldb, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, + ldb); +} + +/** + * @defgroup dot cublas dot calls + * @{ + */ +template +cublasStatus_t cublasdot(cublasHandle_t handle, int n, const T *x, int incx, + const T *y, int incy, T *result, cudaStream_t stream); + +template <> +inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const float *x, + int incx, const float *y, int incy, + float *result, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasSdot(handle, n, x, incx, y, incy, result); +} + +template <> +inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x, + int incx, const double *y, int incy, + double *result, cudaStream_t stream) { + CUBLAS_CHECK(cublasSetStream(handle, stream)); + return cublasDdot(handle, n, x, incx, y, incy, result); +} +/** @} */ + +}; // namespace linalg +}; // namespace raft diff --git 
a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h new file mode 100644 index 0000000000..92ba1a2194 --- /dev/null +++ b/cpp/include/raft/linalg/cusolver_wrappers.h @@ -0,0 +1,687 @@ +/* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +///@todo: enable this once logging is enabled +//#include +#include + +namespace raft { +namespace linalg { + +#define _CUSOLVER_ERR_TO_STR(err) \ + case err: \ + return #err; +inline const char *cusolver_error_to_string(cusolverStatus_t err) { + switch (err) { + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ALLOC_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INVALID_VALUE); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ARCH_MISMATCH); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_EXECUTION_FAILED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_INTERNAL_ERROR); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT); + _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED); + default: + return "CUSOLVER_STATUS_UNKNOWN"; + }; +} +#undef _CUSOLVER_ERR_TO_STR + +/** check for cusolver runtime API errors and assert accordingly */ +#define CUSOLVER_CHECK(call) \ + do { \ + cusolverStatus_t err = call; \ + ASSERT(err == CUSOLVER_STATUS_SUCCESS, \ + "CUSOLVER 
call='%s' got errorcode=%d err=%s", #call, err, \ + raft::linalg::cusolver_error_to_string(err)); \ + } while (0) + +///@todo: enable this once logging is enabled +// /** check for cusolver runtime API errors but do not assert */ +// #define CUSOLVER_CHECK_NO_THROW(call) \ +// do { \ +// cusolverStatus_t err = call; \ +// if (err != CUSOLVER_STATUS_SUCCESS) { \ +// CUML_LOG_ERROR("CUSOLVER call='%s' got errorcode=%d err=%s", #call, err, \ +// raft::linalg::cusolver_error_to_string(err)); \ +// } \ +// } while (0) + +/** + * @defgroup Getrf cusolver getrf operations + * @{ + */ +template +cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, int m, // NOLINT + int n, T *A, int lda, T *Workspace, + int *devIpiv, int *devInfo, + cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT + int m, int n, float *A, int lda, + float *Workspace, int *devIpiv, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +template <> +inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, // NOLINT + int m, int n, double *A, int lda, + double *Workspace, int *devIpiv, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +template +cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); + +template <> +inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork); +} + +template <> +inline cusolverStatus_t cusolverDngetrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + return 
cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork); +} + +/** + * @defgroup Getrs cusolver getrs operations + * @{ + */ +template +cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT + cublasOperation_t trans, int n, int nrhs, + const T *A, int lda, const int *devIpiv, T *B, + int ldb, int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT + cublasOperation_t trans, int n, + int nrhs, const float *A, int lda, + const int *devIpiv, float *B, int ldb, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, + devInfo); +} + +template <> +inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle, // NOLINT + cublasOperation_t trans, int n, + int nrhs, const double *A, int lda, + const int *devIpiv, double *B, int ldb, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, + devInfo); +} +/** @} */ + +/** + * @defgroup syevd cusolver syevd operations + * @{ + */ +template +cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const T *A, int lda, const T *W, int *lwork); + +template <> +inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const float *A, int lda, const float *W, int *lwork) { + return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); +} + +template <> +inline cusolverStatus_t cusolverDnsyevd_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const double *A, int lda, const double *W, int *lwork) { + return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork); 
+} +/** @} */ + +/** + * @defgroup syevj cusolver syevj operations + * @{ + */ +template +cusolverStatus_t cusolverDnsyevj(cusolverDnHandle_t handle, // NOLINT + cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, T *A, int lda, T *W, T *work, int lwork, + int *info, syevjInfo_t params, + cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnsyevj( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, float *A, int lda, float *W, float *work, int lwork, int *info, + syevjInfo_t params, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, + params); +} + +template <> +inline cusolverStatus_t cusolverDnsyevj( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, double *A, int lda, double *W, double *work, int lwork, int *info, + syevjInfo_t params, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, + params); +} + +template +cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const T *A, int lda, const T *W, int *lwork, syevjInfo_t params); + +template <> +inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const float *A, int lda, const float *W, int *lwork, + syevjInfo_t params) { + return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, + params); +} + +template <> +inline cusolverStatus_t cusolverDnsyevj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const double *A, int lda, const double *W, int *lwork, + syevjInfo_t params) { + return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, 
W, lwork, + params); +} +/** @} */ + +/** + * @defgroup syevd cusolver syevd operations + * @{ + */ +template +cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT + cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, T *A, int lda, T *W, T *work, int lwork, + int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT + cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, float *A, + int lda, float *W, float *work, + int lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, + devInfo); +} + +template <> +inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle, // NOLINT + cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, double *A, + int lda, double *W, double *work, + int lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, + devInfo); +} +/** @} */ + +#if CUDART_VERSION >= 10010 +/** + * @defgroup syevdx cusolver syevdx operations + * @{ +*/ +template +cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const T *A, int lda, T vl, T vu, int il, int iu, + int *h_meig, const T *W, int *lwork); + +template <> +inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu, + int il, int iu, int *h_meig, const float *W, int *lwork) { + return cusolverDnSsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, + vu, il, iu, h_meig, W, lwork); +} + +template <> +inline cusolverStatus_t cusolverDnsyevdx_bufferSize( // NOLINT + cusolverDnHandle_t handle, 
cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const double *A, int lda, double vl, double vu, + int il, int iu, int *h_meig, const double *W, int *lwork) { + return cusolverDnDsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl, + vu, il, iu, h_meig, W, lwork); +} + +template +cusolverStatus_t cusolverDnsyevdx( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, T *A, int lda, T vl, T vu, int il, int iu, + int *h_meig, T *W, T *work, int lwork, int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnsyevdx( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il, + int iu, int *h_meig, float *W, float *work, int lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, + h_meig, W, work, lwork, devInfo); +} + +template <> +inline cusolverStatus_t cusolverDnsyevdx( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu, + int il, int iu, int *h_meig, double *W, double *work, int lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, + h_meig, W, work, lwork, devInfo); +} +/** @} */ +#endif + +/** + * @defgroup svd cusolver svd operations + * @{ + */ +template +cusolverStatus_t cusolverDngesvd_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, int *lwork) { + if (typeid(T) == typeid(float)) { + return cusolverDnSgesvd_bufferSize(handle, m, n, lwork); + } else { + return cusolverDnDgesvd_bufferSize(handle, m, n, lwork); + } +} +template +cusolverStatus_t 
cusolverDngesvd( // NOLINT + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, + T *A, int lda, T *S, T *U, int ldu, T *VT, int ldvt, T *work, int lwork, + T *rwork, int *devInfo, cudaStream_t stream); +template <> +inline cusolverStatus_t cusolverDngesvd( // NOLINT + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, + float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, + float *work, int lwork, float *rwork, int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, + ldvt, work, lwork, rwork, devInfo); +} +template <> +inline cusolverStatus_t cusolverDngesvd( // NOLINT + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n, + double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt, + double *work, int lwork, double *rwork, int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, + ldvt, work, lwork, rwork, devInfo); +} + +template +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const T *A, int lda, const T *S, const T *U, int ldu, const T *V, int ldv, + int *lwork, gesvdjInfo_t params); +template <> +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const float *A, int lda, const float *S, const float *U, int ldu, + const float *V, int ldv, int *lwork, gesvdjInfo_t params) { + return cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, + ldu, V, ldv, lwork, params); +} +template <> +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int 
econ, int m, int n, + const double *A, int lda, const double *S, const double *U, int ldu, + const double *V, int ldv, int *lwork, gesvdjInfo_t params) { + return cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U, + ldu, V, ldv, lwork, params); +} +template +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + T *A, int lda, T *S, T *U, int ldu, T *V, int ldv, T *work, int lwork, + int *info, gesvdjInfo_t params, cudaStream_t stream); +template <> +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + float *A, int lda, float *S, float *U, int ldu, float *V, int ldv, + float *work, int lwork, int *info, gesvdjInfo_t params, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, + work, lwork, info, params); +} +template <> +inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj( // NOLINT + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + double *A, int lda, double *S, double *U, int ldu, double *V, int ldv, + double *work, int lwork, int *info, gesvdjInfo_t params, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, + work, lwork, info, params); +} +/** @} */ + +/** + * @defgroup potrf cusolver potrf operations + * @{ + */ +template +cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, T *A, int lda, + int *Lwork); + +template <> +inline cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda, + int *Lwork) { + return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); +} + +template <> +inline 
cusolverStatus_t cusolverDnpotrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda, + int *Lwork) { + return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork); +} + +template +inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, T *A, + int lda, T *Workspace, int Lwork, + int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, float *A, + int lda, float *Workspace, int Lwork, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} + +template <> +inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, double *A, + int lda, double *Workspace, int Lwork, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} +/** @} */ + +/** + * @defgroup potrs cusolver potrs operations + * @{ + */ +template +cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, int nrhs, + const T *A, int lda, T *B, int ldb, + int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, int nrhs, + const float *A, int lda, float *B, + int ldb, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} + +template <> +inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle, // NOLINT + cublasFillMode_t uplo, int n, int nrhs, + const double *A, int lda, double *B, + int ldb, int *devInfo, + cudaStream_t stream) { + 
CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} +/** @} */ + +/** + * @defgroup geqrf cusolver geqrf operations + * @{ + */ +template +cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, int m, // NOLINT + int n, T *A, int lda, T *TAU, T *Workspace, + int Lwork, int *devInfo, cudaStream_t stream); +template <> +inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT + int m, int n, float *A, int lda, + float *TAU, float *Workspace, int Lwork, + int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} +template <> +inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, // NOLINT + int m, int n, double *A, int lda, + double *TAU, double *Workspace, + int Lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} + +template +cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork); +template <> +inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork); +} +template <> +inline cusolverStatus_t cusolverDngeqrf_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork); +} +/** @} */ + +/** + * @defgroup orgqr cusolver orgqr operations + * @{ + */ +template +cusolverStatus_t cusolverDnorgqr( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, T *A, int lda, const T *tau, + T *work, int lwork, int *devInfo, cudaStream_t stream); +template <> +inline cusolverStatus_t cusolverDnorgqr( // 
NOLINT + cusolverDnHandle_t handle, int m, int n, int k, float *A, int lda, + const float *tau, float *work, int lwork, int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); +} +template <> +inline cusolverStatus_t cusolverDnorgqr( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, double *A, int lda, + const double *tau, double *work, int lwork, int *devInfo, + cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo); +} + +template +cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, const T *A, int lda, + const T *TAU, int *lwork); +template <> +inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda, + const float *TAU, int *lwork) { + return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); +} +template <> +inline cusolverStatus_t cusolverDnorgqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda, + const double *TAU, int *lwork) { + return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork); +} +/** @} */ + +/** + * @defgroup ormqr cusolver ormqr operations + * @{ + */ +template +cusolverStatus_t cusolverDnormqr(cusolverDnHandle_t handle, // NOLINT + cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const T *A, int lda, + const T *tau, T *C, int ldc, T *work, + int lwork, int *devInfo, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverDnormqr( // NOLINT + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const float *A, int lda, const float *tau, float *C, + int ldc, float *work, int lwork, int *devInfo, cudaStream_t stream) { + 
CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, + work, lwork, devInfo); +} + +template <> +inline cusolverStatus_t cusolverDnormqr( // NOLINT + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const double *A, int lda, const double *tau, double *C, + int ldc, double *work, int lwork, int *devInfo, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverDnSetStream(handle, stream)); + return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, + work, lwork, devInfo); +} + +template +cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const T *A, int lda, const T *tau, const T *C, int ldc, + int *lwork); + +template <> +inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const float *A, int lda, const float *tau, + const float *C, int ldc, int *lwork) { + return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, + C, ldc, lwork); +} + +template <> +inline cusolverStatus_t cusolverDnormqr_bufferSize( // NOLINT + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const double *A, int lda, const double *tau, + const double *C, int ldc, int *lwork) { + return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, + C, ldc, lwork); +} +/** @} */ + +/** + * @defgroup csrqrBatched cusolver batched + * @{ + */ +template +cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, + const int *csrColIndA, int batchSize, csrqrInfo_t info, + size_t *internalDataInBytes, size_t *workspaceInBytes); + +template <> +inline 
cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, + const int *csrColIndA, int batchSize, csrqrInfo_t info, + size_t *internalDataInBytes, size_t *workspaceInBytes) { + return cusolverSpScsrqrBufferInfoBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, + info, internalDataInBytes, workspaceInBytes); +} + +template <> +inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA, + const int *csrColIndA, int batchSize, csrqrInfo_t info, + size_t *internalDataInBytes, size_t *workspaceInBytes) { + return cusolverSpDcsrqrBufferInfoBatched( + handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize, + info, internalDataInBytes, workspaceInBytes); +} + +template +cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA, + const int *csrColIndA, const T *b, T *x, int batchSize, csrqrInfo_t info, + void *pBuffer, cudaStream_t stream); + +template <> +inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA, + const int *csrColIndA, const float *b, float *x, int batchSize, + csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); + return cusolverSpScsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, + csrRowPtrA, csrColIndA, b, x, batchSize, + info, pBuffer); +} + +template <> +inline cusolverStatus_t cusolverSpcsrqrsvBatched( // NOLINT + cusolverSpHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double 
*csrValA, const int *csrRowPtrA, + const int *csrColIndA, const double *b, double *x, int batchSize, + csrqrInfo_t info, void *pBuffer, cudaStream_t stream) { + CUSOLVER_CHECK(cusolverSpSetStream(handle, stream)); + return cusolverSpDcsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA, + csrRowPtrA, csrColIndA, b, x, batchSize, + info, pBuffer); +} +/** @} */ + +}; // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/mr/allocator.hpp b/cpp/include/raft/mr/allocator.hpp new file mode 100644 index 0000000000..707b71d468 --- /dev/null +++ b/cpp/include/raft/mr/allocator.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace mr { + +/** + * @brief Interface for an asynchronous device/host allocator. + * + * An implementation of this interface can make the following assumptions: + * - It does not need to be but it can allow async allocate and deallocate. + * + * @note This interface does NOT support RAII. Thus, if you need RAII-enabled + * interface, better to use `device_buffer` or `host_buffer`. + */ +class base_allocator { + public: + /** + * @brief Asynchronously allocates a memory region. + * + * An implementation of this need to return a allocation of n bytes properly + * align bytes on the configured device. 
The allocation can optionally be + * asynchronous in the sense that it is only safe to use after all work + * submitted to the passed in stream prior to the call to allocate has + * completed. If the allocation is used before, e.g. in another stream the + * behaviour may be undefined. + * @todo: Add alignment requirements. + * + * @param[in] n number of bytes to allocate + * @param[in] stream stream to issue the possible asynchronous allocation in + */ + virtual void* allocate(std::size_t n, cudaStream_t stream) = 0; + + /** + * @brief Asynchronously deallocates device memory + * + * An implementation of this needs to ensure that the allocation that the + * passed in pointer points to remains usable until all work scheduled in + * stream prior to the call to deallocate has completed. + * + * @param[inout] p pointer to the buffer to deallocate + * @param[in] n size of the buffer to deallocate in bytes + * @param[in] stream stream in which the allocation might be still in use + */ + virtual void deallocate(void* p, std::size_t n, cudaStream_t stream) = 0; + + virtual ~base_allocator() = default; +}; // class base_allocator + +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp new file mode 100644 index 0000000000..f1d74d4b24 --- /dev/null +++ b/cpp/include/raft/mr/buffer_base.hpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace mr { + +/** + * @brief Base for all RAII-based owning of temporary memory allocations. This + * class should ideally not be used by users directly, but instead via + * the child classes `device_buffer` and `host_buffer`. + * + * @tparam T data type + * @tparam AllocatorT The underly allocator object + */ +template +class buffer_base { + public: + using size_type = std::size_t; + using value_type = T; + using iterator = value_type*; + using const_iterator = const value_type*; + using reference = T&; + using const_reference = const T&; + + buffer_base() = delete; + + buffer_base(const buffer_base& other) = delete; + + buffer_base& operator=(const buffer_base& other) = delete; + + /** + * @brief Main ctor + * + * @param[in] allocator asynchronous allocator used for managing buffer life + * @param[in] stream cuda stream where this allocation operations are async + * @param[in] n size of the buffer (in number of elements) + */ + buffer_base(std::shared_ptr allocator, cudaStream_t stream, + size_type n = 0) + : size_(n), + capacity_(n), + data_(nullptr), + stream_(stream), + allocator_(std::move(allocator)) { + if (capacity_ > 0) { + data_ = static_cast( + allocator_->allocate(capacity_ * sizeof(value_type), stream_)); + CUDA_CHECK(cudaStreamSynchronize(stream_)); + } + } + + ~buffer_base() { release(); } + + value_type* data() { return data_; } + + const value_type* data() const { return data_; } + + size_type size() const { return size_; } + + void clear() { size_ = 0; } + + iterator begin() { return data_; } + + const_iterator begin() const { return data_; } + + iterator end() { return data_ + size_; } + + const_iterator end() const { return data_ + size_; } + + /** + * @brief Reserve new memory size for this buffer. 
+ * + * It re-allocates a fresh buffer if the new requested capacity is more than + * the current one, copies the old buffer contents to this new buffer and + * removes the old one. + * + * @param[in] new_capacity new capacity (in number of elements) + * @param[in] stream cuda stream where allocation operations are queued + * @{ + */ + void reserve(size_type new_capacity) { + if (new_capacity > capacity_) { + auto* new_data = static_cast( + allocator_->allocate(new_capacity * sizeof(value_type), stream_)); + if (size_ > 0) { + raft::copy(new_data, data_, size_, stream_); + allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); + } + data_ = new_data; + capacity_ = new_capacity; + } + } + + void reserve(size_type new_capacity, cudaStream_t stream) { + set_stream(stream); + reserve(new_capacity); + } + /** @} */ + + /** + * @brief Resize the underlying buffer (uses `reserve` method internally) + * + * @param[in] new_size new buffer size + * @param[in] stream cuda stream where the work will be queued + * @{ + */ + void resize(const size_type new_size) { + reserve(new_size); + size_ = new_size; + } + + void resize(const size_type new_size, cudaStream_t stream) { + set_stream(stream); + resize(new_size); + } + /** @} */ + + /** + * @brief Deletes the underlying buffer + * + * If this method is not explicitly called, it will be during the destructor + * + * @param[in] stream cuda stream where the work will be queued + * @{ + */ + void release() { + if (nullptr != data_) { + allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_); + } + data_ = nullptr; + capacity_ = 0; + size_ = 0; + } + + void release(cudaStream_t stream) { + set_stream(stream); + release(); + } + /** @} */ + + /** + * @brief returns the underlying allocator used + * + * @return the allocator pointer + */ + std::shared_ptr get_allocator() const { return allocator_; } + + /** + * @brief returns the underlying stream used + * + * @return the cuda stream + */ + cudaStream_t 
get_stream() const { return stream_; } + + protected: + value_type* data_; + + private: + size_type size_; + size_type capacity_; + cudaStream_t stream_; + std::shared_ptr allocator_; + + /** + * @brief Sets a new cuda stream where the future operations will be queued + * + * This method makes sure that the inter-stream dependencies are met and taken + * care of, before setting the input stream as a new stream for this buffer. + * Ideally, the same cuda stream passed during constructor is expected to be + * used throughout this buffer's lifetime, for performance. + * + * @param[in] stream new cuda stream to be set. If it is the same as the + * current one, then this method will be a no-op. + */ + void set_stream(cudaStream_t stream) { + if (stream_ != stream) { + cudaEvent_t event; + CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + CUDA_CHECK(cudaEventRecord(event, stream_)); + CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0)); + stream_ = stream; + CUDA_CHECK(cudaEventDestroy(event)); + } + } +}; // class buffer_base + +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp new file mode 100644 index 0000000000..be6ea6fc67 --- /dev/null +++ b/cpp/include/raft/mr/device/allocator.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace raft { +namespace mr { +namespace device { + +/** + * @brief An explicit interface for an asynchronous device allocator. + * + * This is mostly done in order to reduce work needed in cuML codebase. + * An implementation of this interface can make the following assumptions, + * further to the ones listed in `Allocator`: + * - Allocations may be always on the device that was specified on construction. + */ +class allocator : public base_allocator {}; + +/** Default device allocator based on the one provided by RMM */ +class default_allocator : public allocator { + public: + void* allocate(std::size_t n, cudaStream_t stream) override { + void* ptr = rmm::mr::get_default_resource()->allocate(n, stream); + return ptr; + } + + void deallocate(void* p, std::size_t n, cudaStream_t stream) override { + rmm::mr::get_default_resource()->deallocate(p, n, stream); + } +}; // class default_allocator + +}; // namespace device +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp new file mode 100644 index 0000000000..39b5674ce4 --- /dev/null +++ b/cpp/include/raft/mr/device/buffer.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include "allocator.hpp" + +namespace raft { +namespace mr { +namespace device { + +/** + * @brief RAII object owning a contiguous typed device buffer. The passed in + * allocator supports asynchronous allocation and deallocation so this + * can also be used for temporary memory + * + * @code{.cpp} + * template + * void foo(..., cudaStream_t stream) { + * ... + * raft::mr::device::buffer temp(stream, 0); + * ... + * temp.resize(n); + * kernelA<<>>(...,temp.data(),...); + * kernelB<<>>(...,temp.data(),...); + * temp.release(); + * ... + * } + * @endcode + */ +template +class buffer : public buffer_base { + public: + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; + using const_reference = typename buffer_base::const_reference; + + buffer() = delete; + + buffer(const buffer& other) = delete; + + buffer& operator=(const buffer& other) = delete; + + buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) + : buffer_base(alloc, stream, n) {} +}; // class buffer + +}; // namespace device +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp new file mode 100644 index 0000000000..9ad6ea7532 --- /dev/null +++ b/cpp/include/raft/mr/host/allocator.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace mr { +namespace host { + +/** + * @brief An explicit interface for an asynchronous host allocations. + * + * This is mostly done in order to reduce work needed in cuML codebase. + * An implementation of this interface can make the following assumptions, + * further to the ones listed in `Allocator`: + * - Allocations don't need to be zero copy accessible form a device. + */ +class allocator : public base_allocator {}; + +/** Default cudaMallocHost/cudaFreeHost based host allocator */ +class default_allocator : public allocator { + public: + void* allocate(std::size_t n, cudaStream_t stream) override { + void* ptr = nullptr; + CUDA_CHECK(cudaMallocHost(&ptr, n)); + return ptr; + } + + void deallocate(void* p, std::size_t n, cudaStream_t stream) override { + ///@todo: enable this once logging is enabled in raft + //CUDA_CHECK_NO_THROW(cudaFreeHost(p)); + CUDA_CHECK(cudaFreeHost(p)); + } +}; // class default_allocator + +}; // namespace host +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp new file mode 100644 index 0000000000..c26617e072 --- /dev/null +++ b/cpp/include/raft/mr/host/buffer.hpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include "allocator.hpp" + +namespace raft { +namespace mr { +namespace host { + +/** + * @brief RAII object owning a contigous typed host buffer (aka pinned memory). + * The passed in allocator supports asynchronus allocation and + * deallocation so this can also be used for temporary memory + * + * @code{.cpp} + * template + * void foo(const T* in_d , T* out_d, ..., cudaStream_t stream) { + * ... + * raft::mr::host::buffer temp(stream, 0); + * ... + * temp.resize(n); + * raft::copy(temp.data(), in_d, temp.size()); + * ... + * raft::copy(out_d, temp.data(), temp.size()); + * temp.release(stream); + * ... 
+ * } + * @endcode + */ +template +class buffer : public buffer_base { + public: + using size_type = typename buffer_base::size_type; + using value_type = typename buffer_base::value_type; + using iterator = typename buffer_base::iterator; + using const_iterator = typename buffer_base::const_iterator; + using reference = typename buffer_base::reference; + using const_reference = typename buffer_base::const_reference; + + buffer() = delete; + + buffer(const buffer& other) = delete; + + buffer& operator=(const buffer& other) = delete; + + buffer(std::shared_ptr alloc, const device::buffer& other) + : buffer_base(alloc, other.get_stream(), other.size()) { + if (other.size() > 0) { + raft::copy(data_, other.data(), other.size(), other.get_stream()); + } + } + + buffer(std::shared_ptr alloc, cudaStream_t stream, size_type n = 0) + : buffer_base(alloc, stream, n) {} + + reference operator[](size_type pos) { return data_[pos]; } + + const_reference operator[](size_type pos) const { return data_[pos]; } + + private: + using buffer_base::data_; +}; + +}; // namespace host +}; // namespace mr +}; // namespace raft diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h new file mode 100644 index 0000000000..1c63d2348b --- /dev/null +++ b/cpp/include/raft/sparse/cusparse_wrappers.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include <cusparse_v2.h>
+///@todo: enable this once logging is enabled
+//#include <raft/common/logger.hpp>
+#include <raft/cudart_utils.h>
+
+namespace raft {
+namespace sparse {
+
+#define _CUSPARSE_ERR_TO_STR(err) \
+  case err:                       \
+    return #err;
+/** turn a cusparseStatus_t into a human-readable string */
+inline const char* cusparse_error_to_string(cusparseStatus_t err) {
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 10100
+  // fixed: the parameter is named `err`, not `status`; the previous
+  // `cusparseGetErrorString(status)` failed to compile on CUDA >= 10.1
+  return cusparseGetErrorString(err);
+#else   // CUDART_VERSION
+  switch (err) {
+    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_SUCCESS);
+    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_NOT_INITIALIZED);
+    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ALLOC_FAILED);
+    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INVALID_VALUE);
+    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_ARCH_MISMATCH);
+    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED);
+    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR);
+    _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
+    default:
+      return "CUSPARSE_STATUS_UNKNOWN";
+  };
+#endif  // CUDART_VERSION
+}
+#undef _CUSPARSE_ERR_TO_STR
+
+/** check for cusparse runtime API errors and assert accordingly */
+#define CUSPARSE_CHECK(call)                                         \
+  do {                                                               \
+    cusparseStatus_t err = call;                                     \
+    ASSERT(err == CUSPARSE_STATUS_SUCCESS,                           \
+           "CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \
+           raft::sparse::cusparse_error_to_string(err));             \
+  } while (0)
+
+///@todo: enable this once logging is enabled
+// /** check for cusparse runtime API errors but do not assert */
+// #define CUSPARSE_CHECK_NO_THROW(call)                                          \
+//   do {                                                                         \
+//     cusparseStatus_t err = call;                                               \
+//     if (err != CUSPARSE_STATUS_SUCCESS) {                                      \
+//       CUML_LOG_ERROR("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \
+//                      raft::sparse::cusparse_error_to_string(err));             \
+//     }                                                                          \
+//   } while (0)
+
+/**
+ * @defgroup gthr cusparse gather methods
+ * @{
+ */
+template <typename T>
+cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, const T* vals,
+                              T* vals_sorted, int* d_P, cudaStream_t stream);
+template <>
+inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz,
+                                     const
double* vals, double* vals_sorted, + int* d_P, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, + CUSPARSE_INDEX_BASE_ZERO); +} +template <> +inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, + const float* vals, float* vals_sorted, + int* d_P, cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, + CUSPARSE_INDEX_BASE_ZERO); +} +/** @} */ + +/** + * @defgroup coo2csr cusparse COO to CSR converter methods + * @{ + */ +template +void cusparsecoo2csr(cusparseHandle_t handle, const T* cooRowInd, int nnz, + int m, T* csrRowPtr, cudaStream_t stream); +template <> +inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd, + int nnz, int m, int* csrRowPtr, + cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, + CUSPARSE_INDEX_BASE_ZERO)); +} +/** @} */ + +/** + * @defgroup coosort cusparse coo sort methods + * @{ + */ +template +size_t cusparsecoosort_bufferSizeExt( // NOLINT + cusparseHandle_t handle, int m, int n, int nnz, const T* cooRows, + const T* cooCols, cudaStream_t stream); +template <> +inline size_t cusparsecoosort_bufferSizeExt( // NOLINT + cusparseHandle_t handle, int m, int n, int nnz, const int* cooRows, + const int* cooCols, cudaStream_t stream) { + size_t val; + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK( + cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val)); + return val; +} + +template +void cusparsecoosortByRow( // NOLINT + cusparseHandle_t handle, int m, int n, int nnz, T* cooRows, T* cooCols, T* P, + void* pBuffer, cudaStream_t stream); +template <> +inline void cusparsecoosortByRow( // NOLINT + cusparseHandle_t handle, int m, int n, int nnz, int* cooRows, int* cooCols, + int* P, void* pBuffer, 
cudaStream_t stream) { + CUSPARSE_CHECK(cusparseSetStream(handle, stream)); + CUSPARSE_CHECK( + cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer)); +} +/** @} */ + +/** + * @defgroup Gemmi cusparse gemmi operations + * @{ + */ +inline cusparseStatus_t cusparsegemmi( + cusparseHandle_t handle, int m, int n, int k, int nnz, const float* alpha, + const float* A, int lda, const float* cscValB, const int* cscColPtrB, + const int* cscRowIndB, const float* beta, float* C, int ldc) { + return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, + cscColPtrB, cscRowIndB, beta, C, ldc); +} +inline cusparseStatus_t cusparsegemmi( + cusparseHandle_t handle, int m, int n, int k, int nnz, const double* alpha, + const double* A, int lda, const double* cscValB, const int* cscColPtrB, + const int* cscRowIndB, const double* beta, double* C, int ldc) { + return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, + cscColPtrB, cscRowIndB, beta, C, ldc); +} +/** @} */ + +}; // namespace sparse +}; // namespace raft diff --git a/cpp/scripts/include_checker.py b/cpp/scripts/include_checker.py index e8e752380e..1ced05e743 100644 --- a/cpp/scripts/include_checker.py +++ b/cpp/scripts/include_checker.py @@ -22,6 +22,7 @@ IncludeRegex = re.compile(r"\s*#include\s*(\S+)") +RemoveComments = re.compile(r"//.*") def parse_args(): @@ -52,6 +53,7 @@ def check_includes_in(src): errs = [] dir = os.path.dirname(src) for line_number, line in enumerate(open(src)): + line = RemoveComments.sub("", line) match = IncludeRegex.search(line) if match is None: continue diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py new file mode 100644 index 0000000000..23260d2f4d --- /dev/null +++ b/cpp/scripts/run-clang-tidy.py @@ -0,0 +1,259 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +import sys +import re +import os +import subprocess +import argparse +import json +import multiprocessing as mp + + +EXPECTED_VERSION = "8.0.1" +VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") +GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") +SPACES = re.compile(r"\s+") +SEPARATOR = "-" * 16 + + +def parse_args(): + argparser = argparse.ArgumentParser("Runs clang-tidy on a project") + argparser.add_argument("-cdb", type=str, default="compile_commands.json", + help="Path to cmake-generated compilation database") + argparser.add_argument("-exe", type=str, default="clang-tidy", + help="Path to clang-tidy exe") + argparser.add_argument("-ignore", type=str, default="[.]cu$", + help="Regex used to ignore files from checking") + argparser.add_argument("-select", type=str, default=None, + help="Regex used to select files for checking") + argparser.add_argument("-j", type=int, default=-1, + help="Number of parallel jobs to launch.") + args = argparser.parse_args() + if args.j <= 0: + args.j = mp.cpu_count() + args.ignore_compiled = re.compile(args.ignore) if args.ignore else None + args.select_compiled = re.compile(args.select) if args.select else None + ret = subprocess.check_output("%s --version" % args.exe, shell=True) + ret = ret.decode("utf-8") + version = VERSION_REGEX.search(ret) + if version is None: + raise Exception("Failed to figure out clang-tidy version!") + version = version.group(1) + if version != EXPECTED_VERSION: + raise Exception("clang-tidy exe must be v%s found '%s'" % \ + (EXPECTED_VERSION, version)) 
+    if not os.path.exists(args.cdb):
+        raise Exception("Compilation database '%s' missing" % args.cdb)
+    return args
+
+
+def list_all_cmds(cdb):
+    # load the cmake-generated compilation database (a JSON list of commands)
+    with open(cdb, "r") as fp:
+        return json.load(fp)
+
+
+def get_gpu_archs(command):
+    # translate every nvcc "-gencode ...sm_XX..." pair into clang's
+    # "--cuda-gpu-arch=sm_XX" flag
+    archs = []
+    for loc in range(len(command)):
+        if command[loc] != "-gencode":
+            continue
+        arch_flag = command[loc + 1]
+        match = GPU_ARCH_REGEX.search(arch_flag)
+        if match is not None:
+            archs.append("--cuda-gpu-arch=sm_%s" % match.group(1))
+    return archs
+
+
+def get_index(arr, item):
+    # return the index of `item` in `arr`, or -1 if absent
+    try:
+        return arr.index(item)
+    except ValueError:
+        # list.index raises exactly ValueError when the item is missing; a
+        # bare `except:` here would also swallow KeyboardInterrupt/SystemExit
+        return -1
+
+
+def remove_item(arr, item):
+    # delete `item` from `arr` if present; returns its former index or -1
+    loc = get_index(arr, item)
+    if loc >= 0:
+        del arr[loc]
+    return loc
+
+
+def remove_item_plus_one(arr, item):
+    # delete `item` and its following argument from `arr` if present;
+    # returns the former index of `item` or -1
+    loc = get_index(arr, item)
+    if loc >= 0:
+        del arr[loc + 1]
+        del arr[loc]
+    return loc
+
+
+def get_clang_includes(exe):
+    # locate the ClangHeaders include dir, preferring the active conda env
+    dir = os.getenv("CONDA_PREFIX")
+    if dir is None:
+        ret = subprocess.check_output("which %s 2>&1" % exe, shell=True)
+        ret = ret.decode("utf-8")
+        dir = os.path.dirname(os.path.dirname(ret))
+    header = os.path.join(dir, "include", "ClangHeaders")
+    return ["-I", header]
+
+
+def get_tidy_args(cmd, exe):
+    command, file = cmd["command"], cmd["file"]
+    is_cuda = file.endswith(".cu")
+    command = re.split(SPACES, command)
+    # compiler is always clang++!
+    command[0] = "clang++"
+    # remove compilation and output targets from the original command
+    remove_item_plus_one(command, "-c")
+    remove_item_plus_one(command, "-o")
+    if is_cuda:
+        # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..."
+ archs = get_gpu_archs(command) + command.extend(archs) + while True: + loc = remove_item_plus_one(command, "-gencode") + if loc < 0: + break + # "-x cuda" is the right usage in clang + loc = get_index(command, "-x") + if loc >= 0: + command[loc + 1] = "cuda" + remove_item_plus_one(command, "-ccbin") + remove_item(command, "--expt-extended-lambda") + remove_item(command, "--diag_suppress=unrecognized_gcc_pragma") + command.extend(get_clang_includes(exe)) + return command, is_cuda + + +def run_clang_tidy_command(tidy_cmd): + cmd = " ".join(tidy_cmd) + result = subprocess.run(cmd, check=False, shell=True, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + status = result.returncode == 0 + if status: + out = "" + else: + out = "CMD: " + cmd + out += result.stdout.decode("utf-8").rstrip() + return status, out + + +def run_clang_tidy(cmd, args): + command, is_cuda = get_tidy_args(cmd, args.exe) + tidy_cmd = [args.exe, "-header-filter=.*raft/cpp/.*", cmd["file"], "--", ] + tidy_cmd.extend(command) + status = True + out = "" + if is_cuda: + tidy_cmd.append("--cuda-device-only") + tidy_cmd.append(cmd["file"]) + ret, out1 = run_clang_tidy_command(tidy_cmd) + out += out1 + out += "%s" % SEPARATOR + if not ret: + status = ret + tidy_cmd[-2] = "--cuda-host-only" + ret, out1 = run_clang_tidy_command(tidy_cmd) + if not ret: + status = ret + out += out1 + else: + tidy_cmd.append(cmd["file"]) + ret, out1 = run_clang_tidy_command(tidy_cmd) + if not ret: + status = ret + out += out1 + return status, out, cmd["file"] + + +# yikes! 
global var :( +results = [] +def collect_result(result): + global results + results.append(result) + + +def print_result(passed, stdout, file): + status_str = "PASSED" if passed else "FAILED" + print("%s File:%s %s %s" % (SEPARATOR, file, status_str, SEPARATOR)) + if stdout: + print(stdout) + print("%s File:%s ENDS %s" % (SEPARATOR, file, SEPARATOR)) + + +def print_results(): + global results + status = True + for passed, stdout, file in results: + print_result(passed, stdout, file) + if not passed: + status = False + return status + + +# mostly used for debugging purposes +def run_sequential(args, all_files): + status = True + # actual tidy checker + for cmd in all_files: + # skip files that we don't want to look at + if args.ignore_compiled is not None and \ + re.search(args.ignore_compiled, cmd["file"]) is not None: + continue + if args.select_compiled is not None and \ + re.search(args.select_compiled, cmd["file"]) is None: + continue + passed, stdout, file = run_clang_tidy(cmd, args) + print_result(passed, stdout, file) + if not passed: + status = False + return status + + +def run_parallel(args, all_files): + pool = mp.Pool(args.j) + # actual tidy checker + for cmd in all_files: + # skip files that we don't want to look at + if args.ignore_compiled is not None and \ + re.search(args.ignore_compiled, cmd["file"]) is not None: + continue + if args.select_compiled is not None and \ + re.search(args.select_compiled, cmd["file"]) is None: + continue + pool.apply_async(run_clang_tidy, args=(cmd, args), + callback=collect_result) + pool.close() + pool.join() + return print_results() + + +def main(): + args = parse_args() + # Attempt to making sure that we run this script from root of repo always + if not os.path.exists(".git"): + raise Exception("This needs to always be run from the root of repo") + all_files = list_all_cmds(args.cdb) + if args.j == 1: + status = run_sequential(args, all_files) + else: + status = run_parallel(args, all_files) + if not status: + 
raise Exception("clang-tidy failed! Refer to the errors above.") + + +if __name__ == "__main__": + main() diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp new file mode 100644 index 0000000000..c14d880efd --- /dev/null +++ b/cpp/test/cudart_utils.cpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +namespace raft { + +TEST(Raft, Utils) { + ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!")); + ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception); + ASSERT_THROW(THROW("Should throw!"), exception); + ASSERT_NO_THROW(CUDA_CHECK(cudaFree(nullptr))); +} + +} // namespace raft diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp new file mode 100644 index 0000000000..2c5280199d --- /dev/null +++ b/cpp/test/handle.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +namespace raft { + +TEST(Raft, HandleDefault) { + handle_t h; + ASSERT_EQ(0, h.get_num_internal_streams()); + ASSERT_EQ(0, h.get_device()); + ASSERT_EQ(nullptr, h.get_stream()); + ASSERT_NE(nullptr, h.get_cublas_handle()); + ASSERT_NE(nullptr, h.get_cusolver_dn_handle()); + ASSERT_NE(nullptr, h.get_cusolver_sp_handle()); + ASSERT_NE(nullptr, h.get_cusparse_handle()); +} + +TEST(Raft, Handle) { + handle_t h(4); + ASSERT_EQ(4, h.get_num_internal_streams()); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + h.set_stream(stream); + ASSERT_EQ(stream, h.get_stream()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); +} + +} // namespace raft diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp new file mode 100644 index 0000000000..86aee43ce3 --- /dev/null +++ b/cpp/test/mr/device/buffer.cpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace raft { +namespace mr { +namespace device { + +TEST(Raft, DeviceBuffer) { + auto alloc = std::make_shared(); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + // no allocation at construction + buffer buff(alloc, stream); + ASSERT_EQ(0, buff.size()); + // explicit allocation after construction + buff.resize(20, stream); + ASSERT_EQ(20, buff.size()); + // resizing to a smaller buffer size + buff.resize(10, stream); + ASSERT_EQ(10, buff.size()); + // explicit deallocation + buff.release(stream); + ASSERT_EQ(0, buff.size()); + // use these methods without the explicit stream parameter + buff.resize(20); + ASSERT_EQ(20, buff.size()); + buff.resize(10); + ASSERT_EQ(10, buff.size()); + buff.release(); + ASSERT_EQ(0, buff.size()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); +} + +} // namespace device +} // namespace mr +} // namespace raft diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp new file mode 100644 index 0000000000..953f65ddfb --- /dev/null +++ b/cpp/test/mr/host/buffer.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +namespace raft { +namespace mr { +namespace host { + +TEST(Raft, HostBuffer) { + auto alloc = std::make_shared(); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + // no allocation at construction + buffer buff(alloc, stream); + ASSERT_EQ(0, buff.size()); + // explicit allocation after construction + buff.resize(20, stream); + ASSERT_EQ(20, buff.size()); + // resizing to a smaller buffer size + buff.resize(10, stream); + ASSERT_EQ(10, buff.size()); + // explicit deallocation + buff.release(stream); + ASSERT_EQ(0, buff.size()); + // use these methods without the explicit stream parameter + buff.resize(20); + ASSERT_EQ(20, buff.size()); + buff.resize(10); + ASSERT_EQ(10, buff.size()); + buff.release(); + ASSERT_EQ(0, buff.size()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); +} + +TEST(Raft, DeviceToHostBuffer) { + auto d_alloc = std::make_shared(); + auto h_alloc = std::make_shared(); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + device::buffer d_buff(d_alloc, stream, 32); + CUDA_CHECK( + cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream)); + buffer h_buff(h_alloc, d_buff); + ASSERT_EQ(d_buff.size(), h_buff.size()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + CUDA_CHECK(cudaStreamDestroy(stream)); +} + +} // namespace host +} // namespace mr +} // namespace raft diff --git a/cpp/test/test.cpp b/cpp/test/test.cpp index 1ca69febca..7477d7d0b5 100644 --- a/cpp/test/test.cpp +++ b/cpp/test/test.cpp @@ -14,10 +14,12 @@ * limitations under the License. */ +#include #include #include -int main() { - std::string result = raft::test_raft(); - std::cout << result; -} +namespace raft { + +TEST(Raft, print) { std::cout << test_raft() << std::endl; } + +} // namespace raft