From e93c32d99b91ef8df88079aa678dc755be0e0ef2 Mon Sep 17 00:00:00 2001
From: koparasy <parasyris1@llnl.gov>
Date: Fri, 24 May 2024 15:09:19 -0700
Subject: [PATCH 1/5] Fixes issues raised on the example code and issues of GPU
 enabled executions

---
 src/AMSlib/wf/basedb.hpp | 1 -
 1 file changed, 1 deletion(-)
diff --git a/src/AMSlib/wf/basedb.hpp b/src/AMSlib/wf/basedb.hpp
index d77af6a6..ebcc50ef 100644
--- a/src/AMSlib/wf/basedb.hpp
+++ b/src/AMSlib/wf/basedb.hpp
@@ -2377,7 +2377,6 @@ class RMQInterface
   }
 };
 
-
 /* A class that provides a BaseDB interface to AMS workflow.
  * When storing data it pushes the data to the RMQ server asynchronously
 */

From 4c1148d16a5f659e19929e2e573567650b820129 Mon Sep 17 00:00:00 2001
From: koparasy <parasyris1@llnl.gov>
Date: Mon, 3 Jun 2024 09:36:25 -0700
Subject: [PATCH 2/5] Remove umpire as an allocator and move device code to a
 single file

---
 src/AMSlib/AMS.cpp                            |   4 +-
 src/AMSlib/CMakeLists.txt                     |  40 +-
 src/AMSlib/ml/hdcache.hpp                     |   5 +-
 src/AMSlib/ml/random_uq.hpp                   |  21 +-
 src/AMSlib/ml/surrogate.hpp                   |  61 +--
 src/AMSlib/wf/basedb.hpp                      |  19 +-
 .../wf/cuda/{utilities.cuh => utilities.cpp}  | 400 +++++++++++++++---
 src/AMSlib/wf/device.hpp                      | 398 +++++++++--------
 src/AMSlib/wf/resource_manager.cpp            | 111 +++--
 src/AMSlib/wf/resource_manager.hpp            |  91 ++--
 src/AMSlib/wf/workflow.hpp                    |  12 +-
 tests/AMSlib/ams_ete.cpp                      |   8 -
 tests/AMSlib/ams_ete_env.cpp                  |  12 +-
 tests/AMSlib/ams_update_model.cpp             |  12 +-
 tests/AMSlib/cpu_packing_test.cpp             |  24 +-
 15 files changed, 783 insertions(+), 435 deletions(-)
 rename src/AMSlib/wf/cuda/{utilities.cuh => utilities.cpp} (52%)

diff --git a/src/AMSlib/AMS.cpp b/src/AMSlib/AMS.cpp
index 267d5d59..d78538d3 100644
--- a/src/AMSlib/AMS.cpp
+++ b/src/AMSlib/AMS.cpp
@@ -389,8 +389,8 @@ class AMSWrap
       } else if (log_prefix.find("<PID>") != std::string::npos) {
         pattern = std::string("<PID>");
         id = getpid();
-      }
-      // Combine hostname and pid
+      }  // Combine hostname and pid
+
       std::ostringstream combined;
       combined << "." << hostname << "." << id;
 
diff --git a/src/AMSlib/CMakeLists.txt b/src/AMSlib/CMakeLists.txt
index 8c265aae..35159b0c 100644
--- a/src/AMSlib/CMakeLists.txt
+++ b/src/AMSlib/CMakeLists.txt
@@ -8,27 +8,35 @@ file(GLOB_RECURSE MINIAPP_INCLUDES "*.hpp")
 #set global library path to link with tests if necessary
 set(LIBRARY_OUTPUT_PATH ${AMS_LIB_OUT_PATH})
 set(AMS_LIB_SRC ${MINIAPP_INCLUDES} AMS.cpp wf/resource_manager.cpp wf/debug.cpp wf/basedb.cpp wf/logger.cpp)
+
+if (WITH_CUDA)
+  list(APPEND AMS_LIB_SRC wf/cuda/utilities.cpp)
+  message(WARNING "FILES ARE ${AMS_LIB_SRC}")
+endif()
+
+
 # two targets: a shared lib and an exec
 add_library(AMS ${AMS_LIB_SRC} ${MINIAPP_INCLUDES})
 
 # ------------------------------------------------------------------------------
 if (WITH_CUDA)
-    set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
-
-    #    if (BUILD_SHARED_LIBS)
-    #      set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-    #    else()
-    #      set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-    #      set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-    #    endif()
-
-    set_source_files_properties(AMS.cpp PROPERTIES LANGUAGE CUDA)
-    set_source_files_properties(AMS.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
-    set_source_files_properties(AMS.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda")
-
-    if (WITH_PERFFLOWASPECT)
-        set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
-        set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
+
+  set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
+
+  #    if (BUILD_SHARED_LIBS)
+  #      set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+  #    else()
+  #      set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+  #      set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+  #    endif()
+
+  set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES LANGUAGE CUDA)
+  set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
+  set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda")
+
+  if (WITH_PERFFLOWASPECT)
+      set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
+      set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
     endif()
 endif()
 
diff --git a/src/AMSlib/ml/hdcache.hpp b/src/AMSlib/ml/hdcache.hpp
index 33949db0..31719979 100644
--- a/src/AMSlib/ml/hdcache.hpp
+++ b/src/AMSlib/ml/hdcache.hpp
@@ -17,6 +17,7 @@
 #include <stdexcept>
 #include <string>
 #include <type_traits>
+#include <unordered_map>
 #include <vector>
 
 #ifdef __ENABLE_FAISS__
@@ -29,6 +30,8 @@
 #include <faiss/gpu/GpuCloner.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
 #include <faiss/gpu/StandardGpuResources.h>
+
+#include "wf/device.hpp"
 #endif
 #endif
 
@@ -366,7 +369,7 @@ class HDCache
     _evaluate(ndata, data, is_acceptable);
 
     if (cache_location == AMSResourceType::AMS_DEVICE) {
-      deviceCheckErrors(__FILE__, __LINE__);
+      ams::deviceCheckErrors(__FILE__, __LINE__);
     }
 
     DBG(UQModule, "Done with evalution of uq")
diff --git a/src/AMSlib/ml/random_uq.hpp b/src/AMSlib/ml/random_uq.hpp
index c031e8ba..dc3bbb8d 100644
--- a/src/AMSlib/ml/random_uq.hpp
+++ b/src/AMSlib/ml/random_uq.hpp
@@ -11,13 +11,9 @@
 
 #include "AMS.h"
 #include "wf/debug.h"
+#include "wf/device.hpp"
 #include "wf/utils.hpp"
 
-static size_t round_up(size_t num, size_t denom)
-{
-  return (num + denom - 1) / denom;
-}
-
 class RandomUQ
 {
 public:
@@ -25,20 +21,7 @@ class RandomUQ
   inline void evaluate(const size_t ndata, bool *is_acceptable)
   {
     if (resourceLocation == AMSResourceType::AMS_DEVICE) {
-#ifdef __ENABLE_CUDA__
-      //TODO: Move all of this code on device.cpp and provide better logic regarding
-      // number of threads
-      size_t threads = 256;
-      size_t blocks = round_up(ndata, threads);
-      random_uq_device<<<blocks, threads>>>(seed,
-                                            is_acceptable,
-                                            ndata,
-                                            threshold);
-      seed = seed + 1;
-#else
-      THROW(std::runtime_error,
-            "Random-uq is not configured to use device allocations");
-#endif
+      ams::device_random_uq(seed, is_acceptable, ndata, threshold);
     } else {
       random_uq_host(is_acceptable, ndata, threshold);
     }
diff --git a/src/AMSlib/ml/surrogate.hpp b/src/AMSlib/ml/surrogate.hpp
index 4f168469..b194de94 100644
--- a/src/AMSlib/ml/surrogate.hpp
+++ b/src/AMSlib/ml/surrogate.hpp
@@ -15,7 +15,7 @@
 #include <unordered_map>
 
 #include "AMS.h"
-#include "wf/cuda/utilities.cuh"
+#include "wf/device.hpp"
 
 #ifdef __ENABLE_TORCH__
 #include <ATen/core/interned_strings.h>
@@ -100,43 +100,12 @@ class SurrogateModel
   {
     // Transpose to get continuous memory and
     // perform single memcpy.
+    auto& rm = ams::ResourceManager::getInstance();
     tensor = tensor.transpose(1, 0);
-    if (model_resource == AMSResourceType::AMS_HOST) {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        HtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    } else {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        DtoDMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    }
-  }
-
-  PERFFASPECT()
-  inline void tensorToHostArray(at::Tensor tensor,
-                                long numRows,
-                                long numCols,
-                                TypeInValue** array)
-  {
-    // Transpose to get continuous memory and
-    // perform single memcpy.
-    tensor = tensor.transpose(1, 0);
-    if (model_resource == AMSResourceType::AMS_HOST) {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        HtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
-    } else {
-      for (long j = 0; j < numCols; j++) {
-        auto tmp = tensor[j].contiguous();
-        TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
-        DtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
-      }
+    for (long j = 0; j < numCols; j++) {
+      auto tmp = tensor[j].contiguous();
+      TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
+      rm.copy(ptr, model_resource, array[j], model_resource, numRows);
     }
   }
 
@@ -216,13 +185,9 @@ class SurrogateModel
       if (model_resource == AMSResourceType::AMS_DEVICE) {
 #ifdef __ENABLE_CUDA__
         DBG(Surrogate, "Compute mean delta uq predicates on device\n");
-        constexpr int block_size = 256;
-        int grid_size = divup(nrows, block_size);
-        computeDeltaUQMeanPredicatesKernel<<<grid_size, block_size>>>(
-            outputs_stdev, predicates, nrows, ncols, threshold);
         // TODO: use combined routine when it lands.
-        cudaDeviceSynchronize();
-        CUDACHECKERROR();
+        ams::Device::computeDeltaUQMeanPredicatesDevice(
+            outputs_stdev, predicates, nrows, ncols, threshold);
 #else
         THROW(std::runtime_error,
               "Expected CUDA is enabled when model data are on DEVICE");
@@ -235,13 +200,9 @@ class SurrogateModel
       if (model_resource == AMSResourceType::AMS_DEVICE) {
 #ifdef __ENABLE_CUDA__
         DBG(Surrogate, "Compute max delta uq predicates on device\n");
-        constexpr int block_size = 256;
-        int grid_size = divup(nrows, block_size);
-        computeDeltaUQMaxPredicatesKernel<<<grid_size, block_size>>>(
-            outputs_stdev, predicates, nrows, ncols, threshold);
         // TODO: use combined routine when it lands.
-        cudaDeviceSynchronize();
-        CUDACHECKERROR();
+        ams::Device::computeDeltaUQMaxPredicatesDevice(
+            outputs_stdev, predicates, nrows, ncols, threshold);
 #else
         THROW(std::runtime_error,
               "Expected CUDA is enabled when model data are on DEVICE");
@@ -306,7 +267,7 @@ class SurrogateModel
     }
 
     if (is_device()) {
-      deviceCheckErrors(__FILE__, __LINE__);
+      ams::deviceCheckErrors(__FILE__, __LINE__);
     }
 
     DBG(Surrogate,
diff --git a/src/AMSlib/wf/basedb.hpp b/src/AMSlib/wf/basedb.hpp
index ebcc50ef..d6acc086 100644
--- a/src/AMSlib/wf/basedb.hpp
+++ b/src/AMSlib/wf/basedb.hpp
@@ -1962,14 +1962,8 @@ class RMQPublisherHandler final : public AMQP::LibEventHandler
            msg_id)
     auto& msg = *it;
     auto& rm = ams::ResourceManager::getInstance();
-    try {
-      rm.deallocate(msg.data(), AMSResourceType::AMS_HOST);
-    } catch (const umpire::util::Exception& e) {
-      FATAL(RMQPublisherHandler,
-            "Failed to deallocate #%d (%p)",
-            msg.id(),
-            msg.data());
-    }
+    rm.deallocate(msg.data(), AMSResourceType::AMS_HOST);
+
     DBG(RMQPublisherHandler, "Deallocated msg #%d (%p)", msg.id(), msg.data())
     buf.erase(it);
   }
@@ -1984,14 +1978,7 @@ class RMQPublisherHandler final : public AMQP::LibEventHandler
     auto& rm = ams::ResourceManager::getInstance();
     for (auto& dp : buffer) {
       DBG(RMQPublisherHandler, "deallocate msg #%d (%p)", dp.id(), dp.data())
-      try {
-        rm.deallocate(dp.data(), AMSResourceType::AMS_HOST);
-      } catch (const umpire::util::Exception& e) {
-        FATAL(RMQPublisherHandler,
-              "Failed to deallocate msg #%d (%p)",
-              dp.id(),
-              dp.data());
-      }
+      rm.deallocate(dp.data(), AMSResourceType::AMS_HOST);
     }
     buffer.clear();
   }
diff --git a/src/AMSlib/wf/cuda/utilities.cuh b/src/AMSlib/wf/cuda/utilities.cpp
similarity index 52%
rename from src/AMSlib/wf/cuda/utilities.cuh
rename to src/AMSlib/wf/cuda/utilities.cpp
index c3c688ab..0af826da 100644
--- a/src/AMSlib/wf/cuda/utilities.cuh
+++ b/src/AMSlib/wf/cuda/utilities.cpp
@@ -8,8 +8,8 @@
 #ifndef __DEVICE_UTILITIES__
 #define __DEVICE_UTILITIES__
 
-#ifdef __ENABLE_CUDA__
 
+#include <cuda_runtime.h>
 #include <curand.h>
 #include <curand_kernel.h>
 #include <thrust/device_vector.h>
@@ -17,11 +17,14 @@
 
 #include <iostream>
 
+#include "wf/device.hpp"
 #include "wf/resource_manager.hpp"
 
-//#include <stdio.h>
-//#include <stdlib.h>
-//
+namespace ams
+{
+namespace Device
+{
+
 const int warpSize = 32;
 const unsigned int fullMask = 0xffffffff;
 
@@ -168,14 +171,14 @@ __global__ void assignK(T** sparse,
 }
 
 template <typename T>
-__global__ void compactK(bool cond,
-                         T** d_input,
-                         T** d_output,
-                         const bool* predicates,
-                         const size_t length,
-                         int dims,
-                         int* d_BlocksOffset,
-                         bool reverse)
+__global__ void device_compactK(bool cond,
+                                T** d_input,
+                                T** d_output,
+                                const bool* predicates,
+                                const size_t length,
+                                int dims,
+                                int* d_BlocksOffset,
+                                bool reverse)
 {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   extern __shared__ int warpTotals[];
@@ -279,19 +282,20 @@ void __global__ compute_predicate(float* data,
 }
 
 template <typename T>
-int compact(bool cond,
-            const T** sparse,
-            T** dense,
-            const bool* dPredicate,
-            const size_t length,
-            int dims,
-            int blockSize,
-            bool isReverse = false)
+int device_compact(bool cond,
+                   const T** sparse,
+                   T** dense,
+                   const bool* dPredicate,
+                   const size_t length,
+                   int dims,
+                   int blockSize,
+                   bool isReverse)
 {
   int numBlocks = divup(length, blockSize);
   auto& rm = ams::ResourceManager::getInstance();
   int* d_BlocksCount = rm.allocate<int>(numBlocks, AMSResourceType::AMS_DEVICE);
-  int* d_BlocksOffset = rm.allocate<int>(numBlocks, AMSResourceType::AMS_DEVICE);
+  int* d_BlocksOffset =
+      rm.allocate<int>(numBlocks, AMSResourceType::AMS_DEVICE);
   // determine number of elements in the compacted list
   int* h_BlocksCount = rm.allocate<int>(numBlocks, AMSResourceType::AMS_HOST);
   int* h_BlocksOffset = rm.allocate<int>(numBlocks, AMSResourceType::AMS_HOST);
@@ -299,10 +303,18 @@ int compact(bool cond,
   T** d_dense = rm.allocate<T*>(dims, AMSResourceType::AMS_DEVICE);
   T** d_sparse = rm.allocate<T*>(dims, AMSResourceType::AMS_DEVICE);
 
-  rm.registerExternal(dense, sizeof(T*) * dims, AMSResourceType::AMS_HOST);
-  rm.registerExternal(sparse, sizeof(T*) * dims, AMSResourceType::AMS_HOST);
-  rm.copy(dense, d_dense);
-  rm.copy(const_cast<T**>(sparse), d_sparse);
+
+  rm.copy(dense,
+          AMSResourceType::AMS_HOST,
+          d_dense,
+          AMSResourceType::AMS_DEVICE,
+          dims);
+  rm.copy(const_cast<T**>(sparse),
+          AMSResourceType::AMS_HOST,
+          d_sparse,
+          AMSResourceType::AMS_DEVICE,
+          dims);
+
   thrust::device_ptr<int> thrustPrt_bCount(d_BlocksCount);
   thrust::device_ptr<int> thrustPrt_bOffset(d_BlocksOffset);
 
@@ -319,20 +331,30 @@ int compact(bool cond,
                          d_BlocksOffset);
 
   //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements
-  compactK<<<numBlocks, blockSize, sizeof(int) * (blockSize / warpSize)>>>(
-      cond,
-      d_sparse,
-      d_dense,
-      dPredicate,
-      length,
-      dims,
-      d_BlocksOffset,
-      isReverse);
+  device_compactK<<<numBlocks,
+                    blockSize,
+                    sizeof(int) * (blockSize / warpSize)>>>(cond,
+                                                            d_sparse,
+                                                            d_dense,
+                                                            dPredicate,
+                                                            length,
+                                                            dims,
+                                                            d_BlocksOffset,
+                                                            isReverse);
   cudaDeviceSynchronize();
   CUDACHECKERROR();
 
-  rm.copy(d_BlocksCount, h_BlocksCount);
-  rm.copy(d_BlocksOffset, h_BlocksOffset);
+  rm.copy(d_BlocksCount,
+          AMSResourceType::AMS_DEVICE,
+          h_BlocksCount,
+          AMSResourceType::AMS_HOST,
+          numBlocks);
+  rm.copy(d_BlocksOffset,
+          AMSResourceType::AMS_DEVICE,
+          h_BlocksOffset,
+          AMSResourceType::AMS_HOST,
+          numBlocks);
+
   int compact_length =
       h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1];
 
@@ -345,8 +367,6 @@ int compact(bool cond,
   rm.deallocate(d_dense, AMSResourceType::AMS_DEVICE);
   rm.deallocate(d_sparse, AMSResourceType::AMS_DEVICE);
 
-  rm.deregisterExternal(dense);
-  rm.deregisterExternal(sparse);
   cudaDeviceSynchronize();
   CUDACHECKERROR();
 
@@ -354,15 +374,15 @@ int compact(bool cond,
 }
 
 template <typename T>
-int compact(bool cond,
-            T** sparse,
-            T** dense,
-            int* indices,
-            const size_t length,
-            int dims,
-            int blockSize,
-            const bool* dPredicate,
-            bool isReverse = false)
+int device_compact(bool cond,
+                   T** sparse,
+                   T** dense,
+                   int* indices,
+                   const size_t length,
+                   int dims,
+                   int blockSize,
+                   const bool* dPredicate,
+                   bool isReverse)
 {
   int numBlocks = divup(length, blockSize);
   size_t sparseElements = length;
@@ -442,6 +462,7 @@ void cuda_rand_init(bool* predicate, const size_t length, T threshold)
   CUDACHECKERROR();
 }
 
+
 void device_compute_predicate(float* data,
                               bool* predicate,
                               size_t nData,
@@ -459,6 +480,289 @@ void device_compute_predicate(float* data,
   CUDACHECKERROR();
 }
 
-#endif
+__global__ void random_uq_device(int seed,
+                                 bool* uq_flags,
+                                 int ndata,
+                                 double acceptable_error)
+{
+
+  /* CUDA's random number library uses curandState_t to keep track of the seed
+     value we will store a random state for every thread  */
+  curandState_t state;
+  int id = threadIdx.x + blockDim.x * blockIdx.x;
+
+  if (id >= ndata) return;
+
+  /* we have to initialize the state */
+  curand_init(
+      seed +
+          id, /* the seed controls the sequence of random values that are produced */
+      0,      /* the sequence number is only important with multiple cores */
+      0, /* the offset is how much extra we advance in the sequence for each
+            call, can be 0 */
+      &state);
+
+  float x = curand_uniform(&state);
+  uq_flags[id] = (x <= acceptable_error);
+}
+
+
+template <typename scalar_t>
+__global__ void computeDeltaUQMeanPredicatesKernel(
+    const scalar_t* __restrict__ outputs_stdev,
+    bool* __restrict__ predicates,
+    const size_t nrows,
+    const size_t ncols,
+    const double threshold)
+{
+
+  size_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+  size_t stride = blockDim.x * gridDim.x;
+  // Compute mean over columns, strided loop.
+  for (size_t i = idx; i < nrows; i += stride) {
+    double mean = 0.0;
+    for (size_t j = 0; j < ncols; ++j)
+      mean += outputs_stdev[j + i * ncols];
+    mean /= ncols;
+
+    predicates[i] = (mean < threshold);
+  }
+}
+
+template <typename scalar_t>
+__global__ void computeDeltaUQMaxPredicatesKernel(
+    const scalar_t* __restrict__ outputs_stdev,
+    bool* __restrict__ predicates,
+    const size_t nrows,
+    const size_t ncols,
+    const double threshold)
+{
+
+  size_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+  size_t stride = blockDim.x * gridDim.x;
+  // Compute max delta uq over columns, strided loop.
+  for (size_t i = idx; i < nrows; i += stride) {
+    predicates[i] = true;
+    for (size_t j = 0; j < ncols; ++j)
+      if (outputs_stdev[j + i * ncols] >= threshold) {
+        predicates[i] = false;
+        break;
+      }
+  }
+}
+
+
+template <typename TypeValue>
+void rand_init(bool* predicate, const size_t n, TypeValue threshold)
+{
+  cuda_rand_init(predicate, n, threshold);
+  return;
+}
+
+template <typename scalar_t>
+void computeDeltaUQMeanPredicatesDevice(
+    const scalar_t* __restrict__ outputs_stdev,
+    bool* __restrict__ predicates,
+    const size_t nrows,
+    const size_t ncols,
+    const double threshold)
+{
+  constexpr int block_size = 256;
+  int grid_size = divup(nrows, block_size);
+  computeDeltaUQMeanPredicatesKernel<<<grid_size, block_size>>>(
+      outputs_stdev, predicates, nrows, ncols, threshold);
+  cudaDeviceSynchronize();
+  CUDACHECKERROR();
+};
+
+template <typename scalar_t>
+void computeDeltaUQMaxPredicatesDevice(
+    const scalar_t* __restrict__ outputs_stdev,
+    bool* __restrict__ predicates,
+    const size_t nrows,
+    const size_t ncols,
+    const double threshold)
+{
+  constexpr int block_size = 256;
+  int grid_size = divup(nrows, block_size);
+  computeDeltaUQMaxPredicatesKernel<<<grid_size, block_size>>>(
+      outputs_stdev, predicates, nrows, ncols, threshold);
+  cudaDeviceSynchronize();
+  CUDACHECKERROR();
+}
+
+
+// Specializations
+
+template void computeDeltaUQMaxPredicatesDevice<float>(
+    const float* __restrict__ outputs_stdev,
+    bool* __restrict__ predicates,
+    const size_t nrows,
+    const size_t ncols,
+    const double threshold);
+
+template void computeDeltaUQMaxPredicatesDevice<double>(
+    const double* __restrict__ outputs_stdev,
+    bool* __restrict__ predicates,
+    const size_t nrows,
+    const size_t ncols,
+    const double threshold);
+
+template void computeDeltaUQMeanPredicatesDevice<float>(
+    const float* __restrict__ outputs_stdev,
+    bool* __restrict__ predicates,
+    const size_t nrows,
+    const size_t ncols,
+    const double threshold);
+
+template void computeDeltaUQMeanPredicatesDevice<double>(
+    const double* __restrict__ outputs_stdev,
+    bool* __restrict__ predicates,
+    const size_t nrows,
+    const size_t ncols,
+    const double threshold);
+
+template void cuda_rand_init<float>(bool* predicate,
+                                    const size_t length,
+                                    float threshold);
+
+template void cuda_rand_init<double>(bool* predicate,
+                                     const size_t length,
+                                     double threshold);
+
+
+template void device_linearize<float, float>(float* output,
+                                             const float* const* inputs,
+                                             size_t dims,
+                                             size_t elements);
+
+template void device_linearize<float, double>(double* output,
+                                              const float* const* inputs,
+                                              size_t dims,
+                                              size_t elements);
+
+template void device_linearize<double, double>(double* output,
+                                               const double* const* inputs,
+                                               size_t dims,
+                                               size_t elements);
+
+template void device_linearize<double, float>(float* output,
+                                              const double* const* inputs,
+                                              size_t dims,
+                                              size_t elements);
+
+template int device_compact<double>(bool cond,
+                                    const double** sparse,
+                                    double** dense,
+                                    const bool* dPredicate,
+                                    const size_t length,
+                                    int dims,
+                                    int blockSize,
+                                    bool isReverse);
+
+template int device_compact<float>(bool cond,
+                                   const float** sparse,
+                                   float** dense,
+                                   const bool* dPredicate,
+                                   const size_t length,
+                                   int dims,
+                                   int blockSize,
+                                   bool isReverse);
+
+template int device_compact<double>(bool cond,
+                                    double** sparse,
+                                    double** dense,
+                                    int* indices,
+                                    const size_t length,
+                                    int dims,
+                                    int blockSize,
+                                    const bool* dPredicate,
+                                    bool isReverse);
+
+template int device_compact<float>(bool cond,
+                                   float** sparse,
+                                   float** dense,
+                                   int* indices,
+                                   const size_t length,
+                                   int dims,
+                                   int blockSize,
+                                   const bool* dPredicate,
+                                   bool isReverse);
+
+
+template void rand_init<double>(bool* predicate,
+                                const size_t n,
+                                double threshold);
+
+template void rand_init<float>(bool* predicate,
+                               const size_t n,
+                               float threshold);
+
+}  // namespace Device
+
+
+void DtoDMemcpy(void* dest, void* src, size_t nBytes)
+{
+  cudaMemcpy(dest, src, nBytes, cudaMemcpyDeviceToDevice);
+}
+
+void HtoHMemcpy(void* dest, void* src, size_t nBytes)
+{
+  std::memcpy(dest, src, nBytes);
+}
+
+void HtoDMemcpy(void* dest, void* src, size_t nBytes)
+{
+  cudaMemcpy(dest, src, nBytes, cudaMemcpyHostToDevice);
+};
+
+void DtoHMemcpy(void* dest, void* src, size_t nBytes)
+{
+  cudaMemcpy(dest, src, nBytes, cudaMemcpyDeviceToHost);
+}
+
+void* DeviceAllocate(size_t nBytes)
+{
+  void* devPtr;
+  cudaMalloc(&devPtr, nBytes);
+  return devPtr;
+}
+
+void DeviceFree(void* ptr)
+{
+  cudaFree(ptr);
+  return;
+}
+
+void* DevicePinnedAlloc(size_t nBytes)
+{
+  void* ptr;
+  cudaHostAlloc(&ptr, nBytes, cudaHostAllocPortable);
+  return ptr;
+}
+
+void DeviceFreePinned(void* ptr) { cudaFreeHost(ptr); }
+
+
+void deviceCheckErrors(const char* file, int line)
+{
+  ams::Device::__cudaCheckError(file, line);
+}
+
+void device_random_uq(int seed,
+                      bool* uq_flags,
+                      int ndata,
+                      double acceptable_error)
+{
+  size_t block_size = 256;
+  size_t blocks = ams::Device::divup(ndata, block_size);
+  ams::Device::random_uq_device<<<blocks, block_size>>>(seed,
+                                                        uq_flags,
+                                                        ndata,
+                                                        acceptable_error);
+}
+
+
+}  // namespace ams
 
 #endif
diff --git a/src/AMSlib/wf/device.hpp b/src/AMSlib/wf/device.hpp
index e990f439..a7dd7bdf 100644
--- a/src/AMSlib/wf/device.hpp
+++ b/src/AMSlib/wf/device.hpp
@@ -13,183 +13,185 @@
 #include <iostream>
 #include <stdexcept>
 
+#include "AMS.h"
 #include "wf/debug.h"
 
 #ifdef __ENABLE_CUDA__
-#include "cuda/utilities.cuh"
-#endif
-
 namespace ams
 {
+void DtoDMemcpy(void *dest, void *src, size_t nBytes);
+
+void HtoHMemcpy(void *dest, void *src, size_t nBytes);
+
+void HtoDMemcpy(void *dest, void *src, size_t nBytes);
+
+void DtoHMemcpy(void *dest, void *src, size_t nBytes);
+
+void *DeviceAllocate(size_t nBytes);
+
+void DeviceFree(void *ptr);
+
+void *DevicePinnedAlloc(size_t nBytes);
+
+void DeviceFreePinned(void *ptr);
+
+void deviceCheckErrors(const char *file, int line);
+
+void device_random_uq(int seed,
+                      bool *uq_flags,
+                      int ndata,
+                      double acceptable_error);
+
 namespace Device
 {
+
+template <typename scalar_t>
+void computeDeltaUQMeanPredicatesDevice(
+    const scalar_t *__restrict__ outputs_stdev,
+    bool *__restrict__ predicates,
+    const size_t nrows,
+    const size_t ncols,
+    const double threshold);
+
+
+template <typename scalar_t>
+void computeDeltaUQMaxPredicatesDevice(
+    const scalar_t *__restrict__ outputs_stdev,
+    bool *__restrict__ predicates,
+    const size_t nrows,
+    const size_t ncols,
+    const double threshold);
+
+void device_compute_predicate(float *data,
+                              bool *predicate,
+                              size_t nData,
+                              const size_t kneigh,
+                              float threshold);
+
+template <typename TypeValue>
 PERFFASPECT()
-void computePredicate(float *data,
-                      bool *predicate,
-                      size_t nData,
-                      const size_t kneigh,
-                      float threshold)
-{
-#ifdef __ENABLE_CUDA__
-  return device_compute_predicate(data, predicate, nData, kneigh, threshold);
-#else
-  return;
-#endif
-}
+void rand_init(bool *predicate, const size_t n, TypeValue threshold);
+
+template <typename TypeInValue, typename TypeOutValue>
+void device_linearize(TypeOutValue *output,
+                      const TypeInValue *const *inputs,
+                      size_t dims,
+                      size_t elements);
+
+template <typename T>
+int device_compact(bool cond,
+                   const T **sparse,
+                   T **dense,
+                   const bool *dPredicate,
+                   const size_t length,
+                   int dims,
+                   int blockSize,
+                   bool isReverse = false);
+
+template <typename T>
+int device_compact(bool cond,
+                   T **sparse,
+                   T **dense,
+                   int *indices,
+                   const size_t length,
+                   int dims,
+                   int blockSize,
+                   const bool *dPredicate,
+                   bool isReverse = false);
+
 
 PERFFASPECT()
-void computePredicateDeltaUQ()
+inline void computePredicate(float *data,
+                             bool *predicate,
+                             size_t nData,
+                             const size_t kneigh,
+                             float threshold)
 {
-  THROW(std::runtime_error,
-        "Computing DeltaUQ predications on device is not supported yet");
+  return device_compute_predicate(data, predicate, nData, kneigh, threshold);
 }
 
+
 template <typename TypeInValue, typename TypeOutValue>
 PERFFASPECT()
-void linearize(TypeOutValue *output,
-               const TypeInValue *const *inputs,
-               size_t dims,
-               size_t elements)
+inline void linearize(TypeOutValue *output,
+                      const TypeInValue *const *inputs,
+                      size_t dims,
+                      size_t elements)
 {
-#ifdef __ENABLE_CUDA__
   return device_linearize(output, inputs, dims, elements);
-#else
-  return;
-#endif
 }
 
 template <typename TypeValue>
 PERFFASPECT()
-int pack(bool cond,
-         const bool *predicate,
-         const size_t n,
-         const TypeValue **sparse,
-         TypeValue **dense,
-         int dims)
+inline int pack(bool cond,
+                const bool *predicate,
+                const size_t n,
+                const TypeValue **sparse,
+                TypeValue **dense,
+                int dims)
 {
-#ifdef __ENABLE_CUDA__
-  return compact(cond, sparse, dense, predicate, n, dims, 1024);
-#else
-  return 0;
-#endif
+  return device_compact(cond, sparse, dense, predicate, n, dims, 1024);
 }
 
 template <typename TypeValue>
 PERFFASPECT()
-int pack(bool cond,
-         const bool *predicate,
-         const size_t n,
-         TypeValue **sparse,
-         TypeValue **dense,
-         int *sparse_indices,
-         int dims)
+inline int pack(bool cond,
+                const bool *predicate,
+                const size_t n,
+                TypeValue **sparse,
+                TypeValue **dense,
+                int *sparse_indices,
+                int dims)
 {
-#ifdef __ENABLE_CUDA__
-  return compact(cond, sparse, dense, sparse_indices, n, dims, 1024, predicate);
-#else
-  return 0;
-#endif
+  return device_compact(
+      cond, sparse, dense, sparse_indices, n, dims, 1024, predicate);
 }
 
 template <typename TypeValue>
 PERFFASPECT()
-int unpack(bool cond,
-           const bool *predicate,
-           const size_t n,
-           TypeValue **sparse,
-           TypeValue **dense,
-           int dims)
+inline int unpack(bool cond,
+                  const bool *predicate,
+                  const size_t n,
+                  TypeValue **sparse,
+                  TypeValue **dense,
+                  int dims)
 {
-#ifdef __ENABLE_CUDA__
-  return compact(cond,
-                 const_cast<const TypeValue **>(sparse),
-                 dense,
-                 predicate,
-                 n,
-                 dims,
-                 1024,
-                 true);
-#else
-  return 0;
-#endif
+  return device_compact(cond,
+                        const_cast<const TypeValue **>(sparse),
+                        dense,
+                        predicate,
+                        n,
+                        dims,
+                        1024,
+                        true);
 }
 
 template <typename TypeValue>
 PERFFASPECT()
-int unpack(bool cond,
-           const size_t n,
-           TypeValue **sparse,
-           TypeValue **dense,
-           int *sparse_indices,
-           int dims)
+inline int unpack(bool cond,
+                  const size_t n,
+                  TypeValue **sparse,
+                  TypeValue **dense,
+                  int *sparse_indices,
+                  int dims)
 {
-#ifdef __ENABLE_CUDA__
-  return compact(
+  return device_compact(
       cond, sparse, dense, sparse_indices, n, dims, 1024, NULL, true);
-#else
-  return 0;
-#endif
-}
-
-template <typename TypeValue>
-PERFFASPECT()
-void rand_init(bool *predicate, const size_t n, TypeValue threshold)
-{
-#ifdef __ENABLE_CUDA__
-  cuda_rand_init(predicate, n, threshold);
-#endif
-  return;
 }
 
 }  // namespace Device
 }  // namespace ams
 
-void deviceCheckErrors(const char *file, const int line)
-{
-#ifdef __ENABLE_CUDA__
-  __cudaCheckError(file, line);
-#endif
-  return;
-}
-
-
-#ifdef __ENABLE_CUDA__
+#else
 
-#include <curand.h>
-#include <curand_kernel.h>
-PERFFASPECT()
-__global__ void random_uq_device(int seed,
-                                 bool *uq_flags,
-                                 int ndata,
-                                 double acceptable_error)
+namespace ams
 {
 
-  /* CUDA's random number library uses curandState_t to keep track of the seed
-     value we will store a random state for every thread  */
-  curandState_t state;
-  int id = threadIdx.x + blockDim.x * blockIdx.x;
-
-  if (id >= ndata) return;
 
-  /* we have to initialize the state */
-  curand_init(
-      seed +
-          id, /* the seed controls the sequence of random values that are produced */
-      0,      /* the sequence number is only important with multiple cores */
-      0, /* the offset is how much extra we advance in the sequence for each
-            call, can be 0 */
-      &state);
-
-  float x = curand_uniform(&state);
-  uq_flags[id] = (x <= acceptable_error);
-}
-
-
-#include <cuda_runtime.h>
 PERFFASPECT()
 inline void DtoDMemcpy(void *dest, void *src, size_t nBytes)
 {
-  cudaMemcpy(dest, src, nBytes, cudaMemcpyDeviceToDevice);
+  FATAL(Device, "DtoD Memcpy Not Enabled");
 }
 
 PERFFASPECT()
@@ -201,86 +203,124 @@ inline void HtoHMemcpy(void *dest, void *src, size_t nBytes)
 PERFFASPECT()
 inline void HtoDMemcpy(void *dest, void *src, size_t nBytes)
 {
-  cudaMemcpy(dest, src, nBytes, cudaMemcpyHostToDevice);
-};
+  FATAL(Device, "HtoD Memcpy Not Enabled");
+}
 
 PERFFASPECT()
 inline void DtoHMemcpy(void *dest, void *src, size_t nBytes)
 {
-  cudaMemcpy(dest, src, nBytes, cudaMemcpyDeviceToHost);
+  FATAL(Device, "DtoH Memcpy Not Enabled");
 }
 
-template <typename scalar_t>
-__global__ void computeDeltaUQMeanPredicatesKernel(
-    const scalar_t *__restrict__ outputs_stdev,
-    bool *__restrict__ predicates,
-    const size_t nrows,
-    const size_t ncols,
-    const double threshold)
+
+inline void *DeviceAllocate(size_t nBytes)
 {
+  FATAL(Device, "DtoH Memcpy Not Enabled");
+}
 
-  size_t idx = blockDim.x * blockIdx.x + threadIdx.x;
-  size_t stride = blockDim.x * gridDim.x;
-  // Compute mean over columns, strided loop.
-  for (size_t i = idx; i < nrows; i += stride) {
-    double mean = 0.0;
-    for (size_t j = 0; j < ncols; ++j)
-      mean += outputs_stdev[j + i * ncols];
-    mean /= ncols;
-
-    predicates[i] = (mean < threshold);
-  }
+
+PERFFASPECT()
+inline void DeviceFree(void *ptr) { FATAL(Device, "DtoH Memcpy Not Enabled"); }
+
+PERFFASPECT()
+inline void *DevicePinnedAlloc(size_t nBytes)
+{
+  FATAL(Device, "Pinned Alloc Not Enabled");
 }
 
-template <typename scalar_t>
-__global__ void computeDeltaUQMaxPredicatesKernel(
-    const scalar_t *__restrict__ outputs_stdev,
-    bool *__restrict__ predicates,
-    const size_t nrows,
-    const size_t ncols,
-    const double threshold)
+PERFFASPECT()
+inline void DeviceFreePinned(void *ptr)
+{
+  FATAL(Device, "Pinned Free Pinned Not Enabled");
+}
+
+inline void device_random_uq(int seed,
+                             bool *uq_flags,
+                             int ndata,
+                             double acceptable_error)
 {
+  FATAL(Device, "Called Device Runtime UQ without enabling Device compilation");
+}
+
+
+inline void deviceCheckErrors(const char *file, int line) { return; }
 
-  size_t idx = blockDim.x * blockIdx.x + threadIdx.x;
-  size_t stride = blockDim.x * gridDim.x;
-  // Compute max delta uq over columns, strided loop.
-  for (size_t i = idx; i < nrows; i += stride) {
-    predicates[i] = true;
-    for (size_t j = 0; j < ncols; ++j)
-      if (outputs_stdev[j + i * ncols] >= threshold) {
-        predicates[i] = false;
-        break;
-      }
-  }
+namespace Device
+{
+PERFFASPECT()
+inline void computePredicate(float *data,
+                             bool *predicate,
+                             size_t nData,
+                             const size_t kneigh,
+                             float threshold)
+{
+  return;
 }
 
-#else
+
+template <typename TypeInValue, typename TypeOutValue>
 PERFFASPECT()
-inline void DtoDMemcpy(void *dest, void *src, size_t nBytes)
+inline void linearize(TypeOutValue *output,
+                      const TypeInValue *const *inputs,
+                      size_t dims,
+                      size_t elements)
 {
-  std::cerr << "DtoD Memcpy Not Enabled" << std::endl;
-  exit(-1);
+  return;
 }
 
+template <typename TypeValue>
 PERFFASPECT()
-inline void HtoHMemcpy(void *dest, void *src, size_t nBytes)
+inline int pack(bool cond,
+                const bool *predicate,
+                const size_t n,
+                const TypeValue **sparse,
+                TypeValue **dense,
+                int dims)
 {
-  std::memcpy(dest, src, nBytes);
+  return -1;
 }
 
+template <typename TypeValue>
 PERFFASPECT()
-inline void HtoDMemcpy(void *dest, void *src, size_t nBytes)
+inline int pack(bool cond,
+                const bool *predicate,
+                const size_t n,
+                TypeValue **sparse,
+                TypeValue **dense,
+                int *sparse_indices,
+                int dims)
 {
-  std::cerr << "HtoD Memcpy Not Enabled" << std::endl;
-  exit(-1);
-};
+  return -1;
+}
 
+template <typename TypeValue>
 PERFFASPECT()
-inline void DtoHMemcpy(void *dest, void *src, size_t nBytes)
+inline int unpack(bool cond,
+                  const bool *predicate,
+                  const size_t n,
+                  TypeValue **sparse,
+                  TypeValue **dense,
+                  int dims)
+{
+  return -1;
+}
+
+template <typename TypeValue>
+PERFFASPECT()
+inline int unpack(bool cond,
+                  const size_t n,
+                  TypeValue **sparse,
+                  TypeValue **dense,
+                  int *sparse_indices,
+                  int dims)
 {
-  std::cerr << "DtoH Memcpy Not Enabled" << std::endl;
-  exit(-1);
+  return -1;
 }
+
+}  // namespace Device
+}  // namespace ams
+
 #endif
 
+
 #endif
diff --git a/src/AMSlib/wf/resource_manager.cpp b/src/AMSlib/wf/resource_manager.cpp
index d941c745..8c08d287 100644
--- a/src/AMSlib/wf/resource_manager.cpp
+++ b/src/AMSlib/wf/resource_manager.cpp
@@ -5,48 +5,107 @@
  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  */
 
-#include "resource_manager.hpp"
-
+#include <cstdlib>
+#include <cstring>
 #include <umpire/ResourceManager.hpp>
 #include <umpire/Umpire.hpp>
 #include <umpire/strategy/QuickPool.hpp>
 
 #include "debug.h"
+#include "device.hpp"
+#include "resource_manager.hpp"
 
 namespace ams
 {
 
-std::string AMSAllocator::getName() { return allocator.getName(); }
+std::string AMSAllocator::getName() { return name; }
 
-void *AMSAllocator::allocate(size_t num_bytes)
-{
-  void *ptr = allocator.allocate(num_bytes);
-  CFATAL(ResourceManager,
-         ptr == nullptr,
-         "Failed to allocated %ld values using allocator %s",
-         num_bytes,
-         getName().c_str());
-  return ptr;
-}
 
-void AMSAllocator::deallocate(void *ptr) { allocator.deallocate(ptr); }
+struct AMSDefaultDeviceAllocator final : AMSAllocator {
+  AMSDefaultDeviceAllocator(std::string name) : AMSAllocator(name){};
+  ~AMSDefaultDeviceAllocator() = default;
+
+  void *allocate(size_t num_bytes) { return DeviceAllocate(num_bytes); }
+
+  void deallocate(void *ptr) { return DeviceFree(ptr); }
+};
+
+struct AMSDefaultHostAllocator final : AMSAllocator {
+  AMSDefaultHostAllocator(std::string name) : AMSAllocator(name) {}
+  ~AMSDefaultHostAllocator() = default;
 
-void AMSAllocator::registerPtr(void *ptr, size_t nBytes)
+  void *allocate(size_t num_bytes) { return aligned_alloc(8, num_bytes); }
+
+  void deallocate(void *ptr) { free(ptr); }
+};
+
+struct AMSDefaultPinnedAllocator final : AMSAllocator {
+  AMSDefaultPinnedAllocator(std::string name) : AMSAllocator(name) {}
+  ~AMSDefaultPinnedAllocator() = default;
+
+  void *allocate(size_t num_bytes) { return DevicePinnedAlloc(num_bytes); }
+
+  void deallocate(void *ptr) { DeviceFreePinned(ptr); }
+};
+
+
+namespace internal
 {
-  auto &rm = umpire::ResourceManager::getInstance();
-  rm.registerAllocation(ptr,
-                        umpire::util::AllocationRecord(
-                            ptr, nBytes, allocator.getAllocationStrategy()));
+void _raw_copy(void *src,
+               AMSResourceType src_dev,
+               void *dest,
+               AMSResourceType dest_dev,
+               size_t num_bytes)
+{
+  if (src_dev == AMSResourceType::AMS_HOST) {
+    if (dest_dev == AMSResourceType::AMS_HOST) {
+      std::memcpy(dest, src, num_bytes);
+    } else if (dest_dev == AMSResourceType::AMS_DEVICE) {
+      HtoDMemcpy(dest, src, num_bytes);
+    } else if (dest_dev == AMSResourceType::AMS_PINNED) {
+      std::memcpy(dest, src, num_bytes);
+    } else {
+      FATAL(AMSResource, "Unknown copy dest")
+    }
+  } else if (src_dev == AMSResourceType::AMS_DEVICE) {
+    if (dest_dev == AMSResourceType::AMS_HOST) {
+      DtoHMemcpy(dest, src, num_bytes);
+    } else if (dest_dev == AMSResourceType::AMS_DEVICE) {
+      DtoDMemcpy(dest, src, num_bytes);
+    } else if (dest_dev == AMSResourceType::AMS_PINNED) {
+      DtoHMemcpy(dest, src, num_bytes);
+    } else {
+      FATAL(AMSResource, "Unknown copy dest")
+    }
+  } else if (src_dev == AMSResourceType::AMS_PINNED) {
+    if (dest_dev == AMSResourceType::AMS_HOST) {
+      std::memcpy(dest, src, num_bytes);
+    } else if (dest_dev == AMSResourceType::AMS_DEVICE) {
+      HtoDMemcpy(dest, src, num_bytes);
+    } else if (dest_dev == AMSResourceType::AMS_PINNED) {
+      std::memcpy(dest, src, num_bytes);
+    } else {
+      FATAL(AMSResource, "Unknown copy dest")
+    }
+  }
 }
 
-void AMSAllocator::getAllocatorStats(size_t &wm, size_t &cs, size_t &as)
+AMSAllocator *_get_allocator(std::string alloc_name, AMSResourceType resource)
 {
-  wm = allocator.getHighWatermark();
-  cs = allocator.getCurrentSize();
-  as = allocator.getActualSize();
+  if (resource == AMSResourceType::AMS_DEVICE) {
+    return new AMSDefaultDeviceAllocator(alloc_name);
+  } else if (resource == AMSResourceType::AMS_HOST) {
+    return new AMSDefaultHostAllocator(alloc_name);
+  } else if (resource == AMSResourceType::AMS_PINNED) {
+    return new AMSDefaultPinnedAllocator(alloc_name);
+  } else {
+    FATAL(ResourceManager,
+          "Requested allocator %s for Unknown resource type",
+          alloc_name.c_str());
+  }
 }
 
-// -----------------------------------------------------------------------------
-// set up the resource manager
-// -----------------------------------------------------------------------------
+void _release_allocator(AMSAllocator *allocator) { delete allocator; }
+
+}  // namespace internal
 }  // namespace ams
diff --git a/src/AMSlib/wf/resource_manager.hpp b/src/AMSlib/wf/resource_manager.hpp
index 407bdff6..88d44447 100644
--- a/src/AMSlib/wf/resource_manager.hpp
+++ b/src/AMSlib/wf/resource_manager.hpp
@@ -9,9 +9,7 @@
 #define __AMS_ALLOCATOR__
 
 #include <cstddef>
-#include <umpire/Allocator.hpp>
-#include <umpire/ResourceManager.hpp>
-#include <umpire/Umpire.hpp>
+#include <vector>
 
 #include "AMS.h"
 #include "wf/debug.h"
@@ -19,44 +17,39 @@
 
 namespace ams
 {
+// Forward decl.
+struct AMSAllocator;
+namespace internal
+{
+void _raw_copy(void* src,
+               AMSResourceType src_dev,
+               void* dest,
+               AMSResourceType dest_dev,
+               size_t num_bytes);
+
+AMSAllocator* _get_allocator(std::string alloc_name, AMSResourceType resource);
+void _release_allocator(AMSAllocator* allocator);
+
+}  // namespace internal
 /**
  * @brief A "utility" class that provides
  * a unified interface to the umpire library for memory allocations
  * and data movements/copies.
  */
 
-struct AMSAllocator {
-  int id;
-  umpire::Allocator allocator;
-
-  AMSAllocator(std::string alloc_name)
-  {
-    auto& rm = umpire::ResourceManager::getInstance();
-    allocator = rm.getAllocator(alloc_name);
-    DBG(AMSAllocator,
-        "in AMSAllocator(%d, %s, %p)",
-        id,
-        alloc_name.c_str(),
-        this)
-  }
 
-  ~AMSAllocator() { DBG(AMSAllocator, "in ~AMSAllocator(%d, %p)", id, this) }
+struct AMSAllocator {
+  std::string name;
+  AMSAllocator(std::string alloc_name) : name(alloc_name) {}
+  virtual ~AMSAllocator() = default;
 
-  void* allocate(size_t num_bytes);
-  void deallocate(void* ptr);
+  virtual void* allocate(size_t num_bytes) = 0;
+  virtual void deallocate(void* ptr) = 0;
 
   std::string getName();
-
-  void registerPtr(void* ptr, size_t nBytes);
-  static void deregisterPtr(void* ptr)
-  {
-    auto& rm = umpire::ResourceManager::getInstance();
-    rm.deregisterAllocation(ptr);
-  }
-
-  void getAllocatorStats(size_t& wm, size_t& cs, size_t& as);
 };
 
+
 class ResourceManager
 {
 private:
@@ -116,37 +109,26 @@ class ResourceManager
     RMAllocators[dev]->deallocate(data);
   }
 
-  /** @brief registers an external pointer in the umpire allocation records.
-   *  @param[in] ptr pointer to memory to register.
-   *  @param[in] nBytes number of bytes to register.
-   *  @param[in] dev resource to register the memory to.
-   *  @return void.
-   */
-  PERFFASPECT()
-  void registerExternal(void* ptr, size_t nBytes, AMSResourceType dev)
-  {
-    RMAllocators[dev]->registerPtr(ptr, nBytes);
-  }
-
-  /** @brief removes a registered external pointer from the umpire allocation records.
-   *  @param[in] ptr pointer to memory to de-register.
-   *  @return void.
-   */
-  void deregisterExternal(void* ptr) { AMSAllocator::deregisterPtr(ptr); }
-
   /** @brief copy values from src to destination regardless of their memory location.
    *  @tparam TypeInValue type of pointers
    *  @param[in] src Source memory pointer.
    *  @param[out] dest destination memory pointer.
-   *  @param[in] size number of bytes to copy. (When 0 copies entire allocated area)
+   *  @param[in] size number of values to copy.
    *  @return void.
    */
   template <typename TypeInValue>
   PERFFASPECT()
-  void copy(TypeInValue* src, TypeInValue* dest, size_t size = 0)
+  void copy(TypeInValue* src,
+            AMSResourceType src_dev,
+            TypeInValue* dest,
+            AMSResourceType dest_dev,
+            size_t nvalues)
   {
-    static auto& rm = umpire::ResourceManager::getInstance();
-    rm.copy(dest, src, size);
+    ams::internal::_raw_copy(static_cast<void*>(src),
+                             src_dev,
+                             static_cast<void*>(dest),
+                             dest_dev,
+                             nvalues * sizeof(TypeInValue));
   }
 
   /** @brief Utility function that deallocates all C-Vectors inside the vector.
@@ -163,7 +145,7 @@ class ResourceManager
 
   void init()
   {
-    DBG(ResourceManager, "Default initialization of allocators");
+    DBG(ResourceManager, "Initialization of allocators");
     if (!RMAllocators[AMSResourceType::AMS_HOST])
       setAllocator("HOST", AMSResourceType::AMS_HOST);
 #ifdef __ENABLE_CUDA__
@@ -181,7 +163,8 @@ class ResourceManager
       delete RMAllocators[resource];
     }
 
-    RMAllocators[resource] = new AMSAllocator(alloc_name);
+    RMAllocators[resource] =
+        ams::internal::_get_allocator(alloc_name, resource);
     DBG(ResourceManager,
         "Set Allocator [%d] to pool with name : %s",
         resource,
@@ -205,13 +188,13 @@ class ResourceManager
                          size_t& cs,
                          size_t& as)
   {
-    RMAllocators[resource]->getAllocatorStats(wm, cs, as);
     return;
   }
 
   //! ------------------------------------------------------------------------
 };
 
+
 }  // namespace ams
 
 #endif
diff --git a/src/AMSlib/wf/workflow.hpp b/src/AMSlib/wf/workflow.hpp
index c164e7d5..9807461c 100644
--- a/src/AMSlib/wf/workflow.hpp
+++ b/src/AMSlib/wf/workflow.hpp
@@ -140,12 +140,20 @@ class AMSWorkflow
       size_t actualElems = std::min(elPerDim, num_elements - i);
       // Copy input data to host
       for (int k = 0; k < numIn; k++) {
-        rm.copy(&inputs[k][i], hInputs[k], actualElems * sizeof(FPTypeValue));
+        rm.copy(&inputs[k][i],
+                AMSResourceType::AMS_DEVICE,
+                hInputs[k],
+                AMSResourceType::AMS_HOST,
+                actualElems);
       }
 
       // Copy output data to host
       for (int k = 0; k < numIn; k++) {
-        rm.copy(&outputs[k][i], hOutputs[k], actualElems * sizeof(FPTypeValue));
+        rm.copy(&outputs[k][i],
+                AMSResourceType::AMS_DEVICE,
+                hOutputs[k],
+                AMSResourceType::AMS_HOST,
+                actualElems);
       }
 
       // Store to database
diff --git a/tests/AMSlib/ams_ete.cpp b/tests/AMSlib/ams_ete.cpp
index bdf68bfa..729f3752 100644
--- a/tests/AMSlib/ams_ete.cpp
+++ b/tests/AMSlib/ams_ete.cpp
@@ -130,13 +130,6 @@ void callBackSingle(void *cls, long elements, void **inputs, void **outputs)
 
 int main(int argc, char **argv)
 {
-  // Number of ranks in this run
-  int wS = 1;
-  // My Local Id
-  int rId = 0;
-  // Level of Threading provided by MPI
-  int provided = 0;
-
   if (argc != 12) {
     std::cout << "Wrong cli\n";
     std::cout << argv[0]
@@ -162,7 +155,6 @@ int main(int argc, char **argv)
   std::string db_type_str = std::string(argv[10]);
   std::string fs_path = std::string(argv[11]);
   AMSDBType db_type = ams::db::getDBType(db_type_str);
-  AMSDBType dbType = AMSDBType::AMS_NONE;
   AMSResourceType resource = AMSResourceType::AMS_HOST;
   srand(time(NULL));
 
diff --git a/tests/AMSlib/ams_ete_env.cpp b/tests/AMSlib/ams_ete_env.cpp
index 291523e4..71560bb2 100644
--- a/tests/AMSlib/ams_ete_env.cpp
+++ b/tests/AMSlib/ams_ete_env.cpp
@@ -99,12 +99,14 @@ struct Problem {
                  outputs.size());
 
       for (int i = 0; i < num_outputs; i++) {
-        delete outputs[i];
+        delete[] outputs[i];
+        outputs[i] = nullptr;
       }
 
 
       for (int i = 0; i < num_inputs; i++) {
-        delete inputs[i];
+        delete[] inputs[i];
+        inputs[i] = nullptr;
       }
     }
   }
@@ -130,12 +132,6 @@ void callBackSingle(void *cls, long elements, void **inputs, void **outputs)
 
 int main(int argc, char **argv)
 {
-  // Number of ranks in this run
-  int wS = 1;
-  // My Local Id
-  int rId = 0;
-  // Level of Threading provided by MPI
-  int provided = 0;
 
   if (argc != 9) {
     std::cout << "Wrong cli\n";
diff --git a/tests/AMSlib/ams_update_model.cpp b/tests/AMSlib/ams_update_model.cpp
index 6906081e..a8bd5225 100644
--- a/tests/AMSlib/ams_update_model.cpp
+++ b/tests/AMSlib/ams_update_model.cpp
@@ -44,8 +44,16 @@ bool inference(SurrogateModel<T> &model,
     if (resource == AMSResourceType::AMS_DEVICE) {
       first_model_out = ams_rm.allocate<T>(SIZE, AMSResourceType::AMS_HOST);
       second_model_out = ams_rm.allocate<T>(SIZE, AMSResourceType::AMS_HOST);
-      ams_rm.copy(outputs[i], first_model_out, SIZE * sizeof(T));
-      ams_rm.copy(outputs[i + 4], second_model_out, SIZE * sizeof(T));
+      ams_rm.copy(outputs[i],
+                  resource,
+                  first_model_out,
+                  AMSResourceType::AMS_HOST,
+                  SIZE);
+      ams_rm.copy(outputs[i + 4],
+                  resource,
+                  second_model_out,
+                  AMSResourceType::AMS_HOST,
+                  SIZE);
     }
 
     for (int j = 0; j < SIZE; j++) {
diff --git a/tests/AMSlib/cpu_packing_test.cpp b/tests/AMSlib/cpu_packing_test.cpp
index a52c5966..9cd780cf 100644
--- a/tests/AMSlib/cpu_packing_test.cpp
+++ b/tests/AMSlib/cpu_packing_test.cpp
@@ -108,8 +108,16 @@ int main(int argc, char* argv[])
     double* rsparse = rm.allocate<double>(SIZE, resource);
     int* reindex = rm.allocate<int>(SIZE, resource);
 
-    rm.copy(h_predicate, predicate);
-    rm.copy(h_sparse, sparse);
+    rm.copy(h_predicate,
+            AMSResourceType::AMS_HOST,
+            predicate,
+            AMSResourceType::AMS_DEVICE,
+            SIZE);
+    rm.copy(h_sparse,
+            AMSResourceType::AMS_HOST,
+            sparse,
+            AMSResourceType::AMS_DEVICE,
+            SIZE);
 
     std::vector<const double*> s_data({const_cast<const double*>(sparse)});
     std::vector<double*> sr_data({rsparse});
@@ -126,7 +134,11 @@ int main(int argc, char* argv[])
         return 1;
       }
 
-      rm.copy(dense, h_dense);
+      rm.copy(dense,
+              AMSResourceType::AMS_DEVICE,
+              h_dense,
+              AMSResourceType::AMS_HOST,
+              elements);
 
       if (verify(h_dense, elements, flag)) {
         std::cout << "Dense elements do not have the correct values\n";
@@ -135,7 +147,11 @@ int main(int argc, char* argv[])
 
       data_handler::unpack(resource, predicate, size, d_data, sr_data, flag);
 
-      rm.copy(rsparse, h_rsparse);
+      rm.copy(rsparse,
+              AMSResourceType::AMS_DEVICE,
+              h_rsparse,
+              AMSResourceType::AMS_HOST,
+              size);
 
       if (verify(h_predicate, h_sparse, h_rsparse, size, flag)) {
         //      for ( int k = 0; k < SIZE; k++){

From 8cce8e87c0b0cbdcf092ed533aa59acee6cb573b Mon Sep 17 00:00:00 2001
From: koparasy <parasyris1@llnl.gov>
Date: Thu, 6 Jun 2024 13:26:09 -0700
Subject: [PATCH 3/5] Fix test for hdcache

---
 tests/AMSlib/CMakeLists.txt   |  5 +++--
 tests/AMSlib/test_hdcache.cpp | 10 ++++++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/tests/AMSlib/CMakeLists.txt b/tests/AMSlib/CMakeLists.txt
index 7b9f8000..1c85e22a 100644
--- a/tests/AMSlib/CMakeLists.txt
+++ b/tests/AMSlib/CMakeLists.txt
@@ -155,8 +155,9 @@ function(ADDTEST exe test_name)
   endif()
 endfunction()
 
-BUILD_TEST(ams_allocator_test ams_allocate.cpp)
-ADDTEST(ams_allocator_test AMSAllocate)
+# This test requires Allocate
+#BUILD_TEST(ams_allocator_test ams_allocate.cpp)
+#ADDTEST(ams_allocator_test AMSAllocate)
 BUILD_TEST(ams_packing_test cpu_packing_test.cpp AMSPack)
 ADDTEST(ams_packing_test AMSPack)
 
diff --git a/tests/AMSlib/test_hdcache.cpp b/tests/AMSlib/test_hdcache.cpp
index 865cc517..f87d5232 100644
--- a/tests/AMSlib/test_hdcache.cpp
+++ b/tests/AMSlib/test_hdcache.cpp
@@ -97,8 +97,10 @@ bool do_faiss(std::shared_ptr<HDCache<T>> &index,
     for (int i = 0; i < orig_data.size(); i++) {
       T *d_data = rm.allocate<T>(nClusters * nElements, resource);
       rm.copy(const_cast<T *>(orig_data[i]),
+              AMSResourceType::AMS_HOST,
               d_data,
-              nClusters * nElements * sizeof(T));
+              AMSResourceType::AMS_DEVICE,
+              nClusters * nElements);
       data[i] = d_data;
     }
   }
@@ -111,7 +113,11 @@ bool do_faiss(std::shared_ptr<HDCache<T>> &index,
   if (resource == AMSResourceType::AMS_DEVICE) {
     h_predicates =
         rm.allocate<bool>(nClusters * nElements, AMSResourceType::AMS_HOST);
-    rm.copy(predicates, h_predicates, nClusters * nElements);
+    rm.copy(predicates,
+            AMSResourceType::AMS_DEVICE,
+            h_predicates,
+            AMSResourceType::AMS_HOST,
+            nClusters * nElements);
     for (auto d : data) {
       rm.deallocate(const_cast<T *>(d), AMSResourceType::AMS_DEVICE);
     }

From 636747d2bb929c3d13335a78c1ce08dae235d4a3 Mon Sep 17 00:00:00 2001
From: koparasy <parasyris1@llnl.gov>
Date: Thu, 6 Jun 2024 14:45:19 -0700
Subject: [PATCH 4/5] Multples of 8 for any alignment

---
 src/AMSlib/wf/resource_manager.cpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/AMSlib/wf/resource_manager.cpp b/src/AMSlib/wf/resource_manager.cpp
index 8c08d287..a19deac7 100644
--- a/src/AMSlib/wf/resource_manager.cpp
+++ b/src/AMSlib/wf/resource_manager.cpp
@@ -18,6 +18,12 @@
 namespace ams
 {
 
+template<typename T>
+static T roundUp(T num_to_round, int multiple) 
+{
+    return ((num_to_round + multiple- 1) / multiple) * multiple;
+}
+
 std::string AMSAllocator::getName() { return name; }
 
 
@@ -34,7 +40,7 @@ struct AMSDefaultHostAllocator final : AMSAllocator {
   AMSDefaultHostAllocator(std::string name) : AMSAllocator(name) {}
   ~AMSDefaultHostAllocator() = default;
 
-  void *allocate(size_t num_bytes) { return aligned_alloc(8, num_bytes); }
+  void *allocate(size_t num_bytes) { return aligned_alloc(8, roundUp(num_bytes, 8)); }
 
   void deallocate(void *ptr) { free(ptr); }
 };

From e448ed564e6f7146477a99c5d03acdda88254f9a Mon Sep 17 00:00:00 2001
From: koparasy <parasyris1@llnl.gov>
Date: Mon, 10 Jun 2024 09:05:59 -0700
Subject: [PATCH 5/5] Fix comments

---
 src/AMSlib/AMS.cpp                 |   6 +-
 src/AMSlib/wf/device.hpp           |  16 +++--
 src/AMSlib/wf/resource_manager.cpp | 102 +++++++++++++++--------------
 src/AMSlib/wf/resource_manager.hpp |  24 +++----
 tests/AMSlib/CMakeLists.txt        |   1 +
 5 files changed, 83 insertions(+), 66 deletions(-)

diff --git a/src/AMSlib/AMS.cpp b/src/AMSlib/AMS.cpp
index d78538d3..5c5d3066 100644
--- a/src/AMSlib/AMS.cpp
+++ b/src/AMSlib/AMS.cpp
@@ -389,8 +389,9 @@ class AMSWrap
       } else if (log_prefix.find("<PID>") != std::string::npos) {
         pattern = std::string("<PID>");
         id = getpid();
-      }  // Combine hostname and pid
+      }
 
+      // Combine hostname and pid
       std::ostringstream combined;
       combined << "." << hostname << "." << id;
 
@@ -665,7 +666,8 @@ const char *AMSGetAllocatorName(AMSResourceType device)
 void AMSSetAllocator(AMSResourceType resource, const char *alloc_name)
 {
   auto &rm = ams::ResourceManager::getInstance();
-  rm.setAllocator(std::string(alloc_name), resource);
+  std::string alloc(alloc_name);
+  rm.setAllocator(alloc, resource);
 }
 
 AMSCAbstrModel AMSRegisterAbstractModel(const char *domain_name,
diff --git a/src/AMSlib/wf/device.hpp b/src/AMSlib/wf/device.hpp
index a7dd7bdf..d197bd21 100644
--- a/src/AMSlib/wf/device.hpp
+++ b/src/AMSlib/wf/device.hpp
@@ -16,6 +16,8 @@
 #include "AMS.h"
 #include "wf/debug.h"
 
+#define UNDEFINED_FUNC -1
+
 #ifdef __ENABLE_CUDA__
 namespace ams
 {
@@ -254,6 +256,7 @@ inline void computePredicate(float *data,
                              const size_t kneigh,
                              float threshold)
 {
+  FATAL(Device, "Called device code when CUDA disabled");
   return;
 }
 
@@ -265,6 +268,7 @@ inline void linearize(TypeOutValue *output,
                       size_t dims,
                       size_t elements)
 {
+  FATAL(Device, "Called device code when CUDA disabled");
   return;
 }
 
@@ -277,7 +281,8 @@ inline int pack(bool cond,
                 TypeValue **dense,
                 int dims)
 {
-  return -1;
+  FATAL(Device, "Called device code when CUDA disabled");
+  return UNDEFINED_FUNC;
 }
 
 template <typename TypeValue>
@@ -290,7 +295,8 @@ inline int pack(bool cond,
                 int *sparse_indices,
                 int dims)
 {
-  return -1;
+  FATAL(Device, "Called device code when CUDA disabled");
+  return UNDEFINED_FUNC;
 }
 
 template <typename TypeValue>
@@ -302,7 +308,8 @@ inline int unpack(bool cond,
                   TypeValue **dense,
                   int dims)
 {
-  return -1;
+  FATAL(Device, "Called device code when CUDA disabled");
+  return UNDEFINED_FUNC;
 }
 
 template <typename TypeValue>
@@ -314,7 +321,8 @@ inline int unpack(bool cond,
                   int *sparse_indices,
                   int dims)
 {
-  return -1;
+  FATAL(Device, "Called device code when CUDA disabled");
+  return UNDEFINED_FUNC;
 }
 
 }  // namespace Device
diff --git a/src/AMSlib/wf/resource_manager.cpp b/src/AMSlib/wf/resource_manager.cpp
index a19deac7..8b9da265 100644
--- a/src/AMSlib/wf/resource_manager.cpp
+++ b/src/AMSlib/wf/resource_manager.cpp
@@ -7,9 +7,6 @@
 
 #include <cstdlib>
 #include <cstring>
-#include <umpire/ResourceManager.hpp>
-#include <umpire/Umpire.hpp>
-#include <umpire/strategy/QuickPool.hpp>
 
 #include "debug.h"
 #include "device.hpp"
@@ -18,13 +15,13 @@
 namespace ams
 {
 
-template<typename T>
-static T roundUp(T num_to_round, int multiple) 
+template <typename T>
+static T roundUp(T num_to_round, int multiple)
 {
-    return ((num_to_round + multiple- 1) / multiple) * multiple;
+  return ((num_to_round + multiple - 1) / multiple) * multiple;
 }
 
-std::string AMSAllocator::getName() { return name; }
+const std::string AMSAllocator::getName() const { return name; }
 
 
 struct AMSDefaultDeviceAllocator final : AMSAllocator {
@@ -40,7 +37,10 @@ struct AMSDefaultHostAllocator final : AMSAllocator {
   AMSDefaultHostAllocator(std::string name) : AMSAllocator(name) {}
   ~AMSDefaultHostAllocator() = default;
 
-  void *allocate(size_t num_bytes) { return aligned_alloc(8, roundUp(num_bytes, 8)); }
+  void *allocate(size_t num_bytes)
+  {
+    return aligned_alloc(8, roundUp(num_bytes, 8));
+  }
 
   void deallocate(void *ptr) { free(ptr); }
 };
@@ -63,51 +63,55 @@ void _raw_copy(void *src,
                AMSResourceType dest_dev,
                size_t num_bytes)
 {
-  if (src_dev == AMSResourceType::AMS_HOST) {
-    if (dest_dev == AMSResourceType::AMS_HOST) {
-      std::memcpy(dest, src, num_bytes);
-    } else if (dest_dev == AMSResourceType::AMS_DEVICE) {
-      HtoDMemcpy(dest, src, num_bytes);
-    } else if (dest_dev == AMSResourceType::AMS_PINNED) {
-      std::memcpy(dest, src, num_bytes);
-    } else {
-      FATAL(AMSResource, "Unknown copy dest")
-    }
-  } else if (src_dev == AMSResourceType::AMS_DEVICE) {
-    if (dest_dev == AMSResourceType::AMS_HOST) {
-      DtoHMemcpy(dest, src, num_bytes);
-    } else if (dest_dev == AMSResourceType::AMS_DEVICE) {
-      DtoDMemcpy(dest, src, num_bytes);
-    } else if (dest_dev == AMSResourceType::AMS_PINNED) {
-      DtoHMemcpy(dest, src, num_bytes);
-    } else {
-      FATAL(AMSResource, "Unknown copy dest")
-    }
-  } else if (src_dev == AMSResourceType::AMS_PINNED) {
-    if (dest_dev == AMSResourceType::AMS_HOST) {
-      std::memcpy(dest, src, num_bytes);
-    } else if (dest_dev == AMSResourceType::AMS_DEVICE) {
-      HtoDMemcpy(dest, src, num_bytes);
-    } else if (dest_dev == AMSResourceType::AMS_PINNED) {
-      std::memcpy(dest, src, num_bytes);
-    } else {
-      FATAL(AMSResource, "Unknown copy dest")
-    }
+  switch (src_dev) {
+    case AMSResourceType::AMS_HOST:
+    case AMSResourceType::AMS_PINNED:
+      switch (dest_dev) {
+        case AMSResourceType::AMS_HOST:
+        case AMSResourceType::AMS_PINNED:
+          std::memcpy(dest, src, num_bytes);
+          break;
+        case AMSResourceType::AMS_DEVICE:
+          HtoDMemcpy(dest, src, num_bytes);
+          break;
+        default:
+          FATAL(ResourceManager, "Unknown device type to copy to from HOST");
+          break;
+      }
+      break;
+    case AMSResourceType::AMS_DEVICE:
+      switch (dest_dev) {
+        case AMSResourceType::AMS_DEVICE:
+          DtoDMemcpy(dest, src, num_bytes);
+          break;
+        case AMSResourceType::AMS_HOST:
+        case AMSResourceType::AMS_PINNED:
+          DtoHMemcpy(dest, src, num_bytes);
+          break;
+        default:
+          FATAL(ResourceManager, "Unknown device type to copy to from DEVICE");
+          break;
+      }
+    default:
+      FATAL(ResourceManager, "Unknown device type to copy from");
   }
 }
 
-AMSAllocator *_get_allocator(std::string alloc_name, AMSResourceType resource)
+AMSAllocator *_get_allocator(std::string &alloc_name, AMSResourceType resource)
 {
-  if (resource == AMSResourceType::AMS_DEVICE) {
-    return new AMSDefaultDeviceAllocator(alloc_name);
-  } else if (resource == AMSResourceType::AMS_HOST) {
-    return new AMSDefaultHostAllocator(alloc_name);
-  } else if (resource == AMSResourceType::AMS_PINNED) {
-    return new AMSDefaultPinnedAllocator(alloc_name);
-  } else {
-    FATAL(ResourceManager,
-          "Requested allocator %s for Unknown resource type",
-          alloc_name.c_str());
+  switch (resource) {
+    case AMSResourceType::AMS_DEVICE:
+      return new AMSDefaultDeviceAllocator(alloc_name);
+      break;
+    case AMSResourceType::AMS_HOST:
+      return new AMSDefaultHostAllocator(alloc_name);
+      break;
+    case AMSResourceType::AMS_PINNED:
+      return new AMSDefaultPinnedAllocator(alloc_name);
+      break;
+    default:
+      FATAL(ResourceManager,
+            "Unknown resource type to create an allocator for");
   }
 }
 
diff --git a/src/AMSlib/wf/resource_manager.hpp b/src/AMSlib/wf/resource_manager.hpp
index 88d44447..b86fdb6b 100644
--- a/src/AMSlib/wf/resource_manager.hpp
+++ b/src/AMSlib/wf/resource_manager.hpp
@@ -9,6 +9,7 @@
 #define __AMS_ALLOCATOR__
 
 #include <cstddef>
+#include <string>
 #include <vector>
 
 #include "AMS.h"
@@ -27,7 +28,7 @@ void _raw_copy(void* src,
                AMSResourceType dest_dev,
                size_t num_bytes);
 
-AMSAllocator* _get_allocator(std::string alloc_name, AMSResourceType resource);
+AMSAllocator* _get_allocator(std::string& alloc_name, AMSResourceType resource);
 void _release_allocator(AMSAllocator* allocator);
 
 }  // namespace internal
@@ -40,16 +41,15 @@ void _release_allocator(AMSAllocator* allocator);
 
 struct AMSAllocator {
   std::string name;
-  AMSAllocator(std::string alloc_name) : name(alloc_name) {}
+  AMSAllocator(std::string& alloc_name) : name(alloc_name) {}
   virtual ~AMSAllocator() = default;
 
   virtual void* allocate(size_t num_bytes) = 0;
   virtual void deallocate(void* ptr) = 0;
 
-  std::string getName();
+  const std::string getName() const;
 };
 
-
 class ResourceManager
 {
 private:
@@ -77,7 +77,7 @@ class ResourceManager
   }
 
   /** @brief return the name of an allocator */
-  std::string getAllocatorName(AMSResourceType resource)
+  const std::string getAllocatorName(AMSResourceType resource) const
   {
     return RMAllocators[resource]->getName();
   }
@@ -113,7 +113,7 @@ class ResourceManager
    *  @tparam TypeInValue type of pointers
    *  @param[in] src Source memory pointer.
    *  @param[out] dest destination memory pointer.
-   *  @param[in] size number of values to copy.
+   *  @param[in] size number of values to copy (It performs a shallow copy of nested pointers).
    *  @return void.
    */
   template <typename TypeInValue>
@@ -146,18 +146,21 @@ class ResourceManager
   void init()
   {
     DBG(ResourceManager, "Initialization of allocators");
+    std::string host_alloc("HOST");
+    std::string device_alloc("DEVICE");
+    std::string pinned_alloc("PINNED");
     if (!RMAllocators[AMSResourceType::AMS_HOST])
-      setAllocator("HOST", AMSResourceType::AMS_HOST);
+      setAllocator(host_alloc, AMSResourceType::AMS_HOST);
 #ifdef __ENABLE_CUDA__
     if (!RMAllocators[AMSResourceType::AMS_DEVICE])
-      setAllocator("DEVICE", AMSResourceType::AMS_DEVICE);
+      setAllocator(host_alloc, AMSResourceType::AMS_DEVICE);
 
     if (!RMAllocators[AMSResourceType::AMS_PINNED])
-      setAllocator("PINNED", AMSResourceType::AMS_PINNED);
+      setAllocator(pinned_alloc, AMSResourceType::AMS_PINNED);
 #endif
   }
 
-  void setAllocator(std::string alloc_name, AMSResourceType resource)
+  void setAllocator(std::string& alloc_name, AMSResourceType resource)
   {
     if (RMAllocators[resource]) {
       delete RMAllocators[resource];
@@ -194,7 +197,6 @@ class ResourceManager
   //! ------------------------------------------------------------------------
 };
 
-
 }  // namespace ams
 
 #endif
diff --git a/tests/AMSlib/CMakeLists.txt b/tests/AMSlib/CMakeLists.txt
index 1c85e22a..5571a922 100644
--- a/tests/AMSlib/CMakeLists.txt
+++ b/tests/AMSlib/CMakeLists.txt
@@ -156,6 +156,7 @@ function(ADDTEST exe test_name)
 endfunction()
 
 # This test requires Allocate
+# TODO: Include tests once we re-instate a pool
 #BUILD_TEST(ams_allocator_test ams_allocate.cpp)
 #ADDTEST(ams_allocator_test AMSAllocate)
 BUILD_TEST(ams_packing_test cpu_packing_test.cpp AMSPack)