From e93c32d99b91ef8df88079aa678dc755be0e0ef2 Mon Sep 17 00:00:00 2001 From: koparasy Date: Fri, 24 May 2024 15:09:19 -0700 Subject: [PATCH 1/5] Fixes issues raised on the example code and issues of GPU enabled executions --- src/AMSlib/wf/basedb.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/AMSlib/wf/basedb.hpp b/src/AMSlib/wf/basedb.hpp index d77af6a6..ebcc50ef 100644 --- a/src/AMSlib/wf/basedb.hpp +++ b/src/AMSlib/wf/basedb.hpp @@ -2377,7 +2377,6 @@ class RMQInterface } }; - /* A class that provides a BaseDB interface to AMS workflow. * When storing data it pushes the data to the RMQ server asynchronously */ From 4c1148d16a5f659e19929e2e573567650b820129 Mon Sep 17 00:00:00 2001 From: koparasy Date: Mon, 3 Jun 2024 09:36:25 -0700 Subject: [PATCH 2/5] Remove umpire as an allocator and move device code to a single file --- src/AMSlib/AMS.cpp | 4 +- src/AMSlib/CMakeLists.txt | 40 +- src/AMSlib/ml/hdcache.hpp | 5 +- src/AMSlib/ml/random_uq.hpp | 21 +- src/AMSlib/ml/surrogate.hpp | 61 +-- src/AMSlib/wf/basedb.hpp | 19 +- .../wf/cuda/{utilities.cuh => utilities.cpp} | 400 +++++++++++++++--- src/AMSlib/wf/device.hpp | 398 +++++++++-------- src/AMSlib/wf/resource_manager.cpp | 111 +++-- src/AMSlib/wf/resource_manager.hpp | 91 ++-- src/AMSlib/wf/workflow.hpp | 12 +- tests/AMSlib/ams_ete.cpp | 8 - tests/AMSlib/ams_ete_env.cpp | 12 +- tests/AMSlib/ams_update_model.cpp | 12 +- tests/AMSlib/cpu_packing_test.cpp | 24 +- 15 files changed, 783 insertions(+), 435 deletions(-) rename src/AMSlib/wf/cuda/{utilities.cuh => utilities.cpp} (52%) diff --git a/src/AMSlib/AMS.cpp b/src/AMSlib/AMS.cpp index 267d5d59..d78538d3 100644 --- a/src/AMSlib/AMS.cpp +++ b/src/AMSlib/AMS.cpp @@ -389,8 +389,8 @@ class AMSWrap } else if (log_prefix.find("") != std::string::npos) { pattern = std::string(""); id = getpid(); - } - // Combine hostname and pid + } // Combine hostname and pid + std::ostringstream combined; combined << "." << hostname << "." << id; diff --git a/src/AMSlib/CMakeLists.txt b/src/AMSlib/CMakeLists.txt index 8c265aae..35159b0c 100644 --- a/src/AMSlib/CMakeLists.txt +++ b/src/AMSlib/CMakeLists.txt @@ -8,27 +8,35 @@ file(GLOB_RECURSE MINIAPP_INCLUDES "*.hpp") #set global library path to link with tests if necessary set(LIBRARY_OUTPUT_PATH ${AMS_LIB_OUT_PATH}) set(AMS_LIB_SRC ${MINIAPP_INCLUDES} AMS.cpp wf/resource_manager.cpp wf/debug.cpp wf/basedb.cpp wf/logger.cpp) + +if (WITH_CUDA) + list(APPEND AMS_LIB_SRC wf/cuda/utilities.cpp) + message(WARNING "FILES ARE ${AMS_LIB_SRC}") +endif() + + # two targets: a shared lib and an exec add_library(AMS ${AMS_LIB_SRC} ${MINIAPP_INCLUDES}) # ------------------------------------------------------------------------------ if (WITH_CUDA) - set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH}) - - # if (BUILD_SHARED_LIBS) - # set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - # else() - # set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON) - # set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) - # endif() - - set_source_files_properties(AMS.cpp PROPERTIES LANGUAGE CUDA) - set_source_files_properties(AMS.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH}) - set_source_files_properties(AMS.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda") - - if (WITH_PERFFLOWASPECT) - set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so") - set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so") + + set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH}) + + # if (BUILD_SHARED_LIBS) + # set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + # else() + # set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + # set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON) + # endif() + + set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES LANGUAGE CUDA) + set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH}) + set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda") + + if (WITH_PERFFLOWASPECT) + set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so") + set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so") endif() endif() diff --git a/src/AMSlib/ml/hdcache.hpp b/src/AMSlib/ml/hdcache.hpp index 33949db0..31719979 100644 --- a/src/AMSlib/ml/hdcache.hpp +++ b/src/AMSlib/ml/hdcache.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #ifdef __ENABLE_FAISS__ @@ -29,6 +30,8 @@ #include #include #include + +#include "wf/device.hpp" #endif #endif @@ -366,7 +369,7 @@ class HDCache _evaluate(ndata, data, is_acceptable); if (cache_location == AMSResourceType::AMS_DEVICE) { - deviceCheckErrors(__FILE__, __LINE__); + ams::deviceCheckErrors(__FILE__, __LINE__); } DBG(UQModule, "Done with evalution of uq") diff --git a/src/AMSlib/ml/random_uq.hpp b/src/AMSlib/ml/random_uq.hpp index c031e8ba..dc3bbb8d 100644 --- a/src/AMSlib/ml/random_uq.hpp +++ b/src/AMSlib/ml/random_uq.hpp @@ -11,13 +11,9 @@ #include "AMS.h" #include "wf/debug.h" +#include "wf/device.hpp" #include "wf/utils.hpp" -static size_t round_up(size_t num, size_t denom) -{ - return (num + denom - 1) / denom; -} - class RandomUQ { public: @@ -25,20 +21,7 @@ class RandomUQ inline void evaluate(const size_t ndata, bool *is_acceptable) { if (resourceLocation == AMSResourceType::AMS_DEVICE) { -#ifdef __ENABLE_CUDA__ - //TODO: Move all of this code on device.cpp and provide better logic regarding - // number of threads - size_t threads = 256; - size_t blocks = round_up(ndata, threads); - random_uq_device<<>>(seed, - is_acceptable, - ndata, - threshold); - seed = seed + 1; -#else - THROW(std::runtime_error, - "Random-uq is not configured to use device allocations"); -#endif + ams::device_random_uq(seed, is_acceptable, ndata, threshold); } else { random_uq_host(is_acceptable, ndata, threshold); } diff --git a/src/AMSlib/ml/surrogate.hpp b/src/AMSlib/ml/surrogate.hpp index 4f168469..b194de94 100644 --- a/src/AMSlib/ml/surrogate.hpp +++ b/src/AMSlib/ml/surrogate.hpp @@ -15,7 +15,7 @@ #include #include "AMS.h" -#include "wf/cuda/utilities.cuh" +#include "wf/device.hpp" #ifdef __ENABLE_TORCH__ #include @@ -100,43 +100,12 @@ class SurrogateModel { // Transpose to get continuous memory and // perform single memcpy. + auto& rm = ams::ResourceManager::getInstance(); tensor = tensor.transpose(1, 0); - if (model_resource == AMSResourceType::AMS_HOST) { - for (long j = 0; j < numCols; j++) { - auto tmp = tensor[j].contiguous(); - TypeInValue* ptr = tmp.data_ptr(); - HtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows); - } - } else { - for (long j = 0; j < numCols; j++) { - auto tmp = tensor[j].contiguous(); - TypeInValue* ptr = tmp.data_ptr(); - DtoDMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows); - } - } - } - - PERFFASPECT() - inline void tensorToHostArray(at::Tensor tensor, - long numRows, - long numCols, - TypeInValue** array) - { - // Transpose to get continuous memory and - // perform single memcpy. - tensor = tensor.transpose(1, 0); - if (model_resource == AMSResourceType::AMS_HOST) { - for (long j = 0; j < numCols; j++) { - auto tmp = tensor[j].contiguous(); - TypeInValue* ptr = tmp.data_ptr(); - HtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows); - } - } else { - for (long j = 0; j < numCols; j++) { - auto tmp = tensor[j].contiguous(); - TypeInValue* ptr = tmp.data_ptr(); - DtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows); - } + for (long j = 0; j < numCols; j++) { + auto tmp = tensor[j].contiguous(); + TypeInValue* ptr = tmp.data_ptr(); + rm.copy(ptr, model_resource, array[j], model_resource, numRows); } } @@ -216,13 +185,9 @@ class SurrogateModel if (model_resource == AMSResourceType::AMS_DEVICE) { #ifdef __ENABLE_CUDA__ DBG(Surrogate, "Compute mean delta uq predicates on device\n"); - constexpr int block_size = 256; - int grid_size = divup(nrows, block_size); - computeDeltaUQMeanPredicatesKernel<<>>( - outputs_stdev, predicates, nrows, ncols, threshold); // TODO: use combined routine when it lands. - cudaDeviceSynchronize(); - CUDACHECKERROR(); + ams::Device::computeDeltaUQMeanPredicatesDevice( + outputs_stdev, predicates, nrows, ncols, threshold); #else THROW(std::runtime_error, "Expected CUDA is enabled when model data are on DEVICE"); @@ -235,13 +200,9 @@ class SurrogateModel if (model_resource == AMSResourceType::AMS_DEVICE) { #ifdef __ENABLE_CUDA__ DBG(Surrogate, "Compute max delta uq predicates on device\n"); - constexpr int block_size = 256; - int grid_size = divup(nrows, block_size); - computeDeltaUQMaxPredicatesKernel<<>>( - outputs_stdev, predicates, nrows, ncols, threshold); // TODO: use combined routine when it lands. - cudaDeviceSynchronize(); - CUDACHECKERROR(); + ams::Device::computeDeltaUQMaxPredicatesDevice( + outputs_stdev, predicates, nrows, ncols, threshold); #else THROW(std::runtime_error, "Expected CUDA is enabled when model data are on DEVICE"); @@ -306,7 +267,7 @@ class SurrogateModel } if (is_device()) { - deviceCheckErrors(__FILE__, __LINE__); + ams::deviceCheckErrors(__FILE__, __LINE__); } DBG(Surrogate, diff --git a/src/AMSlib/wf/basedb.hpp b/src/AMSlib/wf/basedb.hpp index ebcc50ef..d6acc086 100644 --- a/src/AMSlib/wf/basedb.hpp +++ b/src/AMSlib/wf/basedb.hpp @@ -1962,14 +1962,8 @@ class RMQPublisherHandler final : public AMQP::LibEventHandler msg_id) auto& msg = *it; auto& rm = ams::ResourceManager::getInstance(); - try { - rm.deallocate(msg.data(), AMSResourceType::AMS_HOST); - } catch (const umpire::util::Exception& e) { - FATAL(RMQPublisherHandler, - "Failed to deallocate #%d (%p)", - msg.id(), - msg.data()); - } + rm.deallocate(msg.data(), AMSResourceType::AMS_HOST); + DBG(RMQPublisherHandler, "Deallocated msg #%d (%p)", msg.id(), msg.data()) buf.erase(it); } @@ -1984,14 +1978,7 @@ class RMQPublisherHandler final : public AMQP::LibEventHandler auto& rm = ams::ResourceManager::getInstance(); for (auto& dp : buffer) { DBG(RMQPublisherHandler, "deallocate msg #%d (%p)", dp.id(), dp.data()) - try { - rm.deallocate(dp.data(), AMSResourceType::AMS_HOST); - } catch (const umpire::util::Exception& e) { - FATAL(RMQPublisherHandler, - "Failed to deallocate msg #%d (%p)", - dp.id(), - dp.data()); - } + rm.deallocate(dp.data(), AMSResourceType::AMS_HOST); } buffer.clear(); } diff --git a/src/AMSlib/wf/cuda/utilities.cuh b/src/AMSlib/wf/cuda/utilities.cpp similarity index 52% rename from src/AMSlib/wf/cuda/utilities.cuh rename to src/AMSlib/wf/cuda/utilities.cpp index c3c688ab..0af826da 100644 --- a/src/AMSlib/wf/cuda/utilities.cuh +++ b/src/AMSlib/wf/cuda/utilities.cpp @@ -8,8 +8,8 @@ #ifndef __DEVICE_UTILITIES__ #define __DEVICE_UTILITIES__ -#ifdef __ENABLE_CUDA__ +#include #include #include #include @@ -17,11 +17,14 @@ #include +#include "wf/device.hpp" #include "wf/resource_manager.hpp" -//#include -//#include -// +namespace ams +{ +namespace Device +{ + const int warpSize = 32; const unsigned int fullMask = 0xffffffff; @@ -168,14 +171,14 @@ __global__ void assignK(T** sparse, } template -__global__ void compactK(bool cond, - T** d_input, - T** d_output, - const bool* predicates, - const size_t length, - int dims, - int* d_BlocksOffset, - bool reverse) +__global__ void device_compactK(bool cond, + T** d_input, + T** d_output, + const bool* predicates, + const size_t length, + int dims, + int* d_BlocksOffset, + bool reverse) { int idx = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ int warpTotals[]; @@ -279,19 +282,20 @@ void __global__ compute_predicate(float* data, } template -int compact(bool cond, - const T** sparse, - T** dense, - const bool* dPredicate, - const size_t length, - int dims, - int blockSize, - bool isReverse = false) +int device_compact(bool cond, + const T** sparse, + T** dense, + const bool* dPredicate, + const size_t length, + int dims, + int blockSize, + bool isReverse) { int numBlocks = divup(length, blockSize); auto& rm = ams::ResourceManager::getInstance(); int* d_BlocksCount = rm.allocate(numBlocks, AMSResourceType::AMS_DEVICE); - int* d_BlocksOffset = rm.allocate(numBlocks, AMSResourceType::AMS_DEVICE); + int* d_BlocksOffset = + rm.allocate(numBlocks, AMSResourceType::AMS_DEVICE); // determine number of elements in the compacted list int* h_BlocksCount = rm.allocate(numBlocks, AMSResourceType::AMS_HOST); int* h_BlocksOffset = rm.allocate(numBlocks, AMSResourceType::AMS_HOST); @@ -299,10 +303,18 @@ int compact(bool cond, T** d_dense = rm.allocate(dims, AMSResourceType::AMS_DEVICE); T** d_sparse = rm.allocate(dims, AMSResourceType::AMS_DEVICE); - rm.registerExternal(dense, sizeof(T*) * dims, AMSResourceType::AMS_HOST); - rm.registerExternal(sparse, sizeof(T*) * dims, AMSResourceType::AMS_HOST); - rm.copy(dense, d_dense); - rm.copy(const_cast(sparse), d_sparse); + + rm.copy(dense, + AMSResourceType::AMS_HOST, + d_dense, + AMSResourceType::AMS_DEVICE, + dims); + rm.copy(const_cast(sparse), + AMSResourceType::AMS_HOST, + d_sparse, + AMSResourceType::AMS_DEVICE, + dims); + thrust::device_ptr thrustPrt_bCount(d_BlocksCount); thrust::device_ptr thrustPrt_bOffset(d_BlocksOffset); @@ -319,20 +331,30 @@ int compact(bool cond, d_BlocksOffset); //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements - compactK<<>>( - cond, - d_sparse, - d_dense, - dPredicate, - length, - dims, - d_BlocksOffset, - isReverse); + device_compactK<<>>(cond, + d_sparse, + d_dense, + dPredicate, + length, + dims, + d_BlocksOffset, + isReverse); cudaDeviceSynchronize(); CUDACHECKERROR(); - rm.copy(d_BlocksCount, h_BlocksCount); - rm.copy(d_BlocksOffset, h_BlocksOffset); + rm.copy(d_BlocksCount, + AMSResourceType::AMS_DEVICE, + h_BlocksCount, + AMSResourceType::AMS_HOST, + numBlocks); + rm.copy(d_BlocksOffset, + AMSResourceType::AMS_DEVICE, + h_BlocksOffset, + AMSResourceType::AMS_HOST, + numBlocks); + int compact_length = h_BlocksOffset[numBlocks - 1] + thrustPrt_bCount[numBlocks - 1]; @@ -345,8 +367,6 @@ int compact(bool cond, rm.deallocate(d_dense, AMSResourceType::AMS_DEVICE); rm.deallocate(d_sparse, AMSResourceType::AMS_DEVICE); - rm.deregisterExternal(dense); - rm.deregisterExternal(sparse); cudaDeviceSynchronize(); CUDACHECKERROR(); @@ -354,15 +374,15 @@ int compact(bool cond, } template -int compact(bool cond, - T** sparse, - T** dense, - int* indices, - const size_t length, - int dims, - int blockSize, - const bool* dPredicate, - bool isReverse = false) +int device_compact(bool cond, + T** sparse, + T** dense, + int* indices, + const size_t length, + int dims, + int blockSize, + const bool* dPredicate, + bool isReverse) { int numBlocks = divup(length, blockSize); size_t sparseElements = length; @@ -442,6 +462,7 @@ void cuda_rand_init(bool* predicate, const size_t length, T threshold) CUDACHECKERROR(); } + void device_compute_predicate(float* data, bool* predicate, size_t nData, @@ -459,6 +480,289 @@ void device_compute_predicate(float* data, CUDACHECKERROR(); } -#endif +__global__ void random_uq_device(int seed, + bool* uq_flags, + int ndata, + double acceptable_error) +{ + + /* CUDA's random number library uses curandState_t to keep track of the seed + value we will store a random state for every thread */ + curandState_t state; + int id = threadIdx.x + blockDim.x * blockIdx.x; + + if (id >= ndata) return; + + /* we have to initialize the state */ + curand_init( + seed + + id, /* the seed controls the sequence of random values that are produced */ + 0, /* the sequence number is only important with multiple cores */ + 0, /* the offset is how much extra we advance in the sequence for each + call, can be 0 */ + &state); + + float x = curand_uniform(&state); + uq_flags[id] = (x <= acceptable_error); +} + + +template +__global__ void computeDeltaUQMeanPredicatesKernel( + const scalar_t* __restrict__ outputs_stdev, + bool* __restrict__ predicates, + const size_t nrows, + const size_t ncols, + const double threshold) +{ + + size_t idx = blockDim.x * blockIdx.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + // Compute mean over columns, strided loop. + for (size_t i = idx; i < nrows; i += stride) { + double mean = 0.0; + for (size_t j = 0; j < ncols; ++j) + mean += outputs_stdev[j + i * ncols]; + mean /= ncols; + + predicates[i] = (mean < threshold); + } +} + +template +__global__ void computeDeltaUQMaxPredicatesKernel( + const scalar_t* __restrict__ outputs_stdev, + bool* __restrict__ predicates, + const size_t nrows, + const size_t ncols, + const double threshold) +{ + + size_t idx = blockDim.x * blockIdx.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + // Compute max delta uq over columns, strided loop. + for (size_t i = idx; i < nrows; i += stride) { + predicates[i] = true; + for (size_t j = 0; j < ncols; ++j) + if (outputs_stdev[j + i * ncols] >= threshold) { + predicates[i] = false; + break; + } + } +} + + +template +void rand_init(bool* predicate, const size_t n, TypeValue threshold) +{ + cuda_rand_init(predicate, n, threshold); + return; +} + +template +void computeDeltaUQMeanPredicatesDevice( + const scalar_t* __restrict__ outputs_stdev, + bool* __restrict__ predicates, + const size_t nrows, + const size_t ncols, + const double threshold) +{ + constexpr int block_size = 256; + int grid_size = divup(nrows, block_size); + computeDeltaUQMeanPredicatesKernel<<>>( + outputs_stdev, predicates, nrows, ncols, threshold); + cudaDeviceSynchronize(); + CUDACHECKERROR(); +}; + +template +void computeDeltaUQMaxPredicatesDevice( + const scalar_t* __restrict__ outputs_stdev, + bool* __restrict__ predicates, + const size_t nrows, + const size_t ncols, + const double threshold) +{ + constexpr int block_size = 256; + int grid_size = divup(nrows, block_size); + computeDeltaUQMaxPredicatesKernel<<>>( + outputs_stdev, predicates, nrows, ncols, threshold); + cudaDeviceSynchronize(); + CUDACHECKERROR(); +} + + +// Specializations + +template void computeDeltaUQMaxPredicatesDevice( + const float* __restrict__ outputs_stdev, + bool* __restrict__ predicates, + const size_t nrows, + const size_t ncols, + const double threshold); + +template void computeDeltaUQMaxPredicatesDevice( + const double* __restrict__ outputs_stdev, + bool* __restrict__ predicates, + const size_t nrows, + const size_t ncols, + const double threshold); + +template void computeDeltaUQMeanPredicatesDevice( + const float* __restrict__ outputs_stdev, + bool* __restrict__ predicates, + const size_t nrows, + const size_t ncols, + const double threshold); + +template void computeDeltaUQMeanPredicatesDevice( + const double* __restrict__ outputs_stdev, + bool* __restrict__ predicates, + const size_t nrows, + const size_t ncols, + const double threshold); + +template void cuda_rand_init(bool* predicate, + const size_t length, + float threshold); + +template void cuda_rand_init(bool* predicate, + const size_t length, + double threshold); + + +template void device_linearize(float* output, + const float* const* inputs, + size_t dims, + size_t elements); + +template void device_linearize(double* output, + const float* const* inputs, + size_t dims, + size_t elements); + +template void device_linearize(double* output, + const double* const* inputs, + size_t dims, + size_t elements); + +template void device_linearize(float* output, + const double* const* inputs, + size_t dims, + size_t elements); + +template int device_compact(bool cond, + const double** sparse, + double** dense, + const bool* dPredicate, + const size_t length, + int dims, + int blockSize, + bool isReverse); + +template int device_compact(bool cond, + const float** sparse, + float** dense, + const bool* dPredicate, + const size_t length, + int dims, + int blockSize, + bool isReverse); + +template int device_compact(bool cond, + double** sparse, + double** dense, + int* indices, + const size_t length, + int dims, + int blockSize, + const bool* dPredicate, + bool isReverse); + +template int device_compact(bool cond, + float** sparse, + float** dense, + int* indices, + const size_t length, + int dims, + int blockSize, + const bool* dPredicate, + bool isReverse); + + +template void rand_init(bool* predicate, + const size_t n, + double threshold); + +template void rand_init(bool* predicate, + const size_t n, + float threshold); + +} // namespace Device + + +void DtoDMemcpy(void* dest, void* src, size_t nBytes) +{ + cudaMemcpy(dest, src, nBytes, cudaMemcpyDeviceToDevice); +} + +void HtoHMemcpy(void* dest, void* src, size_t nBytes) +{ + std::memcpy(dest, src, nBytes); +} + +void HtoDMemcpy(void* dest, void* src, size_t nBytes) +{ + cudaMemcpy(dest, src, nBytes, cudaMemcpyHostToDevice); +}; + +void DtoHMemcpy(void* dest, void* src, size_t nBytes) +{ + cudaMemcpy(dest, src, nBytes, cudaMemcpyDeviceToHost); +} + +void* DeviceAllocate(size_t nBytes) +{ + void* devPtr; + cudaMalloc(&devPtr, nBytes); + return devPtr; +} + +void DeviceFree(void* ptr) +{ + cudaFree(ptr); + return; +} + +void* DevicePinnedAlloc(size_t nBytes) +{ + void* ptr; + cudaHostAlloc(&ptr, nBytes, cudaHostAllocPortable); + return ptr; +} + +void DeviceFreePinned(void* ptr) { cudaFreeHost(ptr); } + + +void deviceCheckErrors(const char* file, int line) +{ + ams::Device::__cudaCheckError(file, line); +} + +void device_random_uq(int seed, + bool* uq_flags, + int ndata, + double acceptable_error) +{ + size_t block_size = 256; + size_t blocks = ams::Device::divup(ndata, block_size); + ams::Device::random_uq_device<<>>(seed, + uq_flags, + ndata, + acceptable_error); +} + + +} // namespace ams #endif diff --git a/src/AMSlib/wf/device.hpp b/src/AMSlib/wf/device.hpp index e990f439..a7dd7bdf 100644 --- a/src/AMSlib/wf/device.hpp +++ b/src/AMSlib/wf/device.hpp @@ -13,183 +13,185 @@ #include #include +#include "AMS.h" #include "wf/debug.h" #ifdef __ENABLE_CUDA__ -#include "cuda/utilities.cuh" -#endif - namespace ams { +void DtoDMemcpy(void *dest, void *src, size_t nBytes); + +void HtoHMemcpy(void *dest, void *src, size_t nBytes); + +void HtoDMemcpy(void *dest, void *src, size_t nBytes); + +void DtoHMemcpy(void *dest, void *src, size_t nBytes); + +void *DeviceAllocate(size_t nBytes); + +void DeviceFree(void *ptr); + +void *DevicePinnedAlloc(size_t nBytes); + +void DeviceFreePinned(void *ptr); + +void deviceCheckErrors(const char *file, int line); + +void device_random_uq(int seed, + bool *uq_flags, + int ndata, + double acceptable_error); + namespace Device { + +template +void computeDeltaUQMeanPredicatesDevice( + const scalar_t *__restrict__ outputs_stdev, + bool *__restrict__ predicates, + const size_t nrows, + const size_t ncols, + const double threshold); + + +template +void computeDeltaUQMaxPredicatesDevice( + const scalar_t *__restrict__ outputs_stdev, + bool *__restrict__ predicates, + const size_t nrows, + const size_t ncols, + const double threshold); + +void device_compute_predicate(float *data, + bool *predicate, + size_t nData, + const size_t kneigh, + float threshold); + +template PERFFASPECT() -void computePredicate(float *data, - bool *predicate, - size_t nData, - const size_t kneigh, - float threshold) -{ -#ifdef __ENABLE_CUDA__ - return device_compute_predicate(data, predicate, nData, kneigh, threshold); -#else - return; -#endif -} +void rand_init(bool *predicate, const size_t n, TypeValue threshold); + +template +void device_linearize(TypeOutValue *output, + const TypeInValue *const *inputs, + size_t dims, + size_t elements); + +template +int device_compact(bool cond, + const T **sparse, + T **dense, + const bool *dPredicate, + const size_t length, + int dims, + int blockSize, + bool isReverse = false); + +template +int device_compact(bool cond, + T **sparse, + T **dense, + int *indices, + const size_t length, + int dims, + int blockSize, + const bool *dPredicate, + bool isReverse = false); + PERFFASPECT() -void computePredicateDeltaUQ() +inline void computePredicate(float *data, + bool *predicate, + size_t nData, + const size_t kneigh, + float threshold) { - THROW(std::runtime_error, - "Computing DeltaUQ predications on device is not supported yet"); + return device_compute_predicate(data, predicate, nData, kneigh, threshold); } + template PERFFASPECT() -void linearize(TypeOutValue *output, - const TypeInValue *const *inputs, - size_t dims, - size_t elements) +inline void linearize(TypeOutValue *output, + const TypeInValue *const *inputs, + size_t dims, + size_t elements) { -#ifdef __ENABLE_CUDA__ return device_linearize(output, inputs, dims, elements); -#else - return; -#endif } template PERFFASPECT() -int pack(bool cond, - const bool *predicate, - const size_t n, - const TypeValue **sparse, - TypeValue **dense, - int dims) +inline int pack(bool cond, + const bool *predicate, + const size_t n, + const TypeValue **sparse, + TypeValue **dense, + int dims) { -#ifdef __ENABLE_CUDA__ - return compact(cond, sparse, dense, predicate, n, dims, 1024); -#else - return 0; -#endif + return device_compact(cond, sparse, dense, predicate, n, dims, 1024); } template PERFFASPECT() -int pack(bool cond, - const bool *predicate, - const size_t n, - TypeValue **sparse, - TypeValue **dense, - int *sparse_indices, - int dims) +inline int pack(bool cond, + const bool *predicate, + const size_t n, + TypeValue **sparse, + TypeValue **dense, + int *sparse_indices, + int dims) { -#ifdef __ENABLE_CUDA__ - return compact(cond, sparse, dense, sparse_indices, n, dims, 1024, predicate); -#else - return 0; -#endif + return device_compact( + cond, sparse, dense, sparse_indices, n, dims, 1024, predicate); } template PERFFASPECT() -int unpack(bool cond, - const bool *predicate, - const size_t n, - TypeValue **sparse, - TypeValue **dense, - int dims) +inline int unpack(bool cond, + const bool *predicate, + const size_t n, + TypeValue **sparse, + TypeValue **dense, + int dims) { -#ifdef __ENABLE_CUDA__ - return compact(cond, - const_cast(sparse), - dense, - predicate, - n, - dims, - 1024, - true); -#else - return 0; -#endif + return device_compact(cond, + const_cast(sparse), + dense, + predicate, + n, + dims, + 1024, + true); } template PERFFASPECT() -int unpack(bool cond, - const size_t n, - TypeValue **sparse, - TypeValue **dense, - int *sparse_indices, - int dims) +inline int unpack(bool cond, + const size_t n, + TypeValue **sparse, + TypeValue **dense, + int *sparse_indices, + int dims) { -#ifdef __ENABLE_CUDA__ - return compact( + return device_compact( cond, sparse, dense, sparse_indices, n, dims, 1024, NULL, true); -#else - return 0; -#endif -} - -template -PERFFASPECT() -void rand_init(bool *predicate, const size_t n, TypeValue threshold) -{ -#ifdef __ENABLE_CUDA__ - cuda_rand_init(predicate, n, threshold); -#endif - return; } } // namespace Device } // namespace ams -void deviceCheckErrors(const char *file, const int line) -{ -#ifdef __ENABLE_CUDA__ - __cudaCheckError(file, line); -#endif - return; -} - - -#ifdef __ENABLE_CUDA__ +#else -#include -#include -PERFFASPECT() -__global__ void random_uq_device(int seed, - bool *uq_flags, - int ndata, - double acceptable_error) +namespace ams { - /* CUDA's random number library uses curandState_t to keep track of the seed - value we will store a random state for every thread */ - curandState_t state; - int id = threadIdx.x + blockDim.x * blockIdx.x; - - if (id >= ndata) return; - /* we have to initialize the state */ - curand_init( - seed + - id, /* the seed controls the sequence of random values that are produced */ - 0, /* the sequence number is only important with multiple cores */ - 0, /* the offset is how much extra we advance in the sequence for each - call, can be 0 */ - &state); - - float x = curand_uniform(&state); - uq_flags[id] = (x <= acceptable_error); -} - - -#include PERFFASPECT() inline void DtoDMemcpy(void *dest, void *src, size_t nBytes) { - cudaMemcpy(dest, src, nBytes, cudaMemcpyDeviceToDevice); + FATAL(Device, "DtoD Memcpy Not Enabled"); } PERFFASPECT() @@ -201,86 +203,124 @@ inline void HtoHMemcpy(void *dest, void *src, size_t nBytes) PERFFASPECT() inline void HtoDMemcpy(void *dest, void *src, size_t nBytes) { - cudaMemcpy(dest, src, nBytes, cudaMemcpyHostToDevice); -}; + FATAL(Device, "HtoD Memcpy Not Enabled"); +} PERFFASPECT() inline void DtoHMemcpy(void *dest, void *src, size_t nBytes) { - cudaMemcpy(dest, src, nBytes, cudaMemcpyDeviceToHost); + FATAL(Device, "DtoH Memcpy Not Enabled"); } -template -__global__ void computeDeltaUQMeanPredicatesKernel( - const scalar_t *__restrict__ outputs_stdev, - bool *__restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold) + +inline void *DeviceAllocate(size_t nBytes) { + FATAL(Device, "DtoH Memcpy Not Enabled"); +} - size_t idx = blockDim.x * blockIdx.x + threadIdx.x; - size_t stride = blockDim.x * gridDim.x; - // Compute mean over columns, strided loop. - for (size_t i = idx; i < nrows; i += stride) { - double mean = 0.0; - for (size_t j = 0; j < ncols; ++j) - mean += outputs_stdev[j + i * ncols]; - mean /= ncols; - - predicates[i] = (mean < threshold); - } + +PERFFASPECT() +inline void DeviceFree(void *ptr) { FATAL(Device, "DtoH Memcpy Not Enabled"); } + +PERFFASPECT() +inline void *DevicePinnedAlloc(size_t nBytes) +{ + FATAL(Device, "Pinned Alloc Not Enabled"); } -template -__global__ void computeDeltaUQMaxPredicatesKernel( - const scalar_t *__restrict__ outputs_stdev, - bool *__restrict__ predicates, - const size_t nrows, - const size_t ncols, - const double threshold) +PERFFASPECT() +inline void DeviceFreePinned(void *ptr) +{ + FATAL(Device, "Pinned Free Pinned Not Enabled"); +} + +inline void device_random_uq(int seed, + bool *uq_flags, + int ndata, + double acceptable_error) { + FATAL(Device, "Called Device Runtime UQ without enabling Device compilation"); +} + + +inline void deviceCheckErrors(const char *file, int line) { return; } - size_t idx = blockDim.x * blockIdx.x + threadIdx.x; - size_t stride = blockDim.x * gridDim.x; - // Compute max delta uq over columns, strided loop. - for (size_t i = idx; i < nrows; i += stride) { - predicates[i] = true; - for (size_t j = 0; j < ncols; ++j) - if (outputs_stdev[j + i * ncols] >= threshold) { - predicates[i] = false; - break; - } - } +namespace Device +{ +PERFFASPECT() +inline void computePredicate(float *data, + bool *predicate, + size_t nData, + const size_t kneigh, + float threshold) +{ + return; } -#else + +template PERFFASPECT() -inline void DtoDMemcpy(void *dest, void *src, size_t nBytes) +inline void linearize(TypeOutValue *output, + const TypeInValue *const *inputs, + size_t dims, + size_t elements) { - std::cerr << "DtoD Memcpy Not Enabled" << std::endl; - exit(-1); + return; } +template PERFFASPECT() -inline void HtoHMemcpy(void *dest, void *src, size_t nBytes) +inline int pack(bool cond, + const bool *predicate, + const size_t n, + const TypeValue **sparse, + TypeValue **dense, + int dims) { - std::memcpy(dest, src, nBytes); + return -1; } +template PERFFASPECT() -inline void HtoDMemcpy(void *dest, void *src, size_t nBytes) +inline int pack(bool cond, + const bool *predicate, + const size_t n, + TypeValue **sparse, + TypeValue **dense, + int *sparse_indices, + int dims) { - std::cerr << "HtoD Memcpy Not Enabled" << std::endl; - exit(-1); -}; + return -1; +} +template PERFFASPECT() -inline void DtoHMemcpy(void *dest, void *src, size_t nBytes) +inline int unpack(bool cond, + const bool *predicate, + const size_t n, + TypeValue **sparse, + TypeValue **dense, + int dims) +{ + return -1; +} + +template +PERFFASPECT() +inline int unpack(bool cond, + const size_t n, + TypeValue **sparse, + TypeValue **dense, + int *sparse_indices, + int dims) { - std::cerr << "DtoH Memcpy Not Enabled" << std::endl; - exit(-1); + return -1; } + +} // namespace Device +} // namespace ams + #endif + #endif diff --git a/src/AMSlib/wf/resource_manager.cpp b/src/AMSlib/wf/resource_manager.cpp index d941c745..8c08d287 100644 --- a/src/AMSlib/wf/resource_manager.cpp +++ b/src/AMSlib/wf/resource_manager.cpp @@ -5,48 +5,107 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception */ -#include "resource_manager.hpp" - +#include +#include #include #include #include #include "debug.h" +#include "device.hpp" +#include "resource_manager.hpp" namespace ams { -std::string AMSAllocator::getName() { return allocator.getName(); } +std::string AMSAllocator::getName() { return name; } -void *AMSAllocator::allocate(size_t num_bytes) -{ - void *ptr = allocator.allocate(num_bytes); - CFATAL(ResourceManager, - ptr == nullptr, - "Failed to allocated %ld values using allocator %s", - num_bytes, - getName().c_str()); - return ptr; -} -void AMSAllocator::deallocate(void *ptr) { allocator.deallocate(ptr); } +struct AMSDefaultDeviceAllocator final : AMSAllocator { + AMSDefaultDeviceAllocator(std::string name) : AMSAllocator(name){}; + ~AMSDefaultDeviceAllocator() = default; + + void *allocate(size_t num_bytes) { return DeviceAllocate(num_bytes); } + + void deallocate(void *ptr) { return DeviceFree(ptr); } +}; + +struct AMSDefaultHostAllocator final : AMSAllocator { + AMSDefaultHostAllocator(std::string name) : AMSAllocator(name) {} + ~AMSDefaultHostAllocator() = default; -void AMSAllocator::registerPtr(void *ptr, size_t nBytes) + void *allocate(size_t num_bytes) { return aligned_alloc(8, num_bytes); } + + void deallocate(void *ptr) { free(ptr); } +}; + +struct AMSDefaultPinnedAllocator final : AMSAllocator { + AMSDefaultPinnedAllocator(std::string name) : AMSAllocator(name) {} + ~AMSDefaultPinnedAllocator() = default; + + void *allocate(size_t num_bytes) { return DevicePinnedAlloc(num_bytes); } + + void deallocate(void *ptr) { DeviceFreePinned(ptr); } +}; + + +namespace internal { - auto &rm = umpire::ResourceManager::getInstance(); - rm.registerAllocation(ptr, - umpire::util::AllocationRecord( - ptr, nBytes, allocator.getAllocationStrategy())); +void _raw_copy(void *src, + AMSResourceType src_dev, + void *dest, + AMSResourceType dest_dev, + size_t num_bytes) +{ + if (src_dev == AMSResourceType::AMS_HOST) { + if (dest_dev == AMSResourceType::AMS_HOST) { + std::memcpy(dest, src, num_bytes); + } else if (dest_dev == AMSResourceType::AMS_DEVICE) { + HtoDMemcpy(dest, src, num_bytes); + } else if (dest_dev == AMSResourceType::AMS_PINNED) { + std::memcpy(dest, src, num_bytes); + } else { + FATAL(AMSResource, "Unknown copy dest") + } + } else if (src_dev == AMSResourceType::AMS_DEVICE) { + if (dest_dev == AMSResourceType::AMS_HOST) { + DtoHMemcpy(dest, src, num_bytes); + } else if (dest_dev == AMSResourceType::AMS_DEVICE) { + DtoDMemcpy(dest, src, num_bytes); + } else if (dest_dev == AMSResourceType::AMS_PINNED) { + DtoHMemcpy(dest, src, num_bytes); + } else { + FATAL(AMSResource, "Unknown copy dest") + } + } else if (src_dev == AMSResourceType::AMS_PINNED) { + if (dest_dev == AMSResourceType::AMS_HOST) { + std::memcpy(dest, src, num_bytes); + } else if (dest_dev == AMSResourceType::AMS_DEVICE) { + HtoDMemcpy(dest, src, num_bytes); + } else if (dest_dev == AMSResourceType::AMS_PINNED) { + std::memcpy(dest, src, num_bytes); + } else { + FATAL(AMSResource, "Unknown copy dest") + } + } } -void AMSAllocator::getAllocatorStats(size_t &wm, size_t &cs, size_t &as) +AMSAllocator *_get_allocator(std::string alloc_name, AMSResourceType resource) { - wm = allocator.getHighWatermark(); - cs = allocator.getCurrentSize(); - as = allocator.getActualSize(); + if (resource == AMSResourceType::AMS_DEVICE) { + return new AMSDefaultDeviceAllocator(alloc_name); + } else if (resource == AMSResourceType::AMS_HOST) { + return new AMSDefaultHostAllocator(alloc_name); + } else if (resource == AMSResourceType::AMS_PINNED) { + return new AMSDefaultPinnedAllocator(alloc_name); + } else { + FATAL(ResourceManager, + "Requested allocator %s for Unknown resource type", + alloc_name.c_str()); + } } -// ----------------------------------------------------------------------------- -// set up the resource manager -// ----------------------------------------------------------------------------- +void _release_allocator(AMSAllocator *allocator) { delete allocator; } + +} // namespace internal } // namespace ams diff --git a/src/AMSlib/wf/resource_manager.hpp b/src/AMSlib/wf/resource_manager.hpp index 407bdff6..88d44447 100644 --- a/src/AMSlib/wf/resource_manager.hpp +++ b/src/AMSlib/wf/resource_manager.hpp @@ -9,9 +9,7 @@ #define __AMS_ALLOCATOR__ #include -#include -#include -#include +#include #include "AMS.h" #include "wf/debug.h" @@ -19,44 +17,39 @@ namespace ams { +// Forward decl. +struct AMSAllocator; +namespace internal +{ +void _raw_copy(void* src, + AMSResourceType src_dev, + void* dest, + AMSResourceType dest_dev, + size_t num_bytes); + +AMSAllocator* _get_allocator(std::string alloc_name, AMSResourceType resource); +void _release_allocator(AMSAllocator* allocator); + +} // namespace internal /** * @brief A "utility" class that provides * a unified interface to the umpire library for memory allocations * and data movements/copies. */ -struct AMSAllocator { - int id; - umpire::Allocator allocator; - - AMSAllocator(std::string alloc_name) - { - auto& rm = umpire::ResourceManager::getInstance(); - allocator = rm.getAllocator(alloc_name); - DBG(AMSAllocator, - "in AMSAllocator(%d, %s, %p)", - id, - alloc_name.c_str(), - this) - } - ~AMSAllocator() { DBG(AMSAllocator, "in ~AMSAllocator(%d, %p)", id, this) } +struct AMSAllocator { + std::string name; + AMSAllocator(std::string alloc_name) : name(alloc_name) {} + virtual ~AMSAllocator() = default; - void* allocate(size_t num_bytes); - void deallocate(void* ptr); + virtual void* allocate(size_t num_bytes) = 0; + virtual void deallocate(void* ptr) = 0; std::string getName(); - - void registerPtr(void* ptr, size_t nBytes); - static void deregisterPtr(void* ptr) - { - auto& rm = umpire::ResourceManager::getInstance(); - rm.deregisterAllocation(ptr); - } - - void getAllocatorStats(size_t& wm, size_t& cs, size_t& as); }; + class ResourceManager { private: @@ -116,37 +109,26 @@ class ResourceManager RMAllocators[dev]->deallocate(data); } - /** @brief registers an external pointer in the umpire allocation records. - * @param[in] ptr pointer to memory to register. - * @param[in] nBytes number of bytes to register. - * @param[in] dev resource to register the memory to. - * @return void. - */ - PERFFASPECT() - void registerExternal(void* ptr, size_t nBytes, AMSResourceType dev) - { - RMAllocators[dev]->registerPtr(ptr, nBytes); - } - - /** @brief removes a registered external pointer from the umpire allocation records. - * @param[in] ptr pointer to memory to de-register. - * @return void. - */ - void deregisterExternal(void* ptr) { AMSAllocator::deregisterPtr(ptr); } - /** @brief copy values from src to destination regardless of their memory location. * @tparam TypeInValue type of pointers * @param[in] src Source memory pointer. * @param[out] dest destination memory pointer. - * @param[in] size number of bytes to copy. (When 0 copies entire allocated area) + * @param[in] size number of values to copy. * @return void. */ template PERFFASPECT() - void copy(TypeInValue* src, TypeInValue* dest, size_t size = 0) + void copy(TypeInValue* src, + AMSResourceType src_dev, + TypeInValue* dest, + AMSResourceType dest_dev, + size_t nvalues) { - static auto& rm = umpire::ResourceManager::getInstance(); - rm.copy(dest, src, size); + ams::internal::_raw_copy(static_cast(src), + src_dev, + static_cast(dest), + dest_dev, + nvalues * sizeof(TypeInValue)); } /** @brief Utility function that deallocates all C-Vectors inside the vector. @@ -163,7 +145,7 @@ class ResourceManager void init() { - DBG(ResourceManager, "Default initialization of allocators"); + DBG(ResourceManager, "Initialization of allocators"); if (!RMAllocators[AMSResourceType::AMS_HOST]) setAllocator("HOST", AMSResourceType::AMS_HOST); #ifdef __ENABLE_CUDA__ @@ -181,7 +163,8 @@ class ResourceManager delete RMAllocators[resource]; } - RMAllocators[resource] = new AMSAllocator(alloc_name); + RMAllocators[resource] = + ams::internal::_get_allocator(alloc_name, resource); DBG(ResourceManager, "Set Allocator [%d] to pool with name : %s", resource, @@ -205,13 +188,13 @@ class ResourceManager size_t& cs, size_t& as) { - RMAllocators[resource]->getAllocatorStats(wm, cs, as); return; } //! ------------------------------------------------------------------------ }; + } // namespace ams #endif diff --git a/src/AMSlib/wf/workflow.hpp b/src/AMSlib/wf/workflow.hpp index c164e7d5..9807461c 100644 --- a/src/AMSlib/wf/workflow.hpp +++ b/src/AMSlib/wf/workflow.hpp @@ -140,12 +140,20 @@ class AMSWorkflow size_t actualElems = std::min(elPerDim, num_elements - i); // Copy input data to host for (int k = 0; k < numIn; k++) { - rm.copy(&inputs[k][i], hInputs[k], actualElems * sizeof(FPTypeValue)); + rm.copy(&inputs[k][i], + AMSResourceType::AMS_DEVICE, + hInputs[k], + AMSResourceType::AMS_HOST, + actualElems); } // Copy output data to host for (int k = 0; k < numIn; k++) { - rm.copy(&outputs[k][i], hOutputs[k], actualElems * sizeof(FPTypeValue)); + rm.copy(&outputs[k][i], + AMSResourceType::AMS_DEVICE, + hOutputs[k], + AMSResourceType::AMS_HOST, + actualElems); } // Store to database diff --git a/tests/AMSlib/ams_ete.cpp b/tests/AMSlib/ams_ete.cpp index bdf68bfa..729f3752 100644 --- a/tests/AMSlib/ams_ete.cpp +++ b/tests/AMSlib/ams_ete.cpp @@ -130,13 +130,6 @@ void callBackSingle(void *cls, long elements, void **inputs, void **outputs) int main(int argc, char **argv) { - // Number of ranks in this run - int wS = 1; - // My Local Id - int rId = 0; - // Level of Threading provided by MPI - int provided = 0; - if (argc != 12) { std::cout << "Wrong cli\n"; std::cout << argv[0] @@ -162,7 +155,6 @@ int main(int argc, char **argv) std::string db_type_str = std::string(argv[10]); std::string fs_path = std::string(argv[11]); AMSDBType db_type = ams::db::getDBType(db_type_str); - AMSDBType dbType = AMSDBType::AMS_NONE; AMSResourceType resource = AMSResourceType::AMS_HOST; srand(time(NULL)); diff --git a/tests/AMSlib/ams_ete_env.cpp b/tests/AMSlib/ams_ete_env.cpp index 291523e4..71560bb2 100644 --- a/tests/AMSlib/ams_ete_env.cpp +++ b/tests/AMSlib/ams_ete_env.cpp @@ -99,12 +99,14 @@ struct Problem { outputs.size()); for (int i = 0; i < num_outputs; i++) { - delete outputs[i]; + delete[] outputs[i]; + outputs[i] = nullptr; } for (int i = 0; i < num_inputs; i++) { - delete inputs[i]; + delete[] inputs[i]; + inputs[i] = nullptr; } } } @@ -130,12 +132,6 @@ void callBackSingle(void *cls, long elements, void **inputs, void **outputs) int main(int argc, char **argv) { - // Number of ranks in this run - int wS = 1; - // My Local Id - int rId = 0; - // Level of Threading provided by MPI - int provided = 0; if (argc != 9) { std::cout << "Wrong cli\n"; diff --git a/tests/AMSlib/ams_update_model.cpp b/tests/AMSlib/ams_update_model.cpp index 6906081e..a8bd5225 100644 --- a/tests/AMSlib/ams_update_model.cpp +++ b/tests/AMSlib/ams_update_model.cpp @@ -44,8 +44,16 @@ bool inference(SurrogateModel &model, if (resource == AMSResourceType::AMS_DEVICE) { first_model_out = ams_rm.allocate(SIZE, AMSResourceType::AMS_HOST); second_model_out = ams_rm.allocate(SIZE, AMSResourceType::AMS_HOST); - ams_rm.copy(outputs[i], first_model_out, SIZE * sizeof(T)); - ams_rm.copy(outputs[i + 4], second_model_out, SIZE * sizeof(T)); + ams_rm.copy(outputs[i], + resource, + first_model_out, + AMSResourceType::AMS_HOST, + SIZE); + ams_rm.copy(outputs[i + 4], + resource, + second_model_out, + AMSResourceType::AMS_HOST, + SIZE); } for (int j = 0; j < SIZE; j++) { diff --git a/tests/AMSlib/cpu_packing_test.cpp b/tests/AMSlib/cpu_packing_test.cpp index a52c5966..9cd780cf 100644 --- a/tests/AMSlib/cpu_packing_test.cpp +++ b/tests/AMSlib/cpu_packing_test.cpp @@ -108,8 +108,16 @@ int main(int argc, char* argv[]) double* rsparse = rm.allocate(SIZE, resource); int* reindex = rm.allocate(SIZE, resource); - rm.copy(h_predicate, predicate); - rm.copy(h_sparse, sparse); + rm.copy(h_predicate, + AMSResourceType::AMS_HOST, + predicate, + AMSResourceType::AMS_DEVICE, + SIZE); + rm.copy(h_sparse, + AMSResourceType::AMS_HOST, + sparse, + AMSResourceType::AMS_DEVICE, + SIZE); std::vector s_data({const_cast(sparse)}); std::vector sr_data({rsparse}); @@ -126,7 +134,11 @@ int main(int argc, char* argv[]) return 1; } - rm.copy(dense, h_dense); + rm.copy(dense, + AMSResourceType::AMS_DEVICE, + h_dense, + AMSResourceType::AMS_HOST, + elements); if (verify(h_dense, elements, flag)) { std::cout << "Dense elements do not have the correct values\n"; @@ -135,7 +147,11 @@ int main(int argc, char* argv[]) data_handler::unpack(resource, predicate, size, d_data, sr_data, flag); - rm.copy(rsparse, h_rsparse); + rm.copy(rsparse, + AMSResourceType::AMS_DEVICE, + h_rsparse, + AMSResourceType::AMS_HOST, + size); if (verify(h_predicate, h_sparse, h_rsparse, size, flag)) { // for ( int k = 0; k < SIZE; k++){ From 8cce8e87c0b0cbdcf092ed533aa59acee6cb573b Mon Sep 17 00:00:00 2001 From: koparasy Date: Thu, 6 Jun 2024 13:26:09 -0700 Subject: [PATCH 3/5] Fix test for hdcache --- tests/AMSlib/CMakeLists.txt | 5 +++-- tests/AMSlib/test_hdcache.cpp | 10 ++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/AMSlib/CMakeLists.txt b/tests/AMSlib/CMakeLists.txt index 7b9f8000..1c85e22a 100644 --- a/tests/AMSlib/CMakeLists.txt +++ b/tests/AMSlib/CMakeLists.txt @@ -155,8 +155,9 @@ function(ADDTEST exe test_name) endif() endfunction() -BUILD_TEST(ams_allocator_test ams_allocate.cpp) -ADDTEST(ams_allocator_test AMSAllocate) +# This test requires Allocate +#BUILD_TEST(ams_allocator_test ams_allocate.cpp) +#ADDTEST(ams_allocator_test AMSAllocate) BUILD_TEST(ams_packing_test cpu_packing_test.cpp AMSPack) ADDTEST(ams_packing_test AMSPack) diff --git a/tests/AMSlib/test_hdcache.cpp b/tests/AMSlib/test_hdcache.cpp index 865cc517..f87d5232 100644 --- a/tests/AMSlib/test_hdcache.cpp +++ b/tests/AMSlib/test_hdcache.cpp @@ -97,8 +97,10 @@ bool do_faiss(std::shared_ptr> &index, for (int i = 0; i < orig_data.size(); i++) { T *d_data = rm.allocate(nClusters * nElements, resource); rm.copy(const_cast(orig_data[i]), + AMSResourceType::AMS_HOST, d_data, - nClusters * nElements * sizeof(T)); + AMSResourceType::AMS_DEVICE, + nClusters * nElements); data[i] = d_data; } } @@ -111,7 +113,11 @@ bool do_faiss(std::shared_ptr> &index, if (resource == AMSResourceType::AMS_DEVICE) { h_predicates = rm.allocate(nClusters * nElements, AMSResourceType::AMS_HOST); - rm.copy(predicates, h_predicates, nClusters * nElements); + rm.copy(predicates, + AMSResourceType::AMS_DEVICE, + h_predicates, + AMSResourceType::AMS_HOST, + nClusters * nElements); for (auto d : data) { rm.deallocate(const_cast(d), AMSResourceType::AMS_DEVICE); } From 636747d2bb929c3d13335a78c1ce08dae235d4a3 Mon Sep 17 00:00:00 2001 From: koparasy Date: Thu, 6 Jun 2024 14:45:19 -0700 Subject: [PATCH 4/5] Multples of 8 for any alignment --- src/AMSlib/wf/resource_manager.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/AMSlib/wf/resource_manager.cpp b/src/AMSlib/wf/resource_manager.cpp index 8c08d287..a19deac7 100644 --- a/src/AMSlib/wf/resource_manager.cpp +++ b/src/AMSlib/wf/resource_manager.cpp @@ -18,6 +18,12 @@ namespace ams { +template +static T roundUp(T num_to_round, int multiple) +{ + return ((num_to_round + multiple- 1) / multiple) * multiple; +} + std::string AMSAllocator::getName() { return name; } @@ -34,7 +40,7 @@ struct AMSDefaultHostAllocator final : AMSAllocator { AMSDefaultHostAllocator(std::string name) : AMSAllocator(name) {} ~AMSDefaultHostAllocator() = default; - void *allocate(size_t num_bytes) { return aligned_alloc(8, num_bytes); } + void *allocate(size_t num_bytes) { return aligned_alloc(8, roundUp(num_bytes, 8)); } void deallocate(void *ptr) { free(ptr); } }; From e448ed564e6f7146477a99c5d03acdda88254f9a Mon Sep 17 00:00:00 2001 From: koparasy Date: Mon, 10 Jun 2024 09:05:59 -0700 Subject: [PATCH 5/5] Fix comments --- src/AMSlib/AMS.cpp | 6 +- src/AMSlib/wf/device.hpp | 16 +++-- src/AMSlib/wf/resource_manager.cpp | 102 +++++++++++++++-------------- src/AMSlib/wf/resource_manager.hpp | 24 +++---- tests/AMSlib/CMakeLists.txt | 1 + 5 files changed, 83 insertions(+), 66 deletions(-) diff --git a/src/AMSlib/AMS.cpp b/src/AMSlib/AMS.cpp index d78538d3..5c5d3066 100644 --- a/src/AMSlib/AMS.cpp +++ b/src/AMSlib/AMS.cpp @@ -389,8 +389,9 @@ class AMSWrap } else if (log_prefix.find("") != std::string::npos) { pattern = std::string(""); id = getpid(); - } // Combine hostname and pid + } + // Combine hostname and pid std::ostringstream combined; combined << "." << hostname << "." << id; @@ -665,7 +666,8 @@ const char *AMSGetAllocatorName(AMSResourceType device) void AMSSetAllocator(AMSResourceType resource, const char *alloc_name) { auto &rm = ams::ResourceManager::getInstance(); - rm.setAllocator(std::string(alloc_name), resource); + std::string alloc(alloc_name); + rm.setAllocator(alloc, resource); } AMSCAbstrModel AMSRegisterAbstractModel(const char *domain_name, diff --git a/src/AMSlib/wf/device.hpp b/src/AMSlib/wf/device.hpp index a7dd7bdf..d197bd21 100644 --- a/src/AMSlib/wf/device.hpp +++ b/src/AMSlib/wf/device.hpp @@ -16,6 +16,8 @@ #include "AMS.h" #include "wf/debug.h" +#define UNDEFINED_FUNC -1 + #ifdef __ENABLE_CUDA__ namespace ams { @@ -254,6 +256,7 @@ inline void computePredicate(float *data, const size_t kneigh, float threshold) { + FATAL(Device, "Called device code when CUDA disabled"); return; } @@ -265,6 +268,7 @@ inline void linearize(TypeOutValue *output, size_t dims, size_t elements) { + FATAL(Device, "Called device code when CUDA disabled"); return; } @@ -277,7 +281,8 @@ inline int pack(bool cond, TypeValue **dense, int dims) { - return -1; + FATAL(Device, "Called device code when CUDA disabled"); + return UNDEFINED_FUNC; } template @@ -290,7 +295,8 @@ inline int pack(bool cond, int *sparse_indices, int dims) { - return -1; + FATAL(Device, "Called device code when CUDA disabled"); + return UNDEFINED_FUNC; } template @@ -302,7 +308,8 @@ inline int unpack(bool cond, TypeValue **dense, int dims) { - return -1; + FATAL(Device, "Called device code when CUDA disabled"); + return UNDEFINED_FUNC; } template @@ -314,7 +321,8 @@ inline int unpack(bool cond, int *sparse_indices, int dims) { - return -1; + FATAL(Device, "Called device code when CUDA disabled"); + return UNDEFINED_FUNC; } } // namespace Device diff --git a/src/AMSlib/wf/resource_manager.cpp b/src/AMSlib/wf/resource_manager.cpp index a19deac7..8b9da265 100644 --- a/src/AMSlib/wf/resource_manager.cpp +++ b/src/AMSlib/wf/resource_manager.cpp @@ -7,9 +7,6 @@ #include #include -#include -#include -#include #include "debug.h" #include "device.hpp" @@ -18,13 +15,13 @@ namespace ams { -template -static T roundUp(T num_to_round, int multiple) +template +static T roundUp(T num_to_round, int multiple) { - return ((num_to_round + multiple- 1) / multiple) * multiple; + return ((num_to_round + multiple - 1) / multiple) * multiple; } -std::string AMSAllocator::getName() { return name; } +const std::string AMSAllocator::getName() const { return name; } struct AMSDefaultDeviceAllocator final : AMSAllocator { @@ -40,7 +37,10 @@ struct AMSDefaultHostAllocator final : AMSAllocator { AMSDefaultHostAllocator(std::string name) : AMSAllocator(name) {} ~AMSDefaultHostAllocator() = default; - void *allocate(size_t num_bytes) { return aligned_alloc(8, roundUp(num_bytes, 8)); } + void *allocate(size_t num_bytes) + { + return aligned_alloc(8, roundUp(num_bytes, 8)); + } void deallocate(void *ptr) { free(ptr); } }; @@ -63,51 +63,55 @@ void _raw_copy(void *src, AMSResourceType dest_dev, size_t num_bytes) { - if (src_dev == AMSResourceType::AMS_HOST) { - if (dest_dev == AMSResourceType::AMS_HOST) { - std::memcpy(dest, src, num_bytes); - } else if (dest_dev == AMSResourceType::AMS_DEVICE) { - HtoDMemcpy(dest, src, num_bytes); - } else if (dest_dev == AMSResourceType::AMS_PINNED) { - std::memcpy(dest, src, num_bytes); - } else { - FATAL(AMSResource, "Unknown copy dest") - } - } else if (src_dev == AMSResourceType::AMS_DEVICE) { - if (dest_dev == AMSResourceType::AMS_HOST) { - DtoHMemcpy(dest, src, num_bytes); - } else if (dest_dev == AMSResourceType::AMS_DEVICE) { - DtoDMemcpy(dest, src, num_bytes); - } else if (dest_dev == AMSResourceType::AMS_PINNED) { - DtoHMemcpy(dest, src, num_bytes); - } else { - FATAL(AMSResource, "Unknown copy dest") - } - } else if (src_dev == AMSResourceType::AMS_PINNED) { - if (dest_dev == AMSResourceType::AMS_HOST) { - std::memcpy(dest, src, num_bytes); - } else if (dest_dev == AMSResourceType::AMS_DEVICE) { - HtoDMemcpy(dest, src, num_bytes); - } else if (dest_dev == AMSResourceType::AMS_PINNED) { - std::memcpy(dest, src, num_bytes); - } else { - FATAL(AMSResource, "Unknown copy dest") - } + switch (src_dev) { + case AMSResourceType::AMS_HOST: + case AMSResourceType::AMS_PINNED: + switch (dest_dev) { + case AMSResourceType::AMS_HOST: + case AMSResourceType::AMS_PINNED: + std::memcpy(dest, src, num_bytes); + break; + case AMSResourceType::AMS_DEVICE: + HtoDMemcpy(dest, src, num_bytes); + break; + default: + FATAL(ResourceManager, "Unknown device type to copy to from HOST"); + break; + } + break; + case AMSResourceType::AMS_DEVICE: + switch (dest_dev) { + case AMSResourceType::AMS_DEVICE: + DtoDMemcpy(dest, src, num_bytes); + break; + case AMSResourceType::AMS_HOST: + case AMSResourceType::AMS_PINNED: + DtoHMemcpy(dest, src, num_bytes); + break; + default: + FATAL(ResourceManager, "Unknown device type to copy to from DEVICE"); + break; + } + default: + FATAL(ResourceManager, "Unknown device type to copy from"); } } -AMSAllocator *_get_allocator(std::string alloc_name, AMSResourceType resource) +AMSAllocator *_get_allocator(std::string &alloc_name, AMSResourceType resource) { - if (resource == AMSResourceType::AMS_DEVICE) { - return new AMSDefaultDeviceAllocator(alloc_name); - } else if (resource == AMSResourceType::AMS_HOST) { - return new AMSDefaultHostAllocator(alloc_name); - } else if (resource == AMSResourceType::AMS_PINNED) { - return new AMSDefaultPinnedAllocator(alloc_name); - } else { - FATAL(ResourceManager, - "Requested allocator %s for Unknown resource type", - alloc_name.c_str()); + switch (resource) { + case AMSResourceType::AMS_DEVICE: + return new AMSDefaultDeviceAllocator(alloc_name); + break; + case AMSResourceType::AMS_HOST: + return new AMSDefaultHostAllocator(alloc_name); + break; + case AMSResourceType::AMS_PINNED: + return new AMSDefaultPinnedAllocator(alloc_name); + break; + default: + FATAL(ResourceManager, + "Unknown resource type to create an allocator for"); } } diff --git a/src/AMSlib/wf/resource_manager.hpp b/src/AMSlib/wf/resource_manager.hpp index 88d44447..b86fdb6b 100644 --- a/src/AMSlib/wf/resource_manager.hpp +++ b/src/AMSlib/wf/resource_manager.hpp @@ -9,6 +9,7 @@ #define __AMS_ALLOCATOR__ #include +#include #include #include "AMS.h" @@ -27,7 +28,7 @@ void _raw_copy(void* src, AMSResourceType dest_dev, size_t num_bytes); -AMSAllocator* _get_allocator(std::string alloc_name, AMSResourceType resource); +AMSAllocator* _get_allocator(std::string& alloc_name, AMSResourceType resource); void _release_allocator(AMSAllocator* allocator); } // namespace internal @@ -40,16 +41,15 @@ void _release_allocator(AMSAllocator* allocator); struct AMSAllocator { std::string name; - AMSAllocator(std::string alloc_name) : name(alloc_name) {} + AMSAllocator(std::string& alloc_name) : name(alloc_name) {} virtual ~AMSAllocator() = default; virtual void* allocate(size_t num_bytes) = 0; virtual void deallocate(void* ptr) = 0; - std::string getName(); + const std::string getName() const; }; - class ResourceManager { private: @@ -77,7 +77,7 @@ class ResourceManager } /** @brief return the name of an allocator */ - std::string getAllocatorName(AMSResourceType resource) + const std::string getAllocatorName(AMSResourceType resource) const { return RMAllocators[resource]->getName(); } @@ -113,7 +113,7 @@ class ResourceManager * @tparam TypeInValue type of pointers * @param[in] src Source memory pointer. * @param[out] dest destination memory pointer. - * @param[in] size number of values to copy. + * @param[in] size number of values to copy (It performs a shallow copy of nested pointers). * @return void. */ template @@ -146,18 +146,21 @@ class ResourceManager void init() { DBG(ResourceManager, "Initialization of allocators"); + std::string host_alloc("HOST"); + std::string device_alloc("DEVICE"); + std::string pinned_alloc("PINNED"); if (!RMAllocators[AMSResourceType::AMS_HOST]) - setAllocator("HOST", AMSResourceType::AMS_HOST); + setAllocator(host_alloc, AMSResourceType::AMS_HOST); #ifdef __ENABLE_CUDA__ if (!RMAllocators[AMSResourceType::AMS_DEVICE]) - setAllocator("DEVICE", AMSResourceType::AMS_DEVICE); + setAllocator(host_alloc, AMSResourceType::AMS_DEVICE); if (!RMAllocators[AMSResourceType::AMS_PINNED]) - setAllocator("PINNED", AMSResourceType::AMS_PINNED); + setAllocator(pinned_alloc, AMSResourceType::AMS_PINNED); #endif } - void setAllocator(std::string alloc_name, AMSResourceType resource) + void setAllocator(std::string& alloc_name, AMSResourceType resource) { if (RMAllocators[resource]) { delete RMAllocators[resource]; @@ -194,7 +197,6 @@ class ResourceManager //! ------------------------------------------------------------------------ }; - } // namespace ams #endif diff --git a/tests/AMSlib/CMakeLists.txt b/tests/AMSlib/CMakeLists.txt index 1c85e22a..5571a922 100644 --- a/tests/AMSlib/CMakeLists.txt +++ b/tests/AMSlib/CMakeLists.txt @@ -156,6 +156,7 @@ function(ADDTEST exe test_name) endfunction() # This test requires Allocate +# TODO: Include tests once we re-instate a pool #BUILD_TEST(ams_allocator_test ams_allocate.cpp) #ADDTEST(ams_allocator_test AMSAllocate) BUILD_TEST(ams_packing_test cpu_packing_test.cpp AMSPack)