Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move umpire and device CUDA code into a single source file #71

Merged
merged 5 commits into from
Jun 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/AMSlib/AMS.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,7 @@ class AMSWrap
pattern = std::string("<PID>");
id = getpid();
}

// Combine hostname and pid
std::ostringstream combined;
combined << "." << hostname << "." << id;
Expand Down Expand Up @@ -665,7 +666,8 @@ const char *AMSGetAllocatorName(AMSResourceType device)
void AMSSetAllocator(AMSResourceType resource, const char *alloc_name)
{
auto &rm = ams::ResourceManager::getInstance();
rm.setAllocator(std::string(alloc_name), resource);
std::string alloc(alloc_name);
rm.setAllocator(alloc, resource);
}

AMSCAbstrModel AMSRegisterAbstractModel(const char *domain_name,
Expand Down
40 changes: 24 additions & 16 deletions src/AMSlib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,35 @@ file(GLOB_RECURSE MINIAPP_INCLUDES "*.hpp")
#set global library path to link with tests if necessary
set(LIBRARY_OUTPUT_PATH ${AMS_LIB_OUT_PATH})
set(AMS_LIB_SRC ${MINIAPP_INCLUDES} AMS.cpp wf/resource_manager.cpp wf/debug.cpp wf/basedb.cpp wf/logger.cpp)

if (WITH_CUDA)
list(APPEND AMS_LIB_SRC wf/cuda/utilities.cpp)
message(WARNING "FILES ARE ${AMS_LIB_SRC}")
endif()


# two targets: a shared lib and an exec
add_library(AMS ${AMS_LIB_SRC} ${MINIAPP_INCLUDES})

# ------------------------------------------------------------------------------
if (WITH_CUDA)
set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})

# if (BUILD_SHARED_LIBS)
# set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# else()
# set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
# endif()

set_source_files_properties(AMS.cpp PROPERTIES LANGUAGE CUDA)
set_source_files_properties(AMS.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
set_source_files_properties(AMS.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda")

if (WITH_PERFFLOWASPECT)
set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")

set_target_properties(AMS PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})

# if (BUILD_SHARED_LIBS)
# set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# else()
# set_target_properties(AMS PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# set_target_properties(AMS PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
# endif()

set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES LANGUAGE CUDA)
set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES CUDA_ARCHITECTURES ${AMS_CUDA_ARCH})
set_source_files_properties(wf/cuda/utilities.cpp PROPERTIES COMPILE_FLAGS "--expt-extended-lambda")

if (WITH_PERFFLOWASPECT)
set_property(SOURCE AMS.cpp APPEND_STRING PROPERTY COMPILE_FLAGS " -Xcompiler=-Xclang -Xcompiler=-load -Xcompiler=-Xclang -Xcompiler=${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
set_source_files_properties(wf/resource_manager.cpp COMPILE_FLAGS "-Xclang -load -Xclang ${PERFFLOWASPECT_LIB_DIR}/libWeavePass.so")
endif()
endif()

Expand Down
5 changes: 4 additions & 1 deletion src/AMSlib/ml/hdcache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <stdexcept>
#include <string>
#include <type_traits>
#include <unordered_map>
koparasy marked this conversation as resolved.
Show resolved Hide resolved
#include <vector>

#ifdef __ENABLE_FAISS__
Expand All @@ -29,6 +30,8 @@
#include <faiss/gpu/GpuCloner.h>
#include <faiss/gpu/GpuIndexIVFPQ.h>
#include <faiss/gpu/StandardGpuResources.h>

#include "wf/device.hpp"
#endif
#endif

Expand Down Expand Up @@ -366,7 +369,7 @@ class HDCache
_evaluate(ndata, data, is_acceptable);

if (cache_location == AMSResourceType::AMS_DEVICE) {
deviceCheckErrors(__FILE__, __LINE__);
ams::deviceCheckErrors(__FILE__, __LINE__);
}

DBG(UQModule, "Done with evalution of uq")
Expand Down
21 changes: 2 additions & 19 deletions src/AMSlib/ml/random_uq.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,17 @@

#include "AMS.h"
#include "wf/debug.h"
#include "wf/device.hpp"
#include "wf/utils.hpp"

static size_t round_up(size_t num, size_t denom)
{
return (num + denom - 1) / denom;
}

class RandomUQ
{
public:
PERFFASPECT()
inline void evaluate(const size_t ndata, bool *is_acceptable)
{
if (resourceLocation == AMSResourceType::AMS_DEVICE) {
#ifdef __ENABLE_CUDA__
//TODO: Move all of this code on device.cpp and provide better logic regarding
// number of threads
size_t threads = 256;
size_t blocks = round_up(ndata, threads);
random_uq_device<<<blocks, threads>>>(seed,
is_acceptable,
ndata,
threshold);
seed = seed + 1;
#else
THROW(std::runtime_error,
"Random-uq is not configured to use device allocations");
#endif
ams::device_random_uq(seed, is_acceptable, ndata, threshold);
} else {
random_uq_host(is_acceptable, ndata, threshold);
}
Expand Down
61 changes: 11 additions & 50 deletions src/AMSlib/ml/surrogate.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include <unordered_map>

#include "AMS.h"
#include "wf/cuda/utilities.cuh"
#include "wf/device.hpp"

#ifdef __ENABLE_TORCH__
#include <ATen/core/interned_strings.h>
Expand Down Expand Up @@ -100,43 +100,12 @@ class SurrogateModel
{
// Transpose to get continuous memory and
// perform single memcpy.
auto& rm = ams::ResourceManager::getInstance();
tensor = tensor.transpose(1, 0);
if (model_resource == AMSResourceType::AMS_HOST) {
for (long j = 0; j < numCols; j++) {
auto tmp = tensor[j].contiguous();
TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
HtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
}
} else {
for (long j = 0; j < numCols; j++) {
auto tmp = tensor[j].contiguous();
TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
DtoDMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
}
}
}

PERFFASPECT()
inline void tensorToHostArray(at::Tensor tensor,
long numRows,
long numCols,
TypeInValue** array)
{
// Transpose to get continuous memory and
// perform single memcpy.
tensor = tensor.transpose(1, 0);
if (model_resource == AMSResourceType::AMS_HOST) {
for (long j = 0; j < numCols; j++) {
auto tmp = tensor[j].contiguous();
TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
HtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
}
} else {
for (long j = 0; j < numCols; j++) {
auto tmp = tensor[j].contiguous();
TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
DtoHMemcpy(array[j], ptr, sizeof(TypeInValue) * numRows);
}
for (long j = 0; j < numCols; j++) {
auto tmp = tensor[j].contiguous();
TypeInValue* ptr = tmp.data_ptr<TypeInValue>();
rm.copy(ptr, model_resource, array[j], model_resource, numRows);
}
}

Expand Down Expand Up @@ -216,13 +185,9 @@ class SurrogateModel
if (model_resource == AMSResourceType::AMS_DEVICE) {
#ifdef __ENABLE_CUDA__
DBG(Surrogate, "Compute mean delta uq predicates on device\n");
constexpr int block_size = 256;
int grid_size = divup(nrows, block_size);
computeDeltaUQMeanPredicatesKernel<<<grid_size, block_size>>>(
outputs_stdev, predicates, nrows, ncols, threshold);
// TODO: use combined routine when it lands.
cudaDeviceSynchronize();
CUDACHECKERROR();
ams::Device::computeDeltaUQMeanPredicatesDevice(
outputs_stdev, predicates, nrows, ncols, threshold);
#else
THROW(std::runtime_error,
"Expected CUDA is enabled when model data are on DEVICE");
Expand All @@ -235,13 +200,9 @@ class SurrogateModel
if (model_resource == AMSResourceType::AMS_DEVICE) {
#ifdef __ENABLE_CUDA__
DBG(Surrogate, "Compute max delta uq predicates on device\n");
constexpr int block_size = 256;
int grid_size = divup(nrows, block_size);
computeDeltaUQMaxPredicatesKernel<<<grid_size, block_size>>>(
outputs_stdev, predicates, nrows, ncols, threshold);
// TODO: use combined routine when it lands.
cudaDeviceSynchronize();
CUDACHECKERROR();
ams::Device::computeDeltaUQMaxPredicatesDevice(
outputs_stdev, predicates, nrows, ncols, threshold);
#else
THROW(std::runtime_error,
"Expected CUDA is enabled when model data are on DEVICE");
Expand Down Expand Up @@ -306,7 +267,7 @@ class SurrogateModel
}

if (is_device()) {
deviceCheckErrors(__FILE__, __LINE__);
ams::deviceCheckErrors(__FILE__, __LINE__);
}

DBG(Surrogate,
Expand Down
20 changes: 3 additions & 17 deletions src/AMSlib/wf/basedb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1962,14 +1962,8 @@ class RMQPublisherHandler final : public AMQP::LibEventHandler
msg_id)
auto& msg = *it;
auto& rm = ams::ResourceManager::getInstance();
try {
rm.deallocate(msg.data(), AMSResourceType::AMS_HOST);
} catch (const umpire::util::Exception& e) {
FATAL(RMQPublisherHandler,
"Failed to deallocate #%d (%p)",
msg.id(),
msg.data());
}
rm.deallocate(msg.data(), AMSResourceType::AMS_HOST);

DBG(RMQPublisherHandler, "Deallocated msg #%d (%p)", msg.id(), msg.data())
buf.erase(it);
}
Expand All @@ -1984,14 +1978,7 @@ class RMQPublisherHandler final : public AMQP::LibEventHandler
auto& rm = ams::ResourceManager::getInstance();
for (auto& dp : buffer) {
DBG(RMQPublisherHandler, "deallocate msg #%d (%p)", dp.id(), dp.data())
try {
rm.deallocate(dp.data(), AMSResourceType::AMS_HOST);
} catch (const umpire::util::Exception& e) {
FATAL(RMQPublisherHandler,
"Failed to deallocate msg #%d (%p)",
dp.id(),
dp.data());
}
rm.deallocate(dp.data(), AMSResourceType::AMS_HOST);
}
buffer.clear();
}
Expand Down Expand Up @@ -2377,7 +2364,6 @@ class RMQInterface
}
};


/* A class that provides a BaseDB interface to AMS workflow.
* When storing data it pushes the data to the RMQ server asynchronously
*/
Expand Down
Loading
Loading