Skip to content

Commit

Permalink
Less cudaGet/SetDevice calls in Gluon execution (apache#13764)
Browse files Browse the repository at this point in the history
* Remove unnecessary cudaGetDevice/cudaSetDevice calls

* Fixes for the DeviceGuard

* Retrigger CI

* Fix for a possible invalid device ordinal when using DeviceStore while
the driver is unloading

* Fix for RTC when the driver API call is the first call

* Added DeviceStore to pooled engine
  • Loading branch information
ptrendx authored and perdasilva committed Mar 13, 2019
1 parent a03d59e commit d9e3d9f
Show file tree
Hide file tree
Showing 11 changed files with 59 additions and 80 deletions.
19 changes: 16 additions & 3 deletions src/common/cuda_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,22 +286,35 @@ inline DType __device__ CudaMin(DType a, DType b) {
class DeviceStore {
public:
/*! \brief default constructor- only optionally restores previous device */
explicit DeviceStore(bool restore = true) : restore_(restore) {
explicit DeviceStore(int requested_device = -1, bool restore = true) :
restore_device_(-1),
current_device_(requested_device),
restore_(restore) {
if (restore_)
CUDA_CALL(cudaGetDevice(&restore_device_));
if (requested_device != restore_device_) {
SetDevice(requested_device);
}
}

~DeviceStore() {
if (restore_)
if (restore_ &&
current_device_ != restore_device_ &&
current_device_ != -1 &&
restore_device_ != -1)
CUDA_CALL(cudaSetDevice(restore_device_));
}

void SetDevice(int device) {
CUDA_CALL(cudaSetDevice(device));
if (device != -1) {
CUDA_CALL(cudaSetDevice(device));
current_device_ = device;
}
}

private:
int restore_device_;
int current_device_;
bool restore_;
};

Expand Down
10 changes: 2 additions & 8 deletions src/engine/stream_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,21 +65,18 @@ template <std::size_t kNumGpus, std::size_t kStreams>
RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
Context const& ctx) {
RunContext ret;
#if MXNET_USE_CUDA
mxnet::common::cuda::DeviceStore device_store;
#endif
switch (ctx.dev_mask()) {
case cpu::kDevMask:
ret = RunContext{ctx, nullptr};
break;
case gpu::kDevMask: {
#if MXNET_USE_CUDA
std::size_t use_counter;
device_store.SetDevice(ctx.dev_id);
{
std::lock_guard<std::mutex> lock{mutex_};
auto&& counter = gpu_cnt_.at(ctx.dev_id);
if (counter == -1) {
mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
for (auto&& i : gpu_streams_.at(ctx.dev_id)) {
i = mshadow::NewStream<gpu>(true, MXNET_USE_CUDNN != 0, ctx.dev_id);
}
Expand All @@ -104,19 +101,16 @@ template <std::size_t kNumGpus, std::size_t kStreams>
RunContext StreamManager<kNumGpus, kStreams>::GetIORunContext(
Context const& ctx) {
RunContext ret;
#if MXNET_USE_CUDA
mxnet::common::cuda::DeviceStore device_store;
#endif
switch (ctx.dev_mask()) {
case cpu::kDevMask:
ret = RunContext{ctx, nullptr};
break;
case gpu::kDevMask: {
#if MXNET_USE_CUDA
device_store.SetDevice(ctx.dev_id);
{
std::lock_guard<std::mutex> lock{mutex_};
if (gpu_io_streams_.at(ctx.dev_id) == nullptr) {
mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
gpu_io_streams_.at(ctx.dev_id) = mshadow::NewStream<gpu>(false, false, ctx.dev_id);
}
}
Expand Down
8 changes: 7 additions & 1 deletion src/engine/threaded_engine_pooled.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
#include "./threaded_engine.h"
#include "./thread_pool.h"
#include "./stream_manager.h"
#if MXNET_USE_CUDA
#include "../common/cuda_utils.h"
#endif

namespace mxnet {
namespace engine {
Expand Down Expand Up @@ -130,10 +133,13 @@ class ThreadedEnginePooled : public ThreadedEngine {
* \param opr_block The operator block.
*/
void DoExecute(OprBlock* opr_block) {
#if MXNET_USE_CUDA
mxnet::common::cuda::DeviceStore device_store(-1, false);
#endif
assert(opr_block->wait.load() == 0);
if (opr_block->ctx.dev_mask() == gpu::kDevMask) {
#if MXNET_USE_CUDA
CUDA_CALL(cudaSetDevice(opr_block->ctx.dev_id));
device_store.SetDevice(opr_block->ctx.dev_id);
#else // MXNET_USE_CUDA
LOG(FATAL) << "Please compile with CUDA enabled";
#endif // MXNET_USE_CUDA
Expand Down
5 changes: 2 additions & 3 deletions src/kvstore/comm.h
Original file line number Diff line number Diff line change
Expand Up @@ -724,10 +724,9 @@ class CommDevice : public Comm {
int enabled = 0;
std::vector<int> p2p(n*n);

// Restores active device to what it was before EnableP2P
mxnet::common::cuda::DeviceStore device_store;
for (int i = 0; i < n; ++i) {
device_store.SetDevice(gpus[i]);
// Restores active device to what it was before EnableP2P
mxnet::common::cuda::DeviceStore device_store(gpus[i]);
for (int j = 0; j < n; j++) {
int access;
cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
Expand Down
3 changes: 1 addition & 2 deletions src/kvstore/comm_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -339,9 +339,8 @@ class CommDeviceTree : public CommDevice {
int n = static_cast<int>(gpus.size());
int enabled = 0;
std::vector<int> p2p(n*n);
mxnet::common::cuda::DeviceStore device_store;
for (int i = 0; i < n; ++i) {
device_store.SetDevice(gpus[i]);
mxnet::common::cuda::DeviceStore device_store(gpus[i]);
for (int j = 0; j < n; j++) {
int access;
cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
Expand Down
10 changes: 6 additions & 4 deletions src/storage/cpu_device_storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ class CPUDeviceStorage {
* \param size Size to allocate.
* \return Pointer to the storage.
*/
inline static void* Alloc(size_t size);
inline static void* Alloc(Storage::Handle* handle);
/*!
* \brief Deallocation.
* \param ptr Pointer to deallocate.
*/
inline static void Free(void* ptr);
inline static void Free(Storage::Handle handle);

private:
/*!
Expand All @@ -63,7 +63,8 @@ class CPUDeviceStorage {
#endif
}; // class CPUDeviceStorage

inline void* CPUDeviceStorage::Alloc(size_t size) {
inline void* CPUDeviceStorage::Alloc(Storage::Handle* handle) {
const size_t size = handle->size;
void* ptr;
#if _MSC_VER
ptr = _aligned_malloc(size, alignment_);
Expand All @@ -75,7 +76,8 @@ inline void* CPUDeviceStorage::Alloc(size_t size) {
return ptr;
}

inline void CPUDeviceStorage::Free(void* ptr) {
inline void CPUDeviceStorage::Free(Storage::Handle handle) {
void * ptr = handle.dptr;
#if _MSC_VER
_aligned_free(ptr);
#else
Expand Down
12 changes: 8 additions & 4 deletions src/storage/gpu_device_storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,19 @@ class GPUDeviceStorage {
* \param size Size to allocate.
* \return Pointer to the storage.
*/
inline static void* Alloc(size_t size);
inline static void* Alloc(Storage::Handle* handle);
/*!
* \brief Deallocation.
* \param ptr Pointer to deallocate.
*/
inline static void Free(void* ptr);
inline static void Free(Storage::Handle handle);
}; // class GPUDeviceStorage

inline void* GPUDeviceStorage::Alloc(size_t size) {
inline void* GPUDeviceStorage::Alloc(Storage::Handle* handle) {
const size_t size = handle->size;
void* ret = nullptr;
#if MXNET_USE_CUDA
mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
#if MXNET_USE_NCCL
std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
#endif // MXNET_USE_NCCL
Expand All @@ -69,8 +71,10 @@ inline void* GPUDeviceStorage::Alloc(size_t size) {
return ret;
}

inline void GPUDeviceStorage::Free(void* ptr) {
inline void GPUDeviceStorage::Free(Storage::Handle handle) {
#if MXNET_USE_CUDA
void * ptr = handle.dptr;
mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
#if MXNET_USE_NCCL
std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
#endif // MXNET_USE_NCCL
Expand Down
6 changes: 3 additions & 3 deletions src/storage/naive_storage_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class NaiveStorageManager final : public StorageManager {
void Free(Storage::Handle handle) override;

void DirectFree(Storage::Handle handle) override {
DeviceStorage::Free(handle.dptr);
DeviceStorage::Free(handle);
}

private:
Expand All @@ -58,12 +58,12 @@ class NaiveStorageManager final : public StorageManager {

// Allocate handle->size bytes via the backing DeviceStorage and record the
// resulting pointer in the handle.  The whole handle is passed down so the
// backend can read both the size and the target context/device.
template <class DeviceStorage>
void NaiveStorageManager<DeviceStorage>::Alloc(Storage::Handle* handle) {
  handle->dptr = DeviceStorage::Alloc(handle);
}

// Release the handle's memory via the backing DeviceStorage.  Naive manager:
// no pooling, so Free is identical to DirectFree.
template <class DeviceStorage>
void NaiveStorageManager<DeviceStorage>::Free(Storage::Handle handle) {
  DeviceStorage::Free(handle);
}

} // namespace storage
Expand Down
12 changes: 8 additions & 4 deletions src/storage/pinned_memory_storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,29 +41,33 @@ class PinnedMemoryStorage {
* \param size Size to allocate.
* \return Pointer to the storage.
*/
inline static void* Alloc(size_t size);
inline static void* Alloc(Storage::Handle* handle);

/*!
* \brief Deallocation.
* \param ptr Pointer to deallocate.
*/
inline static void Free(void* ptr);
inline static void Free(Storage::Handle handle);
};

inline void* PinnedMemoryStorage::Alloc(size_t size) {
inline void* PinnedMemoryStorage::Alloc(Storage::Handle* handle) {
void* ret = nullptr;
const size_t size = handle->size;
#if MXNET_USE_NCCL
std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
#endif
mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
// make the memory available across all devices
CUDA_CALL(cudaHostAlloc(&ret, size, cudaHostAllocPortable));
return ret;
}

inline void PinnedMemoryStorage::Free(void* ptr) {
inline void PinnedMemoryStorage::Free(Storage::Handle handle) {
void * ptr = handle.dptr;
#if MXNET_USE_NCCL
std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
#endif
mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
cudaError_t err = cudaFreeHost(ptr);
// ignore unloading error, as memory has already been recycled
if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
Expand Down
4 changes: 4 additions & 0 deletions src/storage/pooled_storage_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ class GPUPooledStorageManager final : public StorageManager {

private:
void DirectFreeNoLock(Storage::Handle handle) {
mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
cudaError_t err = cudaFree(handle.dptr);
size_t size = RoundAllocSize(handle.size);
// ignore unloading error, as memory has already been recycled
Expand Down Expand Up @@ -132,6 +133,7 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
size_t size = RoundAllocSize(handle->size);
auto&& reuse_it = memory_pool_.find(size);
if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
size_t free, total;
cudaMemGetInfo(&free, &total);
if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
Expand Down Expand Up @@ -252,6 +254,7 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
}

void DirectFreeNoLock(Storage::Handle handle) {
mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
cudaError_t err = cudaFree(handle.dptr);
size_t size = get_size(get_bucket(handle.size));
// ignore unloading error, as memory has already been recycled
Expand Down Expand Up @@ -288,6 +291,7 @@ void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
size_t size = get_size(bucket);
auto&& reuse_pool = memory_pool_[bucket];
if (reuse_pool.size() == 0) {
mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
size_t free, total;
cudaMemGetInfo(&free, &total);
if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
Expand Down
50 changes: 2 additions & 48 deletions src/storage/storage.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,35 +48,6 @@ class StorageImpl : public Storage {
static int num_gpu_device;
#endif // MXNET_USE_CUDA

// Make the CUDA device implied by ctx the active one, when the context has
// an associated GPU ordinal.  CPU and CPU-shared contexts need no device
// switch; any unknown device type is fatal.
static void ActivateDevice(Context ctx) {
switch (ctx.dev_type) {
case Context::kCPU:
// Plain CPU memory: nothing to activate.
break;
case Context::kCPUPinned:
#if MXNET_USE_CUDA
// Pinned host memory is registered against a GPU; switch to it only
// when at least one GPU is actually present.
if (num_gpu_device > 0) {
CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
}
#endif // MXNET_USE_CUDA
break;
case Context::kCPUShared: {
#if defined(ANDROID) || defined(__ANDROID__)
// Shared CPU memory is not supported on Android.
LOG(FATAL) << "Unimplemented device";
#endif // defined(ANDROID) || defined(__ANDROID__)
}
break;
case Context::kGPU: {
#if MXNET_USE_CUDA
if (num_gpu_device > 0) {
CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
}
#endif // MXNET_USE_CUDA
break;
}
default:
LOG(FATAL) << "Unimplemented device";
}
}
// internal storage managers
std::array<common::LazyAllocArray<storage::StorageManager>,
kMaxNumberOfDevices> storage_managers_;
Expand All @@ -100,6 +71,8 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
case Context::kCPUShared: {
#if !defined(ANDROID) && !defined(__ANDROID__)
ptr = new storage::CPUSharedStorageManager();
#else
LOG(FATAL) << "Unimplemented device";
#endif // !defined(ANDROID) && !defined(__ANDROID__)
break;
}
Expand Down Expand Up @@ -149,13 +122,6 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
return ptr;
});

#if MXNET_USE_CUDA
// Will restore gpu device to before ActivateDevice if necessary
bool restore = handle->ctx.dev_type == Context::kCPUPinned ||
handle->ctx.dev_type == Context::kGPU;
mxnet::common::cuda::DeviceStore device_store(restore);
#endif
this->ActivateDevice(handle->ctx);
manager->Alloc(handle);
profiler_.OnAlloc(*handle);
}
Expand All @@ -169,12 +135,6 @@ void StorageImpl::Free(Storage::Handle handle) {
return nullptr;
});

#if MXNET_USE_CUDA
// Will restore gpu device to before ActivateDevice if necessary
bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
mxnet::common::cuda::DeviceStore device_store(restore);
#endif
this->ActivateDevice(ctx);
manager->Free(handle);
profiler_.OnFree(handle);
}
Expand All @@ -188,12 +148,6 @@ void StorageImpl::DirectFree(Storage::Handle handle) {
return nullptr;
});

#if MXNET_USE_CUDA
// Will restore gpu device to before ActivateDevice if necessary
bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
mxnet::common::cuda::DeviceStore device_store(restore);
#endif
this->ActivateDevice(ctx);
manager->DirectFree(handle);
profiler_.OnFree(handle);
}
Expand Down

0 comments on commit d9e3d9f

Please sign in to comment.