From d9e3d9f8479d1b88ab7b49d659d817484639d2ff Mon Sep 17 00:00:00 2001
From: Przemyslaw Tredak
Date: Fri, 4 Jan 2019 17:55:23 -0800
Subject: [PATCH] Less cudaGet/SetDevice calls in Gluon execution (#13764)

* Remove unnecessary cudaGetDevice/cudaSetDevice calls

* Fixes for the DeviceGuard

* Retrigger CI

* Fix for possible invalid device ordinal when using DeviceStore while driver is unloading

* Fix for RTC when the driver API call is the first call

* Added DeviceStore to pooled engine
---
 src/common/cuda_utils.h              | 19 +++++++++--
 src/engine/stream_manager.h          | 10 ++----
 src/engine/threaded_engine_pooled.cc |  8 ++++-
 src/kvstore/comm.h                   |  5 ++-
 src/kvstore/comm_tree.h              |  3 +-
 src/storage/cpu_device_storage.h     | 10 +++---
 src/storage/gpu_device_storage.h     | 12 ++++---
 src/storage/naive_storage_manager.h  |  6 ++--
 src/storage/pinned_memory_storage.h  | 12 ++++---
 src/storage/pooled_storage_manager.h |  4 +++
 src/storage/storage.cc               | 50 ++--------------------------
 11 files changed, 59 insertions(+), 80 deletions(-)

diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
index 047edde88a53..0dd9d2db3722 100644
--- a/src/common/cuda_utils.h
+++ b/src/common/cuda_utils.h
@@ -286,22 +286,35 @@ inline DType __device__ CudaMin(DType a, DType b) {
 class DeviceStore {
  public:
   /*! \brief default constructor - only optionally restores previous device */
-  explicit DeviceStore(bool restore = true) : restore_(restore) {
+  explicit DeviceStore(int requested_device = -1, bool restore = true) :
+      restore_device_(-1),
+      current_device_(requested_device),
+      restore_(restore) {
     if (restore_)
       CUDA_CALL(cudaGetDevice(&restore_device_));
+    if (requested_device != restore_device_) {
+      SetDevice(requested_device);
+    }
   }

   ~DeviceStore() {
-    if (restore_)
+    if (restore_ &&
+        current_device_ != restore_device_ &&
+        current_device_ != -1 &&
+        restore_device_ != -1)
       CUDA_CALL(cudaSetDevice(restore_device_));
   }

   void SetDevice(int device) {
-    CUDA_CALL(cudaSetDevice(device));
+    if (device != -1) {
+      CUDA_CALL(cudaSetDevice(device));
+      current_device_ = device;
+    }
   }

  private:
   int restore_device_;
+  int current_device_;
   bool restore_;
 };
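Note: the reworked DeviceStore above is a scope guard around cudaGetDevice/cudaSetDevice. A minimal usage sketch follows (hypothetical caller; EnqueueOnDevice is an invented name and not part of the patch):

    #include "../common/cuda_utils.h"

    void EnqueueOnDevice(int dev_id) {
      // Remembers the current device and switches to dev_id; the
      // constructor skips the cudaSetDevice call entirely when dev_id
      // is already the current device, or is -1.
      mxnet::common::cuda::DeviceStore device_store(dev_id);
      // ... launch kernels or allocate memory on dev_id here ...
    }  // the destructor restores the previous device, and only issues a
       // cudaSetDevice when both ordinals are valid and actually differ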
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index d4ac042ff401..516e04bf5e82 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -65,9 +65,6 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
     Context const& ctx) {
   RunContext ret;
-#if MXNET_USE_CUDA
-  mxnet::common::cuda::DeviceStore device_store;
-#endif
   switch (ctx.dev_mask()) {
     case cpu::kDevMask:
       ret = RunContext{ctx, nullptr};
@@ -75,11 +72,11 @@ RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
       std::size_t use_counter;
-      device_store.SetDevice(ctx.dev_id);
       {
         std::lock_guard<std::mutex> lock{mutex_};
         auto&& counter = gpu_cnt_.at(ctx.dev_id);
         if (counter == -1) {
+          mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
           for (auto&& i : gpu_streams_.at(ctx.dev_id)) {
             i = mshadow::NewStream<gpu>(true, MXNET_USE_CUDNN != 0, ctx.dev_id);
           }
@@ -104,19 +101,16 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetIORunContext(
     Context const& ctx) {
   RunContext ret;
-#if MXNET_USE_CUDA
-  mxnet::common::cuda::DeviceStore device_store;
-#endif
   switch (ctx.dev_mask()) {
     case cpu::kDevMask:
       ret = RunContext{ctx, nullptr};
       break;
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
-      device_store.SetDevice(ctx.dev_id);
       {
         std::lock_guard<std::mutex> lock{mutex_};
         if (gpu_io_streams_.at(ctx.dev_id) == nullptr) {
+          mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
           gpu_io_streams_.at(ctx.dev_id) =
             mshadow::NewStream<gpu>(false, false, ctx.dev_id);
         }
       }
diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc
index 1abb82fd6a67..c6eb99508e09 100644
--- a/src/engine/threaded_engine_pooled.cc
+++ b/src/engine/threaded_engine_pooled.cc
@@ -31,6 +31,9 @@
 #include "./threaded_engine.h"
 #include "./thread_pool.h"
 #include "./stream_manager.h"
+#if MXNET_USE_CUDA
+#include "../common/cuda_utils.h"
+#endif

 namespace mxnet {
 namespace engine {
@@ -130,10 +133,13 @@ class ThreadedEnginePooled : public ThreadedEngine {
    * \param opr_block The operator block.
    */
   void DoExecute(OprBlock* opr_block) {
+#if MXNET_USE_CUDA
+    mxnet::common::cuda::DeviceStore device_store(-1, false);
+#endif
     assert(opr_block->wait.load() == 0);
     if (opr_block->ctx.dev_mask() == gpu::kDevMask) {
 #if MXNET_USE_CUDA
-      CUDA_CALL(cudaSetDevice(opr_block->ctx.dev_id));
+      device_store.SetDevice(opr_block->ctx.dev_id);
 #else   // MXNET_USE_CUDA
       LOG(FATAL) << "Please compile with CUDA enabled";
 #endif  // MXNET_USE_CUDA
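Note: DoExecute above constructs the guard as DeviceStore(-1, false): no cudaGetDevice up front and no restore on destruction, so a worker thread that keeps executing operators on the same GPU pays for at most one device switch. A sketch contrasting the two modes used in this patch (illustrative only, not part of the patch):

    // Worker-thread mode, as in DoExecute: construct with (-1, false),
    // switch lazily, and leave the thread on the last device it used.
    mxnet::common::cuda::DeviceStore worker_store(-1, false);
    worker_store.SetDevice(0);  // the only cudaSetDevice call made here

    {
      // Scoped mode, as in GetRunContext: switch to GPU 1 for the
      // duration of the scope, then restore the caller's device.
      mxnet::common::cuda::DeviceStore scoped_store(1);
    }  // destructor restores the previous device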
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index 7090aaf46d8f..08f6155cb5b4 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -724,10 +724,9 @@ class CommDevice : public Comm {
     int enabled = 0;
     std::vector<int> p2p(n*n);

-    // Restores active device to what it was before EnableP2P
-    mxnet::common::cuda::DeviceStore device_store;
     for (int i = 0; i < n; ++i) {
-      device_store.SetDevice(gpus[i]);
+      // Restores active device to what it was before EnableP2P
+      mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
         int access;
         cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index e3b2ad7f57d3..b62228cd2885 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -339,9 +339,8 @@ class CommDeviceTree : public CommDevice {
     int n = static_cast<int>(gpus.size());
     int enabled = 0;
     std::vector<int> p2p(n*n);
-    mxnet::common::cuda::DeviceStore device_store;
     for (int i = 0; i < n; ++i) {
-      device_store.SetDevice(gpus[i]);
+      mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
         int access;
         cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index 43e98fe04a1f..25ad61efb232 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -43,12 +43,12 @@ class CPUDeviceStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);
   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);

  private:
   /*!
@@ -63,7 +63,8 @@ class CPUDeviceStorage {
 #endif
 };  // class CPUDeviceStorage

-inline void* CPUDeviceStorage::Alloc(size_t size) {
+inline void* CPUDeviceStorage::Alloc(Storage::Handle* handle) {
+  const size_t size = handle->size;
   void* ptr;
 #if _MSC_VER
   ptr = _aligned_malloc(size, alignment_);
@@ -75,7 +76,8 @@ inline void* CPUDeviceStorage::Alloc(size_t size) {
   return ptr;
 }

-inline void CPUDeviceStorage::Free(void* ptr) {
+inline void CPUDeviceStorage::Free(Storage::Handle handle) {
+  void * ptr = handle.dptr;
 #if _MSC_VER
   _aligned_free(ptr);
 #else
diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h
index 435c7e81d2ae..562badb8752e 100644
--- a/src/storage/gpu_device_storage.h
+++ b/src/storage/gpu_device_storage.h
@@ -46,17 +46,19 @@ class GPUDeviceStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);
   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);
 };  // class GPUDeviceStorage

-inline void* GPUDeviceStorage::Alloc(size_t size) {
+inline void* GPUDeviceStorage::Alloc(Storage::Handle* handle) {
+  const size_t size = handle->size;
   void* ret = nullptr;
 #if MXNET_USE_CUDA
+  mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
 #endif  // MXNET_USE_NCCL
@@ -69,8 +71,10 @@ inline void* GPUDeviceStorage::Alloc(size_t size) {
   return ret;
 }

-inline void GPUDeviceStorage::Free(void* ptr) {
+inline void GPUDeviceStorage::Free(Storage::Handle handle) {
 #if MXNET_USE_CUDA
+  void * ptr = handle.dptr;
+  mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
 #endif  // MXNET_USE_NCCL
diff --git a/src/storage/naive_storage_manager.h b/src/storage/naive_storage_manager.h
index b05b242a799e..55112b5a82e9 100644
--- a/src/storage/naive_storage_manager.h
+++ b/src/storage/naive_storage_manager.h
@@ -49,7 +49,7 @@ class NaiveStorageManager final : public StorageManager {
   void Free(Storage::Handle handle) override;

   void DirectFree(Storage::Handle handle) override {
-    DeviceStorage::Free(handle.dptr);
+    DeviceStorage::Free(handle);
   }

  private:
@@ -58,12 +58,12 @@ template <class DeviceStorage>
 void NaiveStorageManager<DeviceStorage>::Alloc(Storage::Handle* handle) {
-  handle->dptr = DeviceStorage::Alloc(handle->size);
+  handle->dptr = DeviceStorage::Alloc(handle);
 }

 template <class DeviceStorage>
 void NaiveStorageManager<DeviceStorage>::Free(Storage::Handle handle) {
-  DeviceStorage::Free(handle.dptr);
+  DeviceStorage::Free(handle);
 }

 }  // namespace storage
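Note: the storage backends now receive the whole Storage::Handle instead of a raw size or pointer, so each backend can read the target context itself and scope its own device switch instead of relying on the caller to have set the device. A simplified sketch of the resulting pattern (SketchGpuAlloc is an invented name; the real code is GPUDeviceStorage::Alloc above):

    // Derive both the size and the device from the handle, and guard the
    // CUDA calls with a DeviceStore scoped to the handle's context.
    void* SketchGpuAlloc(Storage::Handle* handle) {
      const size_t size = handle->size;
      mxnet::common::cuda::DeviceStore device_store(
          handle->ctx.real_dev_id(), true);
      void* ret = nullptr;
      CUDA_CALL(cudaMalloc(&ret, size));
      return ret;  // previous device restored when device_store goes away
    }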
diff --git a/src/storage/pinned_memory_storage.h b/src/storage/pinned_memory_storage.h
index e3fec2f4a06d..c4ababbdc03a 100644
--- a/src/storage/pinned_memory_storage.h
+++ b/src/storage/pinned_memory_storage.h
@@ -41,29 +41,33 @@ class PinnedMemoryStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);

   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);
 };

-inline void* PinnedMemoryStorage::Alloc(size_t size) {
+inline void* PinnedMemoryStorage::Alloc(Storage::Handle* handle) {
   void* ret = nullptr;
+  const size_t size = handle->size;
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
 #endif
+  mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
   // make the memory available across all devices
   CUDA_CALL(cudaHostAlloc(&ret, size, cudaHostAllocPortable));
   return ret;
 }

-inline void PinnedMemoryStorage::Free(void* ptr) {
+inline void PinnedMemoryStorage::Free(Storage::Handle handle) {
+  void * ptr = handle.dptr;
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
 #endif
+  mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
   cudaError_t err = cudaFreeHost(ptr);
   // ignore unloading error, as memory has already been recycled
   if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index cade8d9495f4..c407a9f00cb6 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -84,6 +84,7 @@ class GPUPooledStorageManager final : public StorageManager {

  private:
   void DirectFreeNoLock(Storage::Handle handle) {
+    mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
     cudaError_t err = cudaFree(handle.dptr);
     size_t size = RoundAllocSize(handle.size);
     // ignore unloading error, as memory has already been recycled
@@ -132,6 +133,7 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
   size_t size = RoundAllocSize(handle->size);
   auto&& reuse_it = memory_pool_.find(size);
   if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
+    mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
     size_t free, total;
     cudaMemGetInfo(&free, &total);
     if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
@@ -252,6 +254,7 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
   }

   void DirectFreeNoLock(Storage::Handle handle) {
+    mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
     cudaError_t err = cudaFree(handle.dptr);
     size_t size = get_size(get_bucket(handle.size));
     // ignore unloading error, as memory has already been recycled
@@ -288,6 +291,7 @@ void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
   size_t size = get_size(bucket);
   auto&& reuse_pool = memory_pool_[bucket];
   if (reuse_pool.size() == 0) {
+    mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
     size_t free, total;
     cudaMemGetInfo(&free, &total);
     if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
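Note: in the pooled managers the DeviceStore is constructed only on the slow paths (a pool miss before cudaMemGetInfo/cudaMalloc, and DirectFreeNoLock before cudaFree). A pool hit makes no CUDA call at all, so it no longer needs any device bookkeeping; that is the main saving this PR is after. A condensed sketch of the split (SketchPooledAlloc and its pool layout are illustrative, not the real member names):

    void SketchPooledAlloc(Storage::Handle* handle) {
      auto&& reuse_pool = memory_pool_[RoundAllocSize(handle->size)];
      if (!reuse_pool.empty()) {
        handle->dptr = reuse_pool.back();  // fast path: reuse a block,
        reuse_pool.pop_back();             // no CUDA calls, no SetDevice
      } else {
        // slow path only: switch to the handle's device for cudaMalloc
        mxnet::common::cuda::DeviceStore device_store(
            handle->ctx.real_dev_id(), true);
        CUDA_CALL(cudaMalloc(&handle->dptr, RoundAllocSize(handle->size)));
      }
    }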
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index c7100a456d80..911d30cc3f05 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -48,35 +48,6 @@ class StorageImpl : public Storage {
   static int num_gpu_device;
 #endif  // MXNET_USE_CUDA

-  static void ActivateDevice(Context ctx) {
-    switch (ctx.dev_type) {
-      case Context::kCPU:
-        break;
-      case Context::kCPUPinned:
-#if MXNET_USE_CUDA
-        if (num_gpu_device > 0) {
-          CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
-        }
-#endif  // MXNET_USE_CUDA
-        break;
-      case Context::kCPUShared: {
-#if defined(ANDROID) || defined(__ANDROID__)
-        LOG(FATAL) << "Unimplemented device";
-#endif  // defined(ANDROID) || defined(__ANDROID__)
-      }
-        break;
-      case Context::kGPU: {
-#if MXNET_USE_CUDA
-        if (num_gpu_device > 0) {
-          CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
-        }
-#endif  // MXNET_USE_CUDA
-        break;
-      }
-      default:
-        LOG(FATAL) << "Unimplemented device";
-    }
-  }
   // internal storage managers
   std::array<common::LazyAllocArray<storage::StorageManager>,
              kMaxNumberOfDevices> storage_managers_;
@@ -100,6 +71,8 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
       case Context::kCPUShared: {
 #if !defined(ANDROID) && !defined(__ANDROID__)
         ptr = new storage::CPUSharedStorageManager();
+#else
+        LOG(FATAL) << "Unimplemented device";
 #endif  // !defined(ANDROID) && !defined(__ANDROID__)
         break;
       }
@@ -149,13 +122,6 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
       return ptr;
     });

-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = handle->ctx.dev_type == Context::kCPUPinned ||
-                 handle->ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(handle->ctx);
   manager->Alloc(handle);
   profiler_.OnAlloc(*handle);
 }
@@ -169,12 +135,6 @@ void StorageImpl::Free(Storage::Handle handle) {
       return nullptr;
     });

-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(ctx);
   manager->Free(handle);
   profiler_.OnFree(handle);
 }
@@ -188,12 +148,6 @@ void StorageImpl::DirectFree(Storage::Handle handle) {
       return nullptr;
     });

-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(ctx);
   manager->DirectFree(handle);
   profiler_.OnFree(handle);
 }
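Note: with ActivateDevice removed, StorageImpl never touches CUDA device state itself; Alloc, Free and DirectFree simply delegate to the per-context storage manager, and only the managers' slow paths switch devices. An abridged before/after of the allocation path (comments only; error handling and profiling elided):

    // Before: every call bracketed by device bookkeeping at the top level.
    //   mxnet::common::cuda::DeviceStore device_store(restore);  // cudaGetDevice
    //   this->ActivateDevice(handle->ctx);                       // cudaSetDevice
    //   manager->Alloc(handle);   // possibly a pool hit with no CUDA work
    //
    // After: plain delegation; a device switch happens only inside the
    // manager, and only when it actually has to call into CUDA.
    //   manager->Alloc(handle);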