From d9e3d9f8479d1b88ab7b49d659d817484639d2ff Mon Sep 17 00:00:00 2001
From: Przemyslaw Tredak
Date: Fri, 4 Jan 2019 17:55:23 -0800
Subject: [PATCH] Less cudaGet/SetDevice calls in Gluon execution (#13764)

* Remove unnecessary cudaGetDevice/cudaSetDevice calls

* Fixes for the DeviceGuard

* Retrigger CI

* Fix for possible invalid device ordinal when using DeviceStore while driver is unloading

* Fix for RTC when the driver API call is the first call

* Added DeviceStore to pooled engine
---
 src/common/cuda_utils.h              | 19 +++++++++--
 src/engine/stream_manager.h          | 10 ++----
 src/engine/threaded_engine_pooled.cc |  8 ++++-
 src/kvstore/comm.h                   |  5 ++-
 src/kvstore/comm_tree.h              |  3 +-
 src/storage/cpu_device_storage.h     | 10 +++---
 src/storage/gpu_device_storage.h     | 12 ++++---
 src/storage/naive_storage_manager.h  |  6 ++--
 src/storage/pinned_memory_storage.h  | 12 ++++---
 src/storage/pooled_storage_manager.h |  4 +++
 src/storage/storage.cc               | 50 ++--------------------------
 11 files changed, 59 insertions(+), 80 deletions(-)

diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
index 047edde88a53..0dd9d2db3722 100644
--- a/src/common/cuda_utils.h
+++ b/src/common/cuda_utils.h
@@ -286,22 +286,35 @@ inline DType __device__ CudaMin(DType a, DType b) {
 class DeviceStore {
  public:
   /*! \brief default constructor - only optionally restores previous device */
-  explicit DeviceStore(bool restore = true) : restore_(restore) {
+  explicit DeviceStore(int requested_device = -1, bool restore = true) :
+      restore_device_(-1),
+      current_device_(requested_device),
+      restore_(restore) {
     if (restore_)
       CUDA_CALL(cudaGetDevice(&restore_device_));
+    if (requested_device != restore_device_) {
+      SetDevice(requested_device);
+    }
   }

   ~DeviceStore() {
-    if (restore_)
+    if (restore_ &&
+        current_device_ != restore_device_ &&
+        current_device_ != -1 &&
+        restore_device_ != -1)
       CUDA_CALL(cudaSetDevice(restore_device_));
   }

   void SetDevice(int device) {
-    CUDA_CALL(cudaSetDevice(device));
+    if (device != -1) {
+      CUDA_CALL(cudaSetDevice(device));
+      current_device_ = device;
+    }
   }

  private:
   int restore_device_;
+  int current_device_;
   bool restore_;
 };
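Note: the reworked DeviceStore above is a scope guard around cudaGetDevice/cudaSetDevice. A minimal usage sketch follows (hypothetical caller; EnqueueOnDevice is an invented name and not part of the patch):

    #include "../common/cuda_utils.h"

    void EnqueueOnDevice(int dev_id) {
      // Remembers the current device and switches to dev_id; the
      // constructor skips the cudaSetDevice call entirely when dev_id
      // is already the current device, or is -1.
      mxnet::common::cuda::DeviceStore device_store(dev_id);
      // ... launch kernels or allocate memory on dev_id here ...
    }  // the destructor restores the previous device, and only issues a
       // cudaSetDevice when both ordinals are valid and actually differ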
diff --git a/src/engine/stream_manager.h b/src/engine/stream_manager.h
index d4ac042ff401..516e04bf5e82 100644
--- a/src/engine/stream_manager.h
+++ b/src/engine/stream_manager.h
@@ -65,9 +65,6 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
     Context const& ctx) {
   RunContext ret;
-#if MXNET_USE_CUDA
-  mxnet::common::cuda::DeviceStore device_store;
-#endif
   switch (ctx.dev_mask()) {
     case cpu::kDevMask:
       ret = RunContext{ctx, nullptr};
@@ -75,11 +72,11 @@ RunContext StreamManager<kNumGpus, kStreams>::GetRunContext(
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
       std::size_t use_counter;
-      device_store.SetDevice(ctx.dev_id);
       {
         std::lock_guard<std::mutex> lock{mutex_};
         auto&& counter = gpu_cnt_.at(ctx.dev_id);
         if (counter == -1) {
+          mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
           for (auto&& i : gpu_streams_.at(ctx.dev_id)) {
             i = mshadow::NewStream<gpu>(true, MXNET_USE_CUDNN != 0, ctx.dev_id);
           }
@@ -104,19 +101,16 @@ template <std::size_t kNumGpus, std::size_t kStreams>
 RunContext StreamManager<kNumGpus, kStreams>::GetIORunContext(
     Context const& ctx) {
   RunContext ret;
-#if MXNET_USE_CUDA
-  mxnet::common::cuda::DeviceStore device_store;
-#endif
   switch (ctx.dev_mask()) {
     case cpu::kDevMask:
       ret = RunContext{ctx, nullptr};
       break;
     case gpu::kDevMask: {
 #if MXNET_USE_CUDA
-      device_store.SetDevice(ctx.dev_id);
       {
         std::lock_guard<std::mutex> lock{mutex_};
         if (gpu_io_streams_.at(ctx.dev_id) == nullptr) {
+          mxnet::common::cuda::DeviceStore device_store(ctx.dev_id);
           gpu_io_streams_.at(ctx.dev_id) =
             mshadow::NewStream<gpu>(false, false, ctx.dev_id);
         }
       }
diff --git a/src/engine/threaded_engine_pooled.cc b/src/engine/threaded_engine_pooled.cc
index 1abb82fd6a67..c6eb99508e09 100644
--- a/src/engine/threaded_engine_pooled.cc
+++ b/src/engine/threaded_engine_pooled.cc
@@ -31,6 +31,9 @@
 #include "./threaded_engine.h"
 #include "./thread_pool.h"
 #include "./stream_manager.h"
+#if MXNET_USE_CUDA
+#include "../common/cuda_utils.h"
+#endif

 namespace mxnet {
 namespace engine {
@@ -130,10 +133,13 @@ class ThreadedEnginePooled : public ThreadedEngine {
    * \param opr_block The operator block.
    */
   void DoExecute(OprBlock* opr_block) {
+#if MXNET_USE_CUDA
+    mxnet::common::cuda::DeviceStore device_store(-1, false);
+#endif
     assert(opr_block->wait.load() == 0);
     if (opr_block->ctx.dev_mask() == gpu::kDevMask) {
 #if MXNET_USE_CUDA
-      CUDA_CALL(cudaSetDevice(opr_block->ctx.dev_id));
+      device_store.SetDevice(opr_block->ctx.dev_id);
 #else   // MXNET_USE_CUDA
       LOG(FATAL) << "Please compile with CUDA enabled";
 #endif  // MXNET_USE_CUDA
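Note: DoExecute above constructs the guard as DeviceStore(-1, false): no cudaGetDevice up front and no restore on destruction, so a worker thread that keeps executing operators on the same GPU pays for at most one device switch. A sketch contrasting the two modes used in this patch (illustrative only, not part of the patch):

    // Worker-thread mode, as in DoExecute: construct with (-1, false),
    // switch lazily, and leave the thread on the last device it used.
    mxnet::common::cuda::DeviceStore worker_store(-1, false);
    worker_store.SetDevice(0);  // the only cudaSetDevice call made here

    {
      // Scoped mode, as in GetRunContext: switch to GPU 1 for the
      // duration of the scope, then restore the caller's device.
      mxnet::common::cuda::DeviceStore scoped_store(1);
    }  // destructor restores the previous device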
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index 7090aaf46d8f..08f6155cb5b4 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -724,10 +724,9 @@ class CommDevice : public Comm {
     int enabled = 0;
     std::vector<int> p2p(n*n);

-    // Restores active device to what it was before EnableP2P
-    mxnet::common::cuda::DeviceStore device_store;
     for (int i = 0; i < n; ++i) {
-      device_store.SetDevice(gpus[i]);
+      // Restores active device to what it was before EnableP2P
+      mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
         int access;
         cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index e3b2ad7f57d3..b62228cd2885 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -339,9 +339,8 @@ class CommDeviceTree : public CommDevice {
     int n = static_cast<int>(gpus.size());
     int enabled = 0;
     std::vector<int> p2p(n*n);
-    mxnet::common::cuda::DeviceStore device_store;
     for (int i = 0; i < n; ++i) {
-      device_store.SetDevice(gpus[i]);
+      mxnet::common::cuda::DeviceStore device_store(gpus[i]);
       for (int j = 0; j < n; j++) {
         int access;
         cudaDeviceCanAccessPeer(&access, gpus[i], gpus[j]);
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index 43e98fe04a1f..25ad61efb232 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -43,12 +43,12 @@ class CPUDeviceStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);
   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);

  private:
   /*!
@@ -63,7 +63,8 @@ class CPUDeviceStorage {
 #endif
 };  // class CPUDeviceStorage

-inline void* CPUDeviceStorage::Alloc(size_t size) {
+inline void* CPUDeviceStorage::Alloc(Storage::Handle* handle) {
+  const size_t size = handle->size;
   void* ptr;
 #if _MSC_VER
   ptr = _aligned_malloc(size, alignment_);
@@ -75,7 +76,8 @@ inline void* CPUDeviceStorage::Alloc(size_t size) {
   return ptr;
 }

-inline void CPUDeviceStorage::Free(void* ptr) {
+inline void CPUDeviceStorage::Free(Storage::Handle handle) {
+  void * ptr = handle.dptr;
 #if _MSC_VER
   _aligned_free(ptr);
 #else
diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h
index 435c7e81d2ae..562badb8752e 100644
--- a/src/storage/gpu_device_storage.h
+++ b/src/storage/gpu_device_storage.h
@@ -46,17 +46,19 @@ class GPUDeviceStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);
   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);
 };  // class GPUDeviceStorage

-inline void* GPUDeviceStorage::Alloc(size_t size) {
+inline void* GPUDeviceStorage::Alloc(Storage::Handle* handle) {
+  const size_t size = handle->size;
   void* ret = nullptr;
 #if MXNET_USE_CUDA
+  mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
 #endif  // MXNET_USE_NCCL
@@ -69,8 +71,10 @@ inline void* GPUDeviceStorage::Alloc(size_t size) {
   return ret;
 }

-inline void GPUDeviceStorage::Free(void* ptr) {
+inline void GPUDeviceStorage::Free(Storage::Handle handle) {
 #if MXNET_USE_CUDA
+  void * ptr = handle.dptr;
+  mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
 #endif  // MXNET_USE_NCCL
diff --git a/src/storage/naive_storage_manager.h b/src/storage/naive_storage_manager.h
index b05b242a799e..55112b5a82e9 100644
--- a/src/storage/naive_storage_manager.h
+++ b/src/storage/naive_storage_manager.h
@@ -49,7 +49,7 @@ class NaiveStorageManager final : public StorageManager {
   void Free(Storage::Handle handle) override;

   void DirectFree(Storage::Handle handle) override {
-    DeviceStorage::Free(handle.dptr);
+    DeviceStorage::Free(handle);
   }

  private:
@@ -58,12 +58,12 @@ template <class DeviceStorage>
 void NaiveStorageManager<DeviceStorage>::Alloc(Storage::Handle* handle) {
-  handle->dptr = DeviceStorage::Alloc(handle->size);
+  handle->dptr = DeviceStorage::Alloc(handle);
 }

 template <class DeviceStorage>
 void NaiveStorageManager<DeviceStorage>::Free(Storage::Handle handle) {
-  DeviceStorage::Free(handle.dptr);
+  DeviceStorage::Free(handle);
 }

 }  // namespace storage
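Note: the storage backends now receive the whole Storage::Handle instead of a raw size or pointer, so each backend can read the target context itself and scope its own device switch instead of relying on the caller to have set the device. A simplified sketch of the resulting pattern (SketchGpuAlloc is an invented name; the real code is GPUDeviceStorage::Alloc above):

    // Derive both the size and the device from the handle, and guard the
    // CUDA calls with a DeviceStore scoped to the handle's context.
    void* SketchGpuAlloc(Storage::Handle* handle) {
      const size_t size = handle->size;
      mxnet::common::cuda::DeviceStore device_store(
          handle->ctx.real_dev_id(), true);
      void* ret = nullptr;
      CUDA_CALL(cudaMalloc(&ret, size));
      return ret;  // previous device restored when device_store goes away
    }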
diff --git a/src/storage/pinned_memory_storage.h b/src/storage/pinned_memory_storage.h
index e3fec2f4a06d..c4ababbdc03a 100644
--- a/src/storage/pinned_memory_storage.h
+++ b/src/storage/pinned_memory_storage.h
@@ -41,29 +41,33 @@ class PinnedMemoryStorage {
    * \param size Size to allocate.
    * \return Pointer to the storage.
    */
-  inline static void* Alloc(size_t size);
+  inline static void* Alloc(Storage::Handle* handle);

   /*!
    * \brief Deallocation.
    * \param ptr Pointer to deallocate.
    */
-  inline static void Free(void* ptr);
+  inline static void Free(Storage::Handle handle);
 };

-inline void* PinnedMemoryStorage::Alloc(size_t size) {
+inline void* PinnedMemoryStorage::Alloc(Storage::Handle* handle) {
   void* ret = nullptr;
+  const size_t size = handle->size;
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
 #endif
+  mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
   // make the memory available across all devices
   CUDA_CALL(cudaHostAlloc(&ret, size, cudaHostAllocPortable));
   return ret;
 }

-inline void PinnedMemoryStorage::Free(void* ptr) {
+inline void PinnedMemoryStorage::Free(Storage::Handle handle) {
+  void * ptr = handle.dptr;
 #if MXNET_USE_NCCL
   std::lock_guard<std::mutex> lock(Storage::Get()->GetMutex(Context::kGPU));
 #endif
+  mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
   cudaError_t err = cudaFreeHost(ptr);
   // ignore unloading error, as memory has already been recycled
   if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index cade8d9495f4..c407a9f00cb6 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -84,6 +84,7 @@ class GPUPooledStorageManager final : public StorageManager {

  private:
   void DirectFreeNoLock(Storage::Handle handle) {
+    mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
     cudaError_t err = cudaFree(handle.dptr);
     size_t size = RoundAllocSize(handle.size);
     // ignore unloading error, as memory has already been recycled
@@ -132,6 +133,7 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
   size_t size = RoundAllocSize(handle->size);
   auto&& reuse_it = memory_pool_.find(size);
   if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
+    mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
     size_t free, total;
     cudaMemGetInfo(&free, &total);
     if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
@@ -252,6 +254,7 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
   }

   void DirectFreeNoLock(Storage::Handle handle) {
+    mxnet::common::cuda::DeviceStore device_store(handle.ctx.real_dev_id(), true);
     cudaError_t err = cudaFree(handle.dptr);
     size_t size = get_size(get_bucket(handle.size));
     // ignore unloading error, as memory has already been recycled
@@ -288,6 +291,7 @@ void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
   size_t size = get_size(bucket);
   auto&& reuse_pool = memory_pool_[bucket];
   if (reuse_pool.size() == 0) {
+    mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
     size_t free, total;
     cudaMemGetInfo(&free, &total);
     if (free <= total * reserve_ / 100 || size > free - total * reserve_ / 100)
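Note: in the pooled managers the DeviceStore is constructed only on the slow paths (a pool miss before cudaMemGetInfo/cudaMalloc, and DirectFreeNoLock before cudaFree). A pool hit makes no CUDA call at all, so it no longer needs any device bookkeeping; that is the main saving this PR is after. A condensed sketch of the split (SketchPooledAlloc and its pool layout are illustrative, not the real member names):

    void SketchPooledAlloc(Storage::Handle* handle) {
      auto&& reuse_pool = memory_pool_[RoundAllocSize(handle->size)];
      if (!reuse_pool.empty()) {
        handle->dptr = reuse_pool.back();  // fast path: reuse a block,
        reuse_pool.pop_back();             // no CUDA calls, no SetDevice
      } else {
        // slow path only: switch to the handle's device for cudaMalloc
        mxnet::common::cuda::DeviceStore device_store(
            handle->ctx.real_dev_id(), true);
        CUDA_CALL(cudaMalloc(&handle->dptr, RoundAllocSize(handle->size)));
      }
    }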
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index c7100a456d80..911d30cc3f05 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -48,35 +48,6 @@ class StorageImpl : public Storage {
   static int num_gpu_device;
 #endif  // MXNET_USE_CUDA

-  static void ActivateDevice(Context ctx) {
-    switch (ctx.dev_type) {
-      case Context::kCPU:
-        break;
-      case Context::kCPUPinned:
-#if MXNET_USE_CUDA
-        if (num_gpu_device > 0) {
-          CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
-        }
-#endif  // MXNET_USE_CUDA
-        break;
-      case Context::kCPUShared: {
-#if defined(ANDROID) || defined(__ANDROID__)
-        LOG(FATAL) << "Unimplemented device";
-#endif  // defined(ANDROID) || defined(__ANDROID__)
-      }
-        break;
-      case Context::kGPU: {
-#if MXNET_USE_CUDA
-        if (num_gpu_device > 0) {
-          CUDA_CALL(cudaSetDevice(ctx.real_dev_id()));
-        }
-#endif  // MXNET_USE_CUDA
-        break;
-      }
-      default:
-        LOG(FATAL) << "Unimplemented device";
-    }
-  }
   // internal storage managers
   std::array<common::LazyAllocArray<storage::StorageManager>,
              kMaxNumberOfDevices> storage_managers_;
@@ -100,6 +71,8 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
       case Context::kCPUShared: {
 #if !defined(ANDROID) && !defined(__ANDROID__)
         ptr = new storage::CPUSharedStorageManager();
+#else
+        LOG(FATAL) << "Unimplemented device";
 #endif  // !defined(ANDROID) && !defined(__ANDROID__)
         break;
       }
@@ -149,13 +122,6 @@ void StorageImpl::Alloc(Storage::Handle* handle) {
       return ptr;
     });

-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = handle->ctx.dev_type == Context::kCPUPinned ||
-                 handle->ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(handle->ctx);
   manager->Alloc(handle);
   profiler_.OnAlloc(*handle);
 }
@@ -169,12 +135,6 @@ void StorageImpl::Free(Storage::Handle handle) {
       return nullptr;
     });

-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(ctx);
   manager->Free(handle);
   profiler_.OnFree(handle);
 }
@@ -188,12 +148,6 @@ void StorageImpl::DirectFree(Storage::Handle handle) {
       return nullptr;
     });

-#if MXNET_USE_CUDA
-  // Will restore gpu device to before ActivateDevice if necessary
-  bool restore = ctx.dev_type == Context::kCPUPinned || ctx.dev_type == Context::kGPU;
-  mxnet::common::cuda::DeviceStore device_store(restore);
-#endif
-  this->ActivateDevice(ctx);
   manager->DirectFree(handle);
   profiler_.OnFree(handle);
 }
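Note: with ActivateDevice removed, StorageImpl never touches CUDA device state itself; Alloc, Free and DirectFree simply delegate to the per-context storage manager, and only the managers' slow paths switch devices. An abridged before/after of the allocation path (comments only; error handling and profiling elided):

    // Before: every call bracketed by device bookkeeping at the top level.
    //   mxnet::common::cuda::DeviceStore device_store(restore);  // cudaGetDevice
    //   this->ActivateDevice(handle->ctx);                       // cudaSetDevice
    //   manager->Alloc(handle);   // possibly a pool hit with no CUDA work
    //
    // After: plain delegation; a device switch happens only inside the
    // manager, and only when it actually has to call into CUDA.
    //   manager->Alloc(handle);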