Use cuda virtual memory management and merge blocks (#36189)

* Use cuda virtual memory management and merge blocks, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * window dll, test=develop * fix cuda error of CUDA_ERROR_NOT_INITIALIZED, test=develop * use autogrowthv2 for system allocator, test=develop * remove ~CUDAVirtualMemAllocator(), test=develop * refine, test=develop * fix cuda error of CUDA_ERROR_NOT_INITIALIZED, test=develop * fix cuda error of CUDA_ERROR_NOT_INITIALIZED, test=develop * fix bug, test=develop * revert system allocator, test =develop * revert multiprocessing, test=develop * fix AutoGrowthBestFitAllocatorV2 mutxt, test=develop * catch cudaErrorInitializationError when create allocator, test=develop * fix cuMemSetAccess use, test=develop * refine cuda api use, test=develop * refine, test=develop * for test, test=develop * for test, test=develop * switch to v2, test=develop * refine virtual allocator, test=develop * Record cuMemCreate and cuMemRelease, test=develop * refine, test=develop * avoid out of bounds, test=develop * rename allocator, test=develop * refine, test=develop * use PADDLE_ENFORCE_CUDA_SUCCESS, test=develop * for test,test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * refine, test=develop
PaddlePaddle · Nov 8, 2021 · a1ec1d5 · a1ec1d5
1 parent 472dcca
commit a1ec1d5
Show file tree

Hide file tree

Showing 15 changed files with 785 additions and 9 deletions.
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -18,6 +18,9 @@ if (WITH_GPU)
   nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
   nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
   cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
+  if(CUDA_VERSION GREATER_EQUAL 10.2)
+    nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda)
+  endif()
 endif()
 
 if (WITH_ROCM)
@@ -36,6 +39,9 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
 
 if (WITH_GPU OR WITH_ROCM)
     set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
+    if(CUDA_VERSION GREATER_EQUAL 10.2)
+      list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
+    endif()
 elseif(WITH_XPU)
     set(AllocatorFacadeDeps xpu_info)
 elseif(WITH_ASCEND)
@@ -72,7 +78,7 @@ else()
                 cpu_allocator)
 endif()
 
-list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator)
+list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator virtual_memory_auto_growth_best_fit_allocator best_fit_allocator)
 
 if (WITH_ASCEND_CL)
     list(APPEND AllocatorFacadeDeps npu_pinned_allocator)
@@ -107,6 +113,8 @@ cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc
 cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
 cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)
 
+cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)
+
 if(NOT WIN32)
   cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
   cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)

diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -33,6 +33,11 @@
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
+#if CUDA_VERSION >= 10020
+#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
+#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
+#include "paddle/fluid/platform/dynload/cuda_driver.h"
+#endif
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_graph.h"
 #endif
@@ -51,6 +56,9 @@ PADDLE_DEFINE_EXPORTED_bool(
     "Whether to use system allocator to allocate CPU and GPU memory. "
     "Only used for unittests.");
 
+PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
+                            "Use VirtualMemoryAutoGrowthBestFitAllocator.");
+
 DECLARE_string(allocator_strategy);
 
 namespace paddle {
@@ -258,6 +266,40 @@ class AllocatorFacadePrivate {
 
   void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
                                    bool allow_free_idle_chunk) {
+#if defined(PADDLE_WITH_HIP)
+    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
+        cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+#endif
+
+#if defined(PADDLE_WITH_CUDA)
+#if CUDA_VERSION >= 10020
+    CUdevice device;
+    int val;
+    try {
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));
+
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          paddle::platform::dynload::cuDeviceGetAttribute(
+              &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
+              device));
+    } catch (...) {
+      val = 0;
+    }
+
+    if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
+      auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
+      allocators_[p] =
+          std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
+              cuda_allocator, platform::GpuMinChunkSize(), p);
+    } else {
+      auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+      allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
+          cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
+    }
+
+#else
     auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
     auto alignment = platform::GpuMinChunkSize();
     bool need_addr_align = true;
@@ -292,6 +334,8 @@ class AllocatorFacadePrivate {
     }
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
         underlying_allocator, alignment, 0, allow_free_idle_chunk);
+#endif
+#endif
   }
 #endif
 

diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
@@ -0,0 +1,225 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#include <cuda_runtime.h>
+#endif
+
+#include <string>
+#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/dynload/cuda_driver.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#endif
+#if CUDA_VERSION >= 10020
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// Builds a virtual-memory-backed allocator for the GPU identified by `place`.
+//
+// The constructor
+//   1. fills in the allocation properties shared by every physical chunk,
+//   2. collects one access descriptor for the owning device and for every
+//      other device that cudaDeviceCanAccessPeer reports as capable,
+//   3. computes the largest minimum allocation granularity over all devices
+//      returned by platform::GetCUDADeviceCount(), and
+//   4. reserves one contiguous virtual address range whose size is the total
+//      memory of the owning device rounded up to that granularity.
+//
+// Every CUDA call made here is checked with PADDLE_ENFORCE_CUDA_SUCCESS.
+CUDAVirtualMemAllocator::CUDAVirtualMemAllocator(
+    const platform::CUDAPlace& place)
+    : place_(place) {
+  CUmemAllocationProp prop = {};
+
+  // Set up the properties common to all chunks. This structure describes the
+  // physical location where memory will be allocated via cuMemCreate, along
+  // with additional properties. In this case every allocation is pinned
+  // device memory local to the given device.
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  prop.location.id = place.device;
+  prop_ = prop;
+
+  // The device count is loop-invariant, so query it once.
+  const int device_count = platform::GetCUDADeviceCount();
+
+  // Prepare the access descriptor array indicating where and how the backings
+  // should be visible. Descriptors are appended only for devices that are
+  // allowed to access this memory: pre-sizing the array and skipping the
+  // non-capable devices would leave zero-initialized descriptors behind, and
+  // the whole array is later handed to cuMemSetAccess.
+  access_desc_.clear();
+  if (device_count > 0) {
+    access_desc_.reserve(static_cast<size_t>(device_count));
+  }
+  for (int dev_id = 0; dev_id < device_count; ++dev_id) {
+    if (place.device != dev_id) {
+      int capable = 0;
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaDeviceCanAccessPeer(&capable, place.device, dev_id));
+      if (!capable) {
+        continue;
+      }
+    }
+
+    CUmemAccessDesc desc = {};
+    // Specify which device we are adding mappings for.
+    desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    desc.location.id = dev_id;
+    // Specify both read and write access.
+    desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    access_desc_.push_back(desc);
+  }
+
+  // Get the minimum granularity needed for all devices
+  // (the max of the minimum granularity of each participating device).
+  // Only the local copy `prop` is modified here; prop_ keeps the owning
+  // device as its location.
+  granularity_ = 0;
+  for (int dev_id = 0; dev_id < device_count; ++dev_id) {
+    size_t granularity;
+    prop.location.id = dev_id;
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        paddle::platform::dynload::cuMemGetAllocationGranularity(
+            &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+    granularity_ = std::max(granularity, granularity_);
+  }
+
+  size_t actual_avail, actual_total;
+  paddle::platform::CUDADeviceGuard guard(place.device);
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total));
+
+  virtual_mem_size_ = AlignedSize(actual_total, granularity_);
+
+  // Reserve the required contiguous virtual address space for the
+  // allocations. The most device memory we can ever request is the total
+  // memory of the GPU, so the reserved virtual address range has that size.
+  PADDLE_ENFORCE_CUDA_SUCCESS(paddle::platform::dynload::cuMemAddressReserve(
+      &virtual_mem_base_, virtual_mem_size_, 0, 0, 0));
+
+  virtual_mem_alloced_offset_ = 0;
+}
+
+// Reports whether allocation through this allocator is thread-safe.
+// This implementation always answers false.
+bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const {
+  return false;
+}
+
+// Releases one allocation previously recorded in virtual_2_physical_map_.
+//
+// The virtual range is unmapped with cuMemUnmap, the backing physical handle
+// is released through platform::RecordedCuMemRelease, the bookkeeping entry
+// is erased and the Allocation object is deleted. When cuMemUnmap reports
+// CUDA_ERROR_DEINITIALIZED, both the error check and the release are skipped.
+//
+// Nothing in this function lowers virtual_mem_alloced_offset_, so the freed
+// virtual range is not handed out again by AllocateImpl.
+//
+// Raises (via PADDLE_ENFORCE_EQ / PADDLE_THROW) when the allocation belongs to
+// a different place or its address is not a known mapping.
+void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) {
+  PADDLE_ENFORCE_EQ(
+      BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_,
+      platform::errors::PermissionDenied(
+          "GPU memory is freed in incorrect device. This may be a bug"));
+
+  auto iter = virtual_2_physical_map_.find(
+      reinterpret_cast<CUdeviceptr>(allocation->ptr()));
+  if (iter == virtual_2_physical_map_.end()) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Can not find virtual memory address at %s", allocation->ptr()));
+  }
+
+  // Temporarily make place_.device current. The device query stays
+  // best-effort (its failure is not raised), but prev_id is only trusted and
+  // restored when the query succeeded, so an unset value is never read.
+  int prev_id = -1;
+  const bool prev_known = (cudaGetDevice(&prev_id) == cudaSuccess);
+  const bool restore_prev = prev_known && prev_id != place_.device;
+  if (!prev_known || prev_id != place_.device) {
+    cudaSetDevice(place_.device);
+  }
+
+  auto result =
+      paddle::platform::dynload::cuMemUnmap(iter->first, iter->second.second);
+  // CUDA_ERROR_DEINITIALIZED is tolerated: in that case neither the error
+  // check nor the physical release is attempted.
+  if (result != CUDA_ERROR_DEINITIALIZED) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(result);
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::RecordedCuMemRelease(
+        iter->second.first, iter->second.second, place_.device));
+  }
+
+  if (restore_prev) {
+    cudaSetDevice(prev_id);
+  }
+
+  virtual_2_physical_map_.erase(iter);
+
+  delete allocation;
+}
+
+// Allocates `size` bytes (rounded up to granularity_) of device memory.
+//
+// The next unused slice of the reserved virtual range is backed by a freshly
+// created physical chunk: platform::RecordedCuMemCreate creates the chunk,
+// cuMemMap binds it to the virtual address and cuMemSetAccess applies
+// access_desc_. On success the mapping is recorded in
+// virtual_2_physical_map_ and virtual_mem_alloced_offset_ advances by the
+// aligned size.
+//
+// Returns a heap-allocated Allocation describing the mapped range.
+// Raises a ResourceExhausted error through PADDLE_THROW_BAD_ALLOC when the
+// reserved virtual range or the physical memory is exhausted; other CUDA
+// failures go through PADDLE_ENFORCE_CUDA_SUCCESS after undoing the steps
+// that already succeeded.
+Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) {
+  size = AlignedSize(size, granularity_);
+
+  CUdeviceptr ptr = virtual_mem_base_ + virtual_mem_alloced_offset_;
+
+  if (ptr + size > virtual_mem_base_ + virtual_mem_size_) {
+    // The argument list matches the five conversion specifiers of the
+    // message exactly.
+    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
+        "\n\nOut of memory error on GPU Virtual Memory %d. "
+        "Cannot allocate %s memory on GPU Virtual Memory %d, %s memory has "
+        "been allocated and "
+        "available memory is only %s.\n\n"
+        "Please decrease the batch size of your model.\n\n",
+        place_.device, string::HumanReadableSize(size), place_.device,
+        string::HumanReadableSize(virtual_mem_alloced_offset_),
+        string::HumanReadableSize(virtual_mem_size_ -
+                                  virtual_mem_alloced_offset_)));
+    return nullptr;
+  }
+
+  CUmemGenericAllocationHandle handle;
+
+  paddle::platform::CUDADeviceGuard guard(place_.device);
+
+  // Create physical memory backing allocation.
+  auto result =
+      platform::RecordedCuMemCreate(&handle, size, &prop_, 0, place_.device);
+
+  if (result != CUDA_SUCCESS) {
+    if (result == CUDA_ERROR_OUT_OF_MEMORY) {
+      size_t actual_avail, actual_total;
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total));
+      size_t actual_allocated = actual_total - actual_avail;
+
+      PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
+          "\n\nOut of memory error on GPU %d. "
+          "Cannot allocate %s memory on GPU %d, %s memory has been allocated "
+          "and "
+          "available memory is only %s.\n\n"
+          "Please check whether there is any other process using GPU %d.\n"
+          "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
+          "2. If no, please decrease the batch size of your model.\n\n",
+          place_.device, string::HumanReadableSize(size), place_.device,
+          string::HumanReadableSize(actual_allocated),
+          string::HumanReadableSize(actual_avail), place_.device));
+    } else {
+      PADDLE_ENFORCE_CUDA_SUCCESS(result);
+    }
+    return nullptr;
+  }
+
+  // Assign the chunk to the appropriate VA range and release the handle.
+  // After mapping the memory, it can be referenced by virtual address.
+  // The allocation will be kept live until it is unmapped.
+  result = paddle::platform::dynload::cuMemMap(ptr, size, 0, handle, 0);
+
+  if (result != CUDA_SUCCESS) {
+    // Best-effort rollback of the physical chunk before reporting the error.
+    platform::RecordedCuMemRelease(handle, size, place_.device);
+    PADDLE_ENFORCE_CUDA_SUCCESS(result);
+    return nullptr;
+  }
+
+  // Apply the access descriptors to the whole VA range.
+  result = paddle::platform::dynload::cuMemSetAccess(
+      ptr, size, access_desc_.data(), access_desc_.size());
+
+  if (result != CUDA_SUCCESS) {
+    // Best-effort rollback of both the mapping and the physical chunk.
+    paddle::platform::dynload::cuMemUnmap(ptr, size);
+    platform::RecordedCuMemRelease(handle, size, place_.device);
+    PADDLE_ENFORCE_CUDA_SUCCESS(result);
+    return nullptr;
+  }
+
+  virtual_2_physical_map_.emplace(ptr, std::make_pair(handle, size));
+
+  virtual_mem_alloced_offset_ += size;
+
+  return new Allocation(reinterpret_cast<void*>(ptr), size,
+                        platform::Place(place_));
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+#include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+
+#include <mutex>  // NOLINT
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/place.h"
+
+#if CUDA_VERSION >= 10020
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+// Allocator built on the CUDA driver's virtual memory management calls.
+//
+// One contiguous virtual address range is reserved at construction. Each
+// allocation creates a physical chunk, maps it at the next free offset of
+// that range and records the mapping so it can be unmapped and released when
+// the allocation is freed.
+class CUDAVirtualMemAllocator : public Allocator {
+ public:
+  explicit CUDAVirtualMemAllocator(const platform::CUDAPlace& place);
+
+  bool IsAllocThreadSafe() const override;
+
+ protected:
+  void FreeImpl(Allocation* allocation) override;
+  Allocation* AllocateImpl(size_t size) override;
+
+ private:
+  // Physical handle of a mapped chunk together with its size in bytes.
+  using PhysicalBlock = std::pair<CUmemGenericAllocationHandle, size_t>;
+
+  // Device this allocator serves.
+  platform::CUDAPlace place_;
+
+  // Start address of the reserved virtual range.
+  CUdeviceptr virtual_mem_base_;
+  // Size in bytes of the reserved virtual range.
+  size_t virtual_mem_size_;
+  // Offset from virtual_mem_base_ at which the next allocation is mapped.
+  size_t virtual_mem_alloced_offset_;
+  // Alignment applied to every requested size.
+  size_t granularity_;
+
+  // Properties passed when creating each physical chunk.
+  CUmemAllocationProp prop_;
+  // Access descriptors applied to each mapped range.
+  std::vector<CUmemAccessDesc> access_desc_;
+
+  // Mapped virtual address -> physical chunk backing it.
+  std::map<CUdeviceptr, PhysicalBlock> virtual_2_physical_map_;
+};
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
+
+#endif