Use CUDA virtual memory management and merge blocks (#36189)
* Use CUDA virtual memory management and merge blocks, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* Windows DLL, test=develop

* fix cuda error of CUDA_ERROR_NOT_INITIALIZED, test=develop

* use autogrowthv2 for system allocator, test=develop

* remove ~CUDAVirtualMemAllocator(), test=develop

* refine, test=develop

* fix cuda error of CUDA_ERROR_NOT_INITIALIZED, test=develop

* fix cuda error of CUDA_ERROR_NOT_INITIALIZED, test=develop

* fix bug, test=develop

* revert system allocator, test=develop

* revert multiprocessing, test=develop

* fix AutoGrowthBestFitAllocatorV2 mutex, test=develop

* catch cudaErrorInitializationError when create allocator, test=develop

* fix cuMemSetAccess use, test=develop

* refine cuda api use, test=develop

* refine, test=develop

* for test, test=develop

* for test, test=develop

* switch to v2, test=develop

* refine virtual allocator, test=develop

* Record cuMemCreate and cuMemRelease, test=develop

* refine, test=develop

* avoid out of bounds, test=develop

* rename allocator, test=develop

* refine, test=develop

* use PADDLE_ENFORCE_CUDA_SUCCESS, test=develop

* for test, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop

* refine, test=develop
wanghuancoder authored Nov 8, 2021
1 parent 472dcca commit a1ec1d5
Showing 15 changed files with 785 additions and 9 deletions.
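For context: the CUDAVirtualMemAllocator introduced below is built on the CUDA virtual memory management driver API available since CUDA 10.2, which splits allocation into reserving a virtual address range, creating physical backings, mapping them into the range, and granting access. A minimal standalone sketch of that flow (plain driver API on device 0; CHECK_CU is a local helper, not Paddle's PADDLE_ENFORCE_CUDA_SUCCESS, and this is not Paddle code):

// vmm_demo.cu -- build with: nvcc -o vmm_demo vmm_demo.cu -lcuda
#include <cuda.h>

#include <cstdio>
#include <cstdlib>

#define CHECK_CU(expr)                                        \
  do {                                                        \
    CUresult r = (expr);                                      \
    if (r != CUDA_SUCCESS) {                                  \
      std::fprintf(stderr, "%s failed: %d\n", #expr, (int)r); \
      std::exit(1);                                           \
    }                                                         \
  } while (0)

int main() {
  CHECK_CU(cuInit(0));
  CUdevice dev;
  CHECK_CU(cuDeviceGet(&dev, 0));
  CUcontext ctx;
  CHECK_CU(cuCtxCreate(&ctx, 0, dev));

  // Physical allocation properties: pinned device memory on device 0.
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = 0;

  // All sizes passed to the VMM calls must be multiples of this granularity.
  size_t granularity = 0;
  CHECK_CU(cuMemGetAllocationGranularity(&granularity, &prop,
                                         CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  size_t size = granularity;  // one granule is enough for the demo

  // 1. Reserve a contiguous virtual address range; no memory exists yet.
  CUdeviceptr va;
  CHECK_CU(cuMemAddressReserve(&va, size, 0, 0, 0));

  // 2. Create a physical backing and map it into the reserved range.
  CUmemGenericAllocationHandle handle;
  CHECK_CU(cuMemCreate(&handle, size, &prop, 0));
  CHECK_CU(cuMemMap(va, size, 0, handle, 0));

  // 3. Grant read/write access to device 0; only now is va usable.
  CUmemAccessDesc desc = {};
  desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  desc.location.id = 0;
  desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  CHECK_CU(cuMemSetAccess(va, size, &desc, 1));

  CHECK_CU(cuMemsetD8(va, 0, size));  // touch the mapped range

  // Teardown: unmap, release the physical handle, free the address range.
  CHECK_CU(cuMemUnmap(va, size));
  CHECK_CU(cuMemRelease(handle));
  CHECK_CU(cuMemAddressFree(va, size));
  CHECK_CU(cuCtxDestroy(ctx));
  return 0;
}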
10 changes: 9 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
@@ -18,6 +18,9 @@ if (WITH_GPU)
nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator)
if(CUDA_VERSION GREATER_EQUAL 10.2)
nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda)
endif()
endif()

if (WITH_ROCM)
@@ -36,6 +39,9 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)

if (WITH_GPU OR WITH_ROCM)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
if(CUDA_VERSION GREATER_EQUAL 10.2)
list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator)
endif()
elseif(WITH_XPU)
set(AllocatorFacadeDeps xpu_info)
elseif(WITH_ASCEND)
@@ -72,7 +78,7 @@ else()
cpu_allocator)
endif()

list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator)
list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator virtual_memory_auto_growth_best_fit_allocator best_fit_allocator)

if (WITH_ASCEND_CL)
list(APPEND AllocatorFacadeDeps npu_pinned_allocator)
@@ -107,6 +113,8 @@ cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc
cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator)
cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator)

cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator)

if(NOT WIN32)
cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator)
cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator)
44 changes: 44 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -33,6 +33,11 @@
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#if CUDA_VERSION >= 10020
#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_graph.h"
#endif
@@ -51,6 +56,9 @@ PADDLE_DEFINE_EXPORTED_bool(
"Whether to use system allocator to allocate CPU and GPU memory. "
"Only used for unittests.");

PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false,
"Use VirtualMemoryAutoGrowthBestFitAllocator.");

DECLARE_string(allocator_strategy);

namespace paddle {
@@ -258,6 +266,40 @@ class AllocatorFacadePrivate {

void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p,
bool allow_free_idle_chunk) {
#if defined(PADDLE_WITH_HIP)
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
#endif

#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10020
CUdevice device;
int val;
try {
PADDLE_ENFORCE_CUDA_SUCCESS(
paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId()));

PADDLE_ENFORCE_CUDA_SUCCESS(
paddle::platform::dynload::cuDeviceGetAttribute(
&val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
device));
} catch (...) {
val = 0;
}

if (val > 0 && FLAGS_use_virtual_memory_auto_growth) {
auto cuda_allocator = std::make_shared<CUDAVirtualMemAllocator>(p);
allocators_[p] =
std::make_shared<VirtualMemoryAutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), p);
} else {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk);
}

#else
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
auto alignment = platform::GpuMinChunkSize();
bool need_addr_align = true;
@@ -292,6 +334,8 @@ class AllocatorFacadePrivate {
}
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
underlying_allocator, alignment, 0, allow_free_idle_chunk);
#endif
#endif
}
#endif

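The facade above only switches to the new allocator when FLAGS_use_virtual_memory_auto_growth is set and the device reports virtual address management support; any driver error during the probe falls back to the classic AutoGrowthBestFitAllocator. A standalone sketch of the same probe, using the raw driver API instead of Paddle's dynload wrappers (assumes cuInit(0) has already succeeded):

#include <cuda.h>

// Returns true only if the device exists and advertises VMM support
// (CUDA >= 10.2); treats every driver error as "unsupported".
bool DeviceSupportsVirtualAddressManagement(int device_id) {
  CUdevice dev;
  if (cuDeviceGet(&dev, device_id) != CUDA_SUCCESS) {
    return false;
  }
  int supported = 0;
  if (cuDeviceGetAttribute(
          &supported, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
          dev) != CUDA_SUCCESS) {
    return false;
  }
  return supported != 0;
}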
225 changes: 225 additions & 0 deletions paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc
@@ -0,0 +1,225 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif

#include <string>
#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h"
#include "paddle/fluid/platform/enforce.h"

#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#if CUDA_VERSION >= 10020

namespace paddle {
namespace memory {
namespace allocation {

CUDAVirtualMemAllocator::CUDAVirtualMemAllocator(
const platform::CUDAPlace& place)
: place_(place) {
CUmemAllocationProp prop = {};

// Set up the properties common to all chunks.
// The allocations will be pinned device memory.
// This property structure describes the physical location where the memory
// will be allocated via cuMemCreate, along with additional properties. In
// this case, the allocation will be pinned device memory local to a given
// device.
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = place.device;
prop_ = prop;

// Prepare the access descriptor array indicating where and how the backings
// should be visible.
access_desc_.resize(platform::GetCUDADeviceCount());
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
if (place.device != dev_id) {
int capable = 0;
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaDeviceCanAccessPeer(&capable, place.device, dev_id));
if (!capable) {
continue;
}
}
// Specify which device we are adding mappings for.
access_desc_[dev_id].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
access_desc_[dev_id].location.id = dev_id;

// Specify both read and write access.
access_desc_[dev_id].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
}

// Get the minimum granularity needed for all devices
// (the max of the minimum granularity of each participating device)
granularity_ = 0;
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) {
size_t granularity;
prop.location.id = dev_id;
PADDLE_ENFORCE_CUDA_SUCCESS(
paddle::platform::dynload::cuMemGetAllocationGranularity(
&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
granularity_ = std::max(granularity, granularity_);
}

size_t actual_avail, actual_total;
paddle::platform::CUDADeviceGuard guard(place.device);
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total));

virtual_mem_size_ = AlignedSize(actual_total, granularity_);

// Reserve the required contiguous virtual address space for the allocations.
// The most memory we can ever back physically is the total memory of the
// GPU, so the virtual address space we reserve is equal to the total GPU
// memory size.
PADDLE_ENFORCE_CUDA_SUCCESS(paddle::platform::dynload::cuMemAddressReserve(
&virtual_mem_base_, virtual_mem_size_, 0, 0, 0));

virtual_mem_alloced_offset_ = 0;
}

bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; }

void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_,
platform::errors::PermissionDenied(
"GPU memory was freed on the wrong device. This may be a bug."));

auto iter = virtual_2_physical_map_.find(
reinterpret_cast<CUdeviceptr>(allocation->ptr()));
if (iter == virtual_2_physical_map_.end()) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Can not find virtual memory address at %s", allocation->ptr()));
}

int prev_id;
cudaGetDevice(&prev_id);
if (prev_id != place_.device) {
cudaSetDevice(place_.device);
}

// Skip the enforce when the driver is already shutting down
// (CUDA_ERROR_DEINITIALIZED), which can happen at program exit.
auto result =
paddle::platform::dynload::cuMemUnmap(iter->first, iter->second.second);
if (result != CUDA_ERROR_DEINITIALIZED) {
PADDLE_ENFORCE_CUDA_SUCCESS(result);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::RecordedCuMemRelease(
iter->second.first, iter->second.second, place_.device));
}

if (prev_id != place_.device) {
cudaSetDevice(prev_id);
}

virtual_2_physical_map_.erase(iter);

delete allocation;
}

Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) {
size = AlignedSize(size, granularity_);

CUdeviceptr ptr = virtual_mem_base_ + virtual_mem_alloced_offset_;

if (ptr + size > virtual_mem_base_ + virtual_mem_size_) {
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on GPU Virtual Memory %d. "
"Cannot allocate %s memory on GPU Virtual Memory %d, %s memory has "
"been allocated and "
"available memory is only %s.\n\n"
"Please decrease the batch size of your model.\n\n",
place_.device, string::HumanReadableSize(size), place_.device,
string::HumanReadableSize(virtual_mem_alloced_offset_),
string::HumanReadableSize(virtual_mem_size_ -
virtual_mem_alloced_offset_)));
return nullptr;
}

CUmemGenericAllocationHandle handle;

paddle::platform::CUDADeviceGuard guard(place_.device);

// Create physical memory backing allocation.
auto result =
platform::RecordedCuMemCreate(&handle, size, &prop_, 0, place_.device);

if (result != CUDA_SUCCESS) {
if (result == CUDA_ERROR_OUT_OF_MEMORY) {
size_t actual_avail, actual_total;
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total));
size_t actual_allocated = actual_total - actual_avail;

PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on GPU %d. "
"Cannot allocate %s memory on GPU %d, %s memory has been allocated "
"and "
"available memory is only %s.\n\n"
"Please check whether there is any other process using GPU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another GPU.\n"
"2. If no, please decrease the batch size of your model.\n\n",
place_.device, string::HumanReadableSize(size), place_.device,
string::HumanReadableSize(actual_allocated),
string::HumanReadableSize(actual_avail), place_.device));
} else {
PADDLE_ENFORCE_CUDA_SUCCESS(result);
}
return nullptr;
}

// Map the chunk into the reserved VA range. After mapping, the memory can
// be referenced through its virtual address; the physical backing stays
// live until it is unmapped and its handle is released in FreeImpl.
result = paddle::platform::dynload::cuMemMap(ptr, size, 0, handle, 0);

if (result != CUDA_SUCCESS) {
platform::RecordedCuMemRelease(handle, size, place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(result);
return nullptr;
}

// Apply the access descriptors to the whole VA range.
result = paddle::platform::dynload::cuMemSetAccess(
ptr, size, access_desc_.data(), access_desc_.size());

if (result != CUDA_SUCCESS) {
paddle::platform::dynload::cuMemUnmap(ptr, size);
platform::RecordedCuMemRelease(handle, size, place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(result);
return nullptr;
}

virtual_2_physical_map_.emplace(ptr, std::make_pair(handle, size));

virtual_mem_alloced_offset_ += size;

return new Allocation(reinterpret_cast<void*>(ptr), size,
platform::Place(place_));
}

} // namespace allocation
} // namespace memory
} // namespace paddle

#endif
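Both the constructor and AllocateImpl round sizes up with AlignedSize, a Paddle helper that is not part of this diff. A minimal sketch of the assumed semantics, rounding size up to the next multiple of alignment:

// Assumed behaviour of the AlignedSize(size, alignment) helper used above;
// the real Paddle implementation lives outside this diff.
inline size_t AlignedSize(size_t size, size_t alignment) {
  size_t remainder = size % alignment;
  return remainder == 0 ? size : size + alignment - remainder;
}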
62 changes: 62 additions & 0 deletions paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h
@@ -0,0 +1,62 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif

#include <map>
#include <mutex>  // NOLINT
#include <utility>
#include <vector>

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"

#if CUDA_VERSION >= 10020

namespace paddle {
namespace memory {
namespace allocation {

// Allocate memory using NVIDIA's virtual memory management technology
class CUDAVirtualMemAllocator : public Allocator {
public:
explicit CUDAVirtualMemAllocator(const platform::CUDAPlace& place);

bool IsAllocThreadSafe() const override;

protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;

private:
platform::CUDAPlace place_;

CUdeviceptr virtual_mem_base_;
size_t virtual_mem_size_;
size_t virtual_mem_alloced_offset_;
size_t granularity_;

CUmemAllocationProp prop_;
std::vector<CUmemAccessDesc> access_desc_;

std::map<CUdeviceptr, std::pair<CUmemGenericAllocationHandle, size_t>>
virtual_2_physical_map_;
};

} // namespace allocation
} // namespace memory
} // namespace paddle

#endif
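The header above only covers the chunk allocator; the block-merging half of the commit title lives in VirtualMemoryAutoGrowthBestFitAllocator, whose diff is not shown in this excerpt. The property it exploits is that separately created physical chunks mapped at adjacent virtual addresses behave as one contiguous buffer, so neighbouring free blocks can be coalesced instead of fragmenting. A sketch extending the driver-API example after the file list (prop, desc, granularity, and CHECK_CU as defined there):

// Two independent physical chunks, one two-granule virtual range.
CUdeviceptr va2;
CHECK_CU(cuMemAddressReserve(&va2, 2 * granularity, 0, 0, 0));

CUmemGenericAllocationHandle h1, h2;
CHECK_CU(cuMemCreate(&h1, granularity, &prop, 0));
CHECK_CU(cuMemCreate(&h2, granularity, &prop, 0));

// Map them back to back into the reserved range.
CHECK_CU(cuMemMap(va2, granularity, 0, h1, 0));
CHECK_CU(cuMemMap(va2 + granularity, granularity, 0, h2, 0));
CHECK_CU(cuMemSetAccess(va2, 2 * granularity, &desc, 1));

// A single operation spanning both chunks works: the range is logically
// contiguous, which is what lets adjacent free blocks be merged.
CHECK_CU(cuMemsetD8(va2, 0, 2 * granularity));

// Teardown mirrors the single-chunk case.
CHECK_CU(cuMemUnmap(va2, 2 * granularity));
CHECK_CU(cuMemRelease(h1));
CHECK_CU(cuMemRelease(h2));
CHECK_CU(cuMemAddressFree(va2, 2 * granularity));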
(Diffs for the remaining 11 changed files are not shown.)
