Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new APIs for GPU memory monitoring (max_memory_allocated, max_memory_reserved, memory_allocated, memory_reserved) #38657

Merged
merged 16 commits into from
Mar 30, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cc_library(allocator SRCS allocator.cc DEPS place)
cc_library(allocator SRCS allocator.cc DEPS place monitor)
cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
Expand Down
10 changes: 10 additions & 0 deletions paddle/fluid/memory/allocation/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,16 @@

#include "paddle/fluid/framework/inlined_vector.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/core/allocator.h"

DECLARE_string(allocator_strategy);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
USE_GPU_ALLOC_STAT;
#endif

namespace paddle {
namespace memory {
namespace allocation {
Expand Down Expand Up @@ -142,6 +147,11 @@ using DecoratedAllocationPtr =
class Allocator : public pten::Allocator {
public:
static void AllocationDeleter(pten::Allocation* allocation) {
if (platform::is_gpu_place(allocation->place())) {
int dev_id = allocation->place().GetDeviceId();
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id) + "_alloc_size",
allocation->size());
}
Allocator* allocator =
static_cast<Allocation*>(allocation)->TopDecoratedAllocator();
allocator->Free(allocation);
Expand Down
38 changes: 29 additions & 9 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Expand Down Expand Up @@ -832,17 +833,27 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
size > 0 && FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place, size)->Allocate(size);
if (UNLIKELY(!platform::CUDAGraph::IsCapturing())) {
#endif
platform::CUDAPlace cuda_place(place.GetDeviceId());
return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
#ifdef PADDLE_WITH_CUDA
}
#endif

platform::CUDAPlace cuda_place(place.GetDeviceId());
return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
}
#endif

return m_->GetAllocator(place, size)->Allocate(size);
AllocationPtr allocation = m_->GetAllocator(place, size)->Allocate(size);
if (platform::is_gpu_place(place)) {
int dev_id = place.GetDeviceId();
int64_t alloc_size =
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id) + "_alloc_size",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

请问这里能做到通过 AllocatorFacade 分配内存等价于拿到具体的 Allocator 然后返回 Allocator->Allocate(size) 吗?后续 Tensor 计划不走 AllocatorFacade,而是直接传入具体的 Allocator

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

已另作讨论,这里采集显存数据的相关逻辑无法实现到具体的Allocator里,与pten直接获取Allocator对象后分配内存的设想不等价,之后pten的Alloc接口在获取Allocator分配内存后,也需要添加类似的数据采集逻辑。此处存在一些和Allocator以及Pten最初设计不太切合的修改,短期先同步后进行合入,不阻塞相关功能的开发,后续pten项目相关负责人员腾出时间后,再对类似的问题进行集中讨论和优化整改。 @phlrain @chenwhql @zhiqiu @jim19930609

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

按先期形成的共识，Allocator 分配逻辑的统一出口为 https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/memory/allocation/allocator.h#L142 。目前因为进度原因先行同意此合入，相关问题 @From00 后续处理。

allocation->size());
STAT_INT_UPDATE_MAXIMUM(
"STAT_gpu" + std::to_string(dev_id) + "_max_alloc_size", alloc_size);
}

return allocation;
}

uint64_t AllocatorFacade::Release(const platform::Place& place) {
Expand Down Expand Up @@ -933,12 +944,21 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
#endif

platform::CUDAPlace p(place.GetDeviceId());
AllocationPtr allocation;
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
->Allocate(size);
allocation = m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
->Allocate(size);
} else {
return m_->GetAllocator(p, size)->Allocate(size);
allocation = m_->GetAllocator(p, size)->Allocate(size);
}

int dev_id = p.GetDeviceId();
int64_t alloc_size = STAT_INT_ADD(
"STAT_gpu" + std::to_string(dev_id) + "_alloc_size", allocation->size());
STAT_INT_UPDATE_MAXIMUM(
"STAT_gpu" + std::to_string(dev_id) + "_max_alloc_size", alloc_size);

return allocation;
}

uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
Expand Down
6 changes: 4 additions & 2 deletions paddle/fluid/platform/device/gpu/gpu_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,10 @@ class RecordedGpuMallocHelper {
#endif
if (result == gpuSuccess) {
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);

int64_t mem_size = STAT_INT_ADD(
"STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
STAT_INT_UPDATE_MAXIMUM(
"STAT_gpu" + std::to_string(dev_id_) + "_max_mem_size", mem_size);
#ifdef PADDLE_WITH_TESTING
gpu_ptrs.insert(*ptr);
#endif
Expand Down
51 changes: 51 additions & 0 deletions paddle/fluid/platform/monitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,57 @@ DEFINE_INT_STATUS(STAT_gpu13_mem_size)
DEFINE_INT_STATUS(STAT_gpu14_mem_size)
DEFINE_INT_STATUS(STAT_gpu15_mem_size)

// Peak (high-watermark) of "STAT_gpuN_mem_size" per device id; raised via
// STAT_INT_UPDATE_MAXIMUM when memory is reserved from the driver
// (see RecordedGpuMallocHelper in gpu_info.cc).
DEFINE_INT_STATUS(STAT_gpu0_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu1_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu2_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu3_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu4_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu5_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu6_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu7_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu8_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu9_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu10_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu11_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu12_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu13_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu14_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu15_max_mem_size)

// Memory currently handed out by the allocator per device id; increased on
// AllocatorFacade::Alloc and decreased in Allocator::AllocationDeleter.
DEFINE_INT_STATUS(STAT_gpu0_alloc_size)
DEFINE_INT_STATUS(STAT_gpu1_alloc_size)
DEFINE_INT_STATUS(STAT_gpu2_alloc_size)
DEFINE_INT_STATUS(STAT_gpu3_alloc_size)
DEFINE_INT_STATUS(STAT_gpu4_alloc_size)
DEFINE_INT_STATUS(STAT_gpu5_alloc_size)
DEFINE_INT_STATUS(STAT_gpu6_alloc_size)
DEFINE_INT_STATUS(STAT_gpu7_alloc_size)
DEFINE_INT_STATUS(STAT_gpu8_alloc_size)
DEFINE_INT_STATUS(STAT_gpu9_alloc_size)
DEFINE_INT_STATUS(STAT_gpu10_alloc_size)
DEFINE_INT_STATUS(STAT_gpu11_alloc_size)
DEFINE_INT_STATUS(STAT_gpu12_alloc_size)
DEFINE_INT_STATUS(STAT_gpu13_alloc_size)
DEFINE_INT_STATUS(STAT_gpu14_alloc_size)
DEFINE_INT_STATUS(STAT_gpu15_alloc_size)

// Peak (high-watermark) of "STAT_gpuN_alloc_size" per device id; raised via
// STAT_INT_UPDATE_MAXIMUM at allocation time.
DEFINE_INT_STATUS(STAT_gpu0_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu1_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu2_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu3_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu4_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu5_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu6_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu7_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu8_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu9_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu10_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu11_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu12_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu13_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu14_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu15_max_alloc_size)

// For Ascend NPU
DEFINE_INT_STATUS(STAT_npu0_mem_size)
DEFINE_INT_STATUS(STAT_npu1_mem_size)
Expand Down
92 changes: 75 additions & 17 deletions paddle/fluid/platform/monitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ class StatValue : public MonitorRegistrar {
std::lock_guard<std::mutex> lock(mu_);
return v_ -= inc;
}
// Raises the stored stat to `value` if `value` is larger, under the lock;
// returns the (possibly updated) current value so callers can read the peak
// that resulted from this update.
T update_maximum(T value) {
  std::lock_guard<std::mutex> lock(mu_);
  if (v_ < value) {
    v_ = value;
  }
  return v_;
}
T reset(T value = 0) {
std::lock_guard<std::mutex> lock(mu_);
return v_ = value;
Expand Down Expand Up @@ -135,6 +139,10 @@ class StatRegistry {
paddle::platform::StatRegistry<int64_t>::Instance().get(item)->increase(t)
#define STAT_INT_SUB(item, t) \
paddle::platform::StatRegistry<int64_t>::Instance().get(item)->decrease(t)
// Sets the named int64 stat to max(current, t) and yields the resulting
// value; used to maintain peak-memory high-watermark stats.
#define STAT_INT_UPDATE_MAXIMUM(item, t) \
paddle::platform::StatRegistry<int64_t>::Instance() \
.get(item) \
->update_maximum(t)

#define STAT_FLOAT_ADD(item, t) \
paddle::platform::StatRegistry<float>::Instance().get(item)->increase(t)
Expand Down Expand Up @@ -170,23 +178,73 @@ class StatRegistry {
extern paddle::platform::StatValue<float> _##item; \
USE_STAT(item)

#define USE_GPU_MEM_STAT \
USE_INT_STAT(STAT_gpu0_mem_size); \
USE_INT_STAT(STAT_gpu1_mem_size); \
USE_INT_STAT(STAT_gpu2_mem_size); \
USE_INT_STAT(STAT_gpu3_mem_size); \
USE_INT_STAT(STAT_gpu4_mem_size); \
USE_INT_STAT(STAT_gpu5_mem_size); \
USE_INT_STAT(STAT_gpu6_mem_size); \
USE_INT_STAT(STAT_gpu7_mem_size); \
USE_INT_STAT(STAT_gpu8_mem_size); \
USE_INT_STAT(STAT_gpu9_mem_size); \
USE_INT_STAT(STAT_gpu10_mem_size); \
USE_INT_STAT(STAT_gpu11_mem_size); \
USE_INT_STAT(STAT_gpu12_mem_size); \
USE_INT_STAT(STAT_gpu13_mem_size); \
USE_INT_STAT(STAT_gpu14_mem_size); \
USE_INT_STAT(STAT_gpu15_mem_size)
// Declares (extern) the current and peak driver-reserved-memory stat objects
// for GPU devices 0-15, matching the DEFINE_INT_STATUS list in monitor.cc.
#define USE_GPU_MEM_STAT \
USE_INT_STAT(STAT_gpu0_mem_size); \
USE_INT_STAT(STAT_gpu1_mem_size); \
USE_INT_STAT(STAT_gpu2_mem_size); \
USE_INT_STAT(STAT_gpu3_mem_size); \
USE_INT_STAT(STAT_gpu4_mem_size); \
USE_INT_STAT(STAT_gpu5_mem_size); \
USE_INT_STAT(STAT_gpu6_mem_size); \
USE_INT_STAT(STAT_gpu7_mem_size); \
USE_INT_STAT(STAT_gpu8_mem_size); \
USE_INT_STAT(STAT_gpu9_mem_size); \
USE_INT_STAT(STAT_gpu10_mem_size); \
USE_INT_STAT(STAT_gpu11_mem_size); \
USE_INT_STAT(STAT_gpu12_mem_size); \
USE_INT_STAT(STAT_gpu13_mem_size); \
USE_INT_STAT(STAT_gpu14_mem_size); \
USE_INT_STAT(STAT_gpu15_mem_size); \
USE_INT_STAT(STAT_gpu0_max_mem_size); \
USE_INT_STAT(STAT_gpu1_max_mem_size); \
USE_INT_STAT(STAT_gpu2_max_mem_size); \
USE_INT_STAT(STAT_gpu3_max_mem_size); \
USE_INT_STAT(STAT_gpu4_max_mem_size); \
USE_INT_STAT(STAT_gpu5_max_mem_size); \
USE_INT_STAT(STAT_gpu6_max_mem_size); \
USE_INT_STAT(STAT_gpu7_max_mem_size); \
USE_INT_STAT(STAT_gpu8_max_mem_size); \
USE_INT_STAT(STAT_gpu9_max_mem_size); \
USE_INT_STAT(STAT_gpu10_max_mem_size); \
USE_INT_STAT(STAT_gpu11_max_mem_size); \
USE_INT_STAT(STAT_gpu12_max_mem_size); \
USE_INT_STAT(STAT_gpu13_max_mem_size); \
USE_INT_STAT(STAT_gpu14_max_mem_size); \
USE_INT_STAT(STAT_gpu15_max_mem_size)

// Declares (extern) the current and peak allocator-level allocation stat
// objects for GPU devices 0-15; pulled into allocator.h for CUDA/HIP builds.
#define USE_GPU_ALLOC_STAT \
USE_INT_STAT(STAT_gpu0_alloc_size); \
USE_INT_STAT(STAT_gpu1_alloc_size); \
USE_INT_STAT(STAT_gpu2_alloc_size); \
USE_INT_STAT(STAT_gpu3_alloc_size); \
USE_INT_STAT(STAT_gpu4_alloc_size); \
USE_INT_STAT(STAT_gpu5_alloc_size); \
USE_INT_STAT(STAT_gpu6_alloc_size); \
USE_INT_STAT(STAT_gpu7_alloc_size); \
USE_INT_STAT(STAT_gpu8_alloc_size); \
USE_INT_STAT(STAT_gpu9_alloc_size); \
USE_INT_STAT(STAT_gpu10_alloc_size); \
USE_INT_STAT(STAT_gpu11_alloc_size); \
USE_INT_STAT(STAT_gpu12_alloc_size); \
USE_INT_STAT(STAT_gpu13_alloc_size); \
USE_INT_STAT(STAT_gpu14_alloc_size); \
USE_INT_STAT(STAT_gpu15_alloc_size); \
USE_INT_STAT(STAT_gpu0_max_alloc_size); \
USE_INT_STAT(STAT_gpu1_max_alloc_size); \
USE_INT_STAT(STAT_gpu2_max_alloc_size); \
USE_INT_STAT(STAT_gpu3_max_alloc_size); \
USE_INT_STAT(STAT_gpu4_max_alloc_size); \
USE_INT_STAT(STAT_gpu5_max_alloc_size); \
USE_INT_STAT(STAT_gpu6_max_alloc_size); \
USE_INT_STAT(STAT_gpu7_max_alloc_size); \
USE_INT_STAT(STAT_gpu8_max_alloc_size); \
USE_INT_STAT(STAT_gpu9_max_alloc_size); \
USE_INT_STAT(STAT_gpu10_max_alloc_size); \
USE_INT_STAT(STAT_gpu11_max_alloc_size); \
USE_INT_STAT(STAT_gpu12_max_alloc_size); \
USE_INT_STAT(STAT_gpu13_max_alloc_size); \
USE_INT_STAT(STAT_gpu14_max_alloc_size); \
USE_INT_STAT(STAT_gpu15_max_alloc_size)

#define USE_NPU_MEM_STAT \
USE_INT_STAT(STAT_npu0_mem_size); \
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2558,6 +2558,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("op_support_gpu", OpSupportGPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("get_cuda_device_count", platform::GetGPUDeviceCount);
m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId);
m.def("cuda_empty_cache", [] {
for (int dev_id : platform::GetSelectedDevices()) {
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(
Expand Down
Loading