Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new APIs for GPU memory monitoring (max_memory_allocated, max_memory_reserved, memory_allocated, memory_reserved) #38657

Merged
merged 16 commits into from
Mar 30, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cc_library(allocator SRCS allocator.cc DEPS place)
cc_library(allocator SRCS allocator.cc DEPS place monitor)
cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
Expand Down
10 changes: 10 additions & 0 deletions paddle/fluid/memory/allocation/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,16 @@

#include "paddle/fluid/framework/inlined_vector.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/pten/core/allocator.h"

DECLARE_string(allocator_strategy);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
USE_GPU_ALLOC_STAT;
#endif

namespace paddle {
namespace memory {
namespace allocation {
Expand Down Expand Up @@ -142,6 +147,11 @@ using DecoratedAllocationPtr =
class Allocator : public pten::Allocator {
public:
static void AllocationDeleter(pten::Allocation* allocation) {
if (platform::is_gpu_place(allocation->place())) {
int dev_id = allocation->place().GetDeviceId();
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id) + "_alloc_size",
allocation->size());
}
Allocator* allocator =
static_cast<Allocation*>(allocation)->TopDecoratedAllocator();
allocator->Free(allocation);
Expand Down
38 changes: 29 additions & 9 deletions paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/retry_allocator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Expand Down Expand Up @@ -832,17 +833,27 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
size > 0 && FLAGS_use_system_allocator == false) {
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsCapturing())) {
return m_->GetAllocator(place, size)->Allocate(size);
if (UNLIKELY(!platform::CUDAGraph::IsCapturing())) {
#endif
platform::CUDAPlace cuda_place(place.GetDeviceId());
return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
#ifdef PADDLE_WITH_CUDA
}
#endif

platform::CUDAPlace cuda_place(place.GetDeviceId());
return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
}
#endif

return m_->GetAllocator(place, size)->Allocate(size);
AllocationPtr allocation = m_->GetAllocator(place, size)->Allocate(size);
if (platform::is_gpu_place(place)) {
int dev_id = place.GetDeviceId();
int64_t alloc_size =
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id) + "_alloc_size",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

请问这里能做到通过 AllocatorFacade 分配内存等价于拿到具体的 Allocator 然后返回 Allocator->Allocate(size) 吗?后续 Tensor 计划不走 AllocatorFacade,而是直接传入具体的 Allocator

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

已另作讨论,这里采集显存数据的相关逻辑无法实现到具体的Allocator里,与pten直接获取Allocator对象后分配内存的设想不等价,之后pten的Alloc接口在获取Allocator分配内存后,也需要添加类似的数据采集逻辑。此处存在一些和Allocator以及Pten最初设计不太切合的修改,短期先同步后进行合入,不阻塞相关功能的开发,后续pten项目相关负责人员腾出时间后,再对类似的问题进行集中讨论和优化整改。 @phlrain @chenwhql @zhiqiu @jim19930609

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

按先期形成的共识，Allocator 分配逻辑的统一出口为 https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/memory/allocation/allocator.h#L142 。目前因为进度原因先行同意此合入，相关问题 @From00 后续处理。

allocation->size());
STAT_INT_UPDATE_MAXIMUM(
"STAT_gpu" + std::to_string(dev_id) + "_max_alloc_size", alloc_size);
}

return allocation;
}

uint64_t AllocatorFacade::Release(const platform::Place& place) {
Expand Down Expand Up @@ -933,12 +944,21 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
#endif

platform::CUDAPlace p(place.GetDeviceId());
AllocationPtr allocation;
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
->Allocate(size);
allocation = m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
->Allocate(size);
} else {
return m_->GetAllocator(p, size)->Allocate(size);
allocation = m_->GetAllocator(p, size)->Allocate(size);
}

int dev_id = p.GetDeviceId();
int64_t alloc_size = STAT_INT_ADD(
"STAT_gpu" + std::to_string(dev_id) + "_alloc_size", allocation->size());
STAT_INT_UPDATE_MAXIMUM(
"STAT_gpu" + std::to_string(dev_id) + "_max_alloc_size", alloc_size);

return allocation;
}

uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place,
Expand Down
6 changes: 4 additions & 2 deletions paddle/fluid/platform/device/gpu/gpu_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,10 @@ class RecordedGpuMallocHelper {
#endif
if (result == gpuSuccess) {
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);

int64_t mem_size = STAT_INT_ADD(
"STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
STAT_INT_UPDATE_MAXIMUM(
"STAT_gpu" + std::to_string(dev_id_) + "_max_mem_size", mem_size);
#ifdef PADDLE_WITH_TESTING
gpu_ptrs.insert(*ptr);
#endif
Expand Down
51 changes: 51 additions & 0 deletions paddle/fluid/platform/monitor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,57 @@ DEFINE_INT_STATUS(STAT_gpu13_mem_size)
DEFINE_INT_STATUS(STAT_gpu14_mem_size)
DEFINE_INT_STATUS(STAT_gpu15_mem_size)

// Peak (high-watermark) of "STAT_gpuN_mem_size" per device id; raised via
// STAT_INT_UPDATE_MAXIMUM when memory is reserved from the driver
// (see RecordedGpuMallocHelper in gpu_info.cc).
DEFINE_INT_STATUS(STAT_gpu0_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu1_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu2_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu3_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu4_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu5_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu6_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu7_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu8_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu9_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu10_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu11_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu12_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu13_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu14_max_mem_size)
DEFINE_INT_STATUS(STAT_gpu15_max_mem_size)

// Memory currently handed out by the allocator per device id; increased on
// AllocatorFacade::Alloc and decreased in Allocator::AllocationDeleter.
DEFINE_INT_STATUS(STAT_gpu0_alloc_size)
DEFINE_INT_STATUS(STAT_gpu1_alloc_size)
DEFINE_INT_STATUS(STAT_gpu2_alloc_size)
DEFINE_INT_STATUS(STAT_gpu3_alloc_size)
DEFINE_INT_STATUS(STAT_gpu4_alloc_size)
DEFINE_INT_STATUS(STAT_gpu5_alloc_size)
DEFINE_INT_STATUS(STAT_gpu6_alloc_size)
DEFINE_INT_STATUS(STAT_gpu7_alloc_size)
DEFINE_INT_STATUS(STAT_gpu8_alloc_size)
DEFINE_INT_STATUS(STAT_gpu9_alloc_size)
DEFINE_INT_STATUS(STAT_gpu10_alloc_size)
DEFINE_INT_STATUS(STAT_gpu11_alloc_size)
DEFINE_INT_STATUS(STAT_gpu12_alloc_size)
DEFINE_INT_STATUS(STAT_gpu13_alloc_size)
DEFINE_INT_STATUS(STAT_gpu14_alloc_size)
DEFINE_INT_STATUS(STAT_gpu15_alloc_size)

// Peak (high-watermark) of "STAT_gpuN_alloc_size" per device id; raised via
// STAT_INT_UPDATE_MAXIMUM at allocation time.
DEFINE_INT_STATUS(STAT_gpu0_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu1_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu2_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu3_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu4_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu5_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu6_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu7_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu8_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu9_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu10_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu11_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu12_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu13_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu14_max_alloc_size)
DEFINE_INT_STATUS(STAT_gpu15_max_alloc_size)

// For Ascend NPU
DEFINE_INT_STATUS(STAT_npu0_mem_size)
DEFINE_INT_STATUS(STAT_npu1_mem_size)
Expand Down
92 changes: 75 additions & 17 deletions paddle/fluid/platform/monitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ class StatValue : public MonitorRegistrar {
std::lock_guard<std::mutex> lock(mu_);
return v_ -= inc;
}
// Raises the stored stat to `value` if `value` is larger, under the lock;
// returns the (possibly updated) current value so callers can read the peak
// that resulted from this update.
T update_maximum(T value) {
  std::lock_guard<std::mutex> lock(mu_);
  if (v_ < value) {
    v_ = value;
  }
  return v_;
}
T reset(T value = 0) {
std::lock_guard<std::mutex> lock(mu_);
return v_ = value;
Expand Down Expand Up @@ -135,6 +139,10 @@ class StatRegistry {
paddle::platform::StatRegistry<int64_t>::Instance().get(item)->increase(t)
#define STAT_INT_SUB(item, t) \
paddle::platform::StatRegistry<int64_t>::Instance().get(item)->decrease(t)
// Sets the named int64 stat to max(current, t) and yields the resulting
// value; used to maintain peak-memory high-watermark stats.
#define STAT_INT_UPDATE_MAXIMUM(item, t) \
paddle::platform::StatRegistry<int64_t>::Instance() \
.get(item) \
->update_maximum(t)

#define STAT_FLOAT_ADD(item, t) \
paddle::platform::StatRegistry<float>::Instance().get(item)->increase(t)
Expand Down Expand Up @@ -170,23 +178,73 @@ class StatRegistry {
extern paddle::platform::StatValue<float> _##item; \
USE_STAT(item)

#define USE_GPU_MEM_STAT \
USE_INT_STAT(STAT_gpu0_mem_size); \
USE_INT_STAT(STAT_gpu1_mem_size); \
USE_INT_STAT(STAT_gpu2_mem_size); \
USE_INT_STAT(STAT_gpu3_mem_size); \
USE_INT_STAT(STAT_gpu4_mem_size); \
USE_INT_STAT(STAT_gpu5_mem_size); \
USE_INT_STAT(STAT_gpu6_mem_size); \
USE_INT_STAT(STAT_gpu7_mem_size); \
USE_INT_STAT(STAT_gpu8_mem_size); \
USE_INT_STAT(STAT_gpu9_mem_size); \
USE_INT_STAT(STAT_gpu10_mem_size); \
USE_INT_STAT(STAT_gpu11_mem_size); \
USE_INT_STAT(STAT_gpu12_mem_size); \
USE_INT_STAT(STAT_gpu13_mem_size); \
USE_INT_STAT(STAT_gpu14_mem_size); \
USE_INT_STAT(STAT_gpu15_mem_size)
// Declares (extern) the current and peak driver-reserved-memory stat objects
// for GPU devices 0-15, matching the DEFINE_INT_STATUS list in monitor.cc.
#define USE_GPU_MEM_STAT \
USE_INT_STAT(STAT_gpu0_mem_size); \
USE_INT_STAT(STAT_gpu1_mem_size); \
USE_INT_STAT(STAT_gpu2_mem_size); \
USE_INT_STAT(STAT_gpu3_mem_size); \
USE_INT_STAT(STAT_gpu4_mem_size); \
USE_INT_STAT(STAT_gpu5_mem_size); \
USE_INT_STAT(STAT_gpu6_mem_size); \
USE_INT_STAT(STAT_gpu7_mem_size); \
USE_INT_STAT(STAT_gpu8_mem_size); \
USE_INT_STAT(STAT_gpu9_mem_size); \
USE_INT_STAT(STAT_gpu10_mem_size); \
USE_INT_STAT(STAT_gpu11_mem_size); \
USE_INT_STAT(STAT_gpu12_mem_size); \
USE_INT_STAT(STAT_gpu13_mem_size); \
USE_INT_STAT(STAT_gpu14_mem_size); \
USE_INT_STAT(STAT_gpu15_mem_size); \
USE_INT_STAT(STAT_gpu0_max_mem_size); \
USE_INT_STAT(STAT_gpu1_max_mem_size); \
USE_INT_STAT(STAT_gpu2_max_mem_size); \
USE_INT_STAT(STAT_gpu3_max_mem_size); \
USE_INT_STAT(STAT_gpu4_max_mem_size); \
USE_INT_STAT(STAT_gpu5_max_mem_size); \
USE_INT_STAT(STAT_gpu6_max_mem_size); \
USE_INT_STAT(STAT_gpu7_max_mem_size); \
USE_INT_STAT(STAT_gpu8_max_mem_size); \
USE_INT_STAT(STAT_gpu9_max_mem_size); \
USE_INT_STAT(STAT_gpu10_max_mem_size); \
USE_INT_STAT(STAT_gpu11_max_mem_size); \
USE_INT_STAT(STAT_gpu12_max_mem_size); \
USE_INT_STAT(STAT_gpu13_max_mem_size); \
USE_INT_STAT(STAT_gpu14_max_mem_size); \
USE_INT_STAT(STAT_gpu15_max_mem_size)

// Declares (extern) the current and peak allocator-level allocation stat
// objects for GPU devices 0-15; pulled into allocator.h for CUDA/HIP builds.
#define USE_GPU_ALLOC_STAT \
USE_INT_STAT(STAT_gpu0_alloc_size); \
USE_INT_STAT(STAT_gpu1_alloc_size); \
USE_INT_STAT(STAT_gpu2_alloc_size); \
USE_INT_STAT(STAT_gpu3_alloc_size); \
USE_INT_STAT(STAT_gpu4_alloc_size); \
USE_INT_STAT(STAT_gpu5_alloc_size); \
USE_INT_STAT(STAT_gpu6_alloc_size); \
USE_INT_STAT(STAT_gpu7_alloc_size); \
USE_INT_STAT(STAT_gpu8_alloc_size); \
USE_INT_STAT(STAT_gpu9_alloc_size); \
USE_INT_STAT(STAT_gpu10_alloc_size); \
USE_INT_STAT(STAT_gpu11_alloc_size); \
USE_INT_STAT(STAT_gpu12_alloc_size); \
USE_INT_STAT(STAT_gpu13_alloc_size); \
USE_INT_STAT(STAT_gpu14_alloc_size); \
USE_INT_STAT(STAT_gpu15_alloc_size); \
USE_INT_STAT(STAT_gpu0_max_alloc_size); \
USE_INT_STAT(STAT_gpu1_max_alloc_size); \
USE_INT_STAT(STAT_gpu2_max_alloc_size); \
USE_INT_STAT(STAT_gpu3_max_alloc_size); \
USE_INT_STAT(STAT_gpu4_max_alloc_size); \
USE_INT_STAT(STAT_gpu5_max_alloc_size); \
USE_INT_STAT(STAT_gpu6_max_alloc_size); \
USE_INT_STAT(STAT_gpu7_max_alloc_size); \
USE_INT_STAT(STAT_gpu8_max_alloc_size); \
USE_INT_STAT(STAT_gpu9_max_alloc_size); \
USE_INT_STAT(STAT_gpu10_max_alloc_size); \
USE_INT_STAT(STAT_gpu11_max_alloc_size); \
USE_INT_STAT(STAT_gpu12_max_alloc_size); \
USE_INT_STAT(STAT_gpu13_max_alloc_size); \
USE_INT_STAT(STAT_gpu14_max_alloc_size); \
USE_INT_STAT(STAT_gpu15_max_alloc_size)

#define USE_NPU_MEM_STAT \
USE_INT_STAT(STAT_npu0_mem_size); \
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/pybind/pybind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2558,6 +2558,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("op_support_gpu", OpSupportGPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("get_cuda_device_count", platform::GetGPUDeviceCount);
m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId);
m.def("cuda_empty_cache", [] {
for (int dev_id : platform::GetSelectedDevices()) {
auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(
Expand Down
Loading