From 75fae21bddf7d32da7a3a31c41e0b261252eca23 Mon Sep 17 00:00:00 2001
From: qingshui <qshuihu@gmail.com>
Date: Tue, 14 Dec 2021 22:08:33 +0800
Subject: [PATCH 1/2] add sample gpu memory pool

---
 .../memory/allocation/allocator_facade.cc     |  21 +++
 .../memory/allocation/allocator_strategy.cc   |   4 +-
 .../memory/allocation/allocator_strategy.h    |   7 +-
 .../memory/allocation/retry_allocator.cc      | 144 ++++++++++++++++++
 .../fluid/memory/allocation/retry_allocator.h |  66 +++++++-
 5 files changed, 236 insertions(+), 6 deletions(-)
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 03c252909d923..cc7928adabf2c 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -115,6 +115,23 @@ class AllocatorFacadePrivate {
         break;
       }
 
+      case AllocatorStrategy::kSamplePool: {
+        InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_XPU
+        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
+          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
+        }
+#endif
+#ifdef PADDLE_WITH_CUDA
+        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
+             ++dev_id) {
+          InitSampleCUDAAllocator(platform::CUDAPlace(dev_id));
+        }
+        InitNaiveBestFitCUDAPinnedAllocator();
+#endif
+        break;
+      }
+
       default: {
         PADDLE_THROW(platform::errors::InvalidArgument(
             "Unsupported allocator strategy: %d", static_cast<int>(strategy)));
@@ -188,6 +205,10 @@ class AllocatorFacadePrivate {
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
         cuda_allocator, platform::GpuMinChunkSize());
   }
+  void InitSampleCUDAAllocator(platform::CUDAPlace p) {
+    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    allocators_[p] = std::make_shared<SampleAllocator>(cuda_allocator);
+  }
 #endif
 
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index 518b31e943048..e6f03b49c4add 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -35,7 +35,9 @@ static AllocatorStrategy GetStrategyFromFlag() {
   if (FLAGS_allocator_strategy == "thread_local") {
     return AllocatorStrategy::kThreadLocal;
   }
-
+  if (FLAGS_allocator_strategy == "sample_pool") {
+    return AllocatorStrategy::kSamplePool;
+  }
   PADDLE_THROW(platform::errors::InvalidArgument(
       "Unsupported allocator strategy: %s, condicates are naive_best_fit, "
       "auto_growth or thread_local.",
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.h b/paddle/fluid/memory/allocation/allocator_strategy.h
index 0db9d93e3e645..b1d9129cee3a5 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.h
+++ b/paddle/fluid/memory/allocation/allocator_strategy.h
@@ -18,7 +18,12 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth, kThreadLocal };
+enum class AllocatorStrategy {
+  kNaiveBestFit,
+  kAutoGrowth,
+  kThreadLocal,
+  kSamplePool
+};
 
 extern AllocatorStrategy GetAllocatorStrategy();
 
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index ae6af53241dfe..79e428b99ce30 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -14,6 +14,11 @@
 
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 
+DEFINE_int32(sample_max_bin_bytes, 2048, "sample max bytes in pool MB");
+DEFINE_int32(sample_bin_growth, 2, "sample growth memory by bin");
+DEFINE_int32(sample_min_bin, 8, "sample min bin number");
+DEFINE_bool(sample_debug_info, false, "sample print debug info");
+
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -94,6 +99,145 @@ Allocation* RetryAllocator::AllocateImpl(size_t size) {
   }
 }
 
+static const unsigned int INVALID_BIN = (unsigned int)-1;
+SampleAllocator::BlockDescriptor::BlockDescriptor(Allocation* ptr)
+    : d_ptr(ptr), bytes(0), used(0), bin(INVALID_BIN) {}
+SampleAllocator::BlockDescriptor::BlockDescriptor()
+    : d_ptr(NULL), bytes(0), used(0), bin(INVALID_BIN) {}
+bool SampleAllocator::BlockDescriptor::ptrcompare(const BlockDescriptor& a,
+                                                  const BlockDescriptor& b) {
+  return (a.d_ptr < b.d_ptr);
+}
+bool SampleAllocator::BlockDescriptor::sizecompare(const BlockDescriptor& a,
+                                                   const BlockDescriptor& b) {
+  return (a.bytes < b.bytes);
+}
+SampleAllocator::SampleAllocator(std::shared_ptr<Allocator> allocator)
+    : allocator_(std::move(allocator)),
+      bin_growth_(FLAGS_sample_bin_growth),
+      min_bin_(FLAGS_sample_min_bin),
+      min_bin_bytes_(pow(bin_growth_, min_bin_)),
+      max_bin_bytes_(FLAGS_sample_max_bin_bytes * 1024 * 1024),
+      cached_blocks_(BlockDescriptor::sizecompare),
+      live_blocks_(BlockDescriptor::ptrcompare) {
+  PADDLE_ENFORCE_NOT_NULL(
+      allocator_, platform::errors::InvalidArgument(
+                      "Underlying allocator of SampleAllocator is NULL"));
+  VLOG(0) << "SampleAllocator init";
+}
+void SampleAllocator::FreeImpl(Allocation* allocation) {
+  if (allocation == NULL) {
+    return;
+  }
+  bool recached = false;
+  BlockDescriptor search_key(allocation);
+
+  mutex_.lock();
+  auto block_itr = live_blocks_.find(search_key);
+  if (block_itr != live_blocks_.end()) {
+    search_key = *block_itr;
+    live_blocks_.erase(block_itr);
+    cached_bytes_.live -= search_key.bytes;
+    cached_bytes_.used -= search_key.used;
+    if (search_key.bin != INVALID_BIN) {
+      recached = true;
+      cached_blocks_.insert(search_key);
+      cached_bytes_.free += search_key.bytes;
+    }
+  }
+  mutex_.unlock();
+
+  if (!recached) {
+    allocator_->Free(allocation);
+  }
+
+  if (FLAGS_sample_debug_info && search_key.bin == INVALID_BIN) {
+    VLOG(0) << "pool total: " << (cached_bytes_.live >> 20)
+            << "MB, used: " << (cached_bytes_.used >> 20) << "MB, free"
+            << (cached_bytes_.free >> 20)
+            << "MB, free big memory: " << search_key.bytes << " bytes";
+  }
+}
+// alloc memory
+Allocation* SampleAllocator::AllocateImpl(size_t bytes) {
+  // Create a block descriptor for the requested allocation
+  bool found = false;
+  BlockDescriptor search_key;
+  search_key.used = bytes;
+  if (bytes > max_bin_bytes_) {
+    search_key.bin = INVALID_BIN;
+    search_key.bytes = bytes;
+  } else {
+    if (bytes < min_bin_bytes_) {
+      search_key.bin = min_bin_;
+      search_key.bytes = min_bin_bytes_;
+    } else {
+      search_key.bin = 0;
+      search_key.bytes = 1;
+      while (search_key.bytes < bytes) {
+        search_key.bytes *= bin_growth_;
+        ++search_key.bin;
+      }
+    }
+    mutex_.lock();
+    auto block_itr = cached_blocks_.lower_bound(search_key);
+    if ((block_itr != cached_blocks_.end()) &&
+        (block_itr->bin == search_key.bin)) {
+      found = true;
+      search_key = *block_itr;
+      search_key.used = bytes;
+      live_blocks_.insert(search_key);
+      // Remove from free blocks
+      cached_bytes_.free -= search_key.bytes;
+      cached_bytes_.live += search_key.bytes;
+      cached_bytes_.used += search_key.used;
+      cached_blocks_.erase(block_itr);
+    }
+    mutex_.unlock();
+  }
+  if (!found) {
+    try {
+      search_key.d_ptr = allocator_->Allocate(search_key.bytes).release();
+    } catch (BadAlloc&) {
+      // release all free cache
+      FreeAllCache();
+      // cuda malloc
+      search_key.d_ptr = allocator_->Allocate(search_key.bytes).release();
+    } catch (...) {
+      throw;
+    }
+    mutex_.lock();
+    live_blocks_.insert(search_key);
+    cached_bytes_.live += search_key.bytes;
+    cached_bytes_.used += search_key.used;
+    mutex_.unlock();
+
+    if (FLAGS_sample_debug_info && search_key.bin == INVALID_BIN) {
+      VLOG(0) << "pool total: " << (cached_bytes_.live >> 20)
+              << "MB, used: " << (cached_bytes_.used >> 20) << "MB, free"
+              << (cached_bytes_.free >> 20)
+              << "MB, cuda alloc big memory: " << bytes << " bytes";
+    }
+  }
+  return search_key.d_ptr;
+}
+void SampleAllocator::FreeAllCache(void) {
+  mutex_.lock();
+  if (cached_blocks_.empty()) {
+    mutex_.unlock();
+    return;
+  }
+  while (!cached_blocks_.empty()) {
+    auto begin = cached_blocks_.begin();
+    allocator_->Free(begin->d_ptr);
+    cached_bytes_.free -= begin->bytes;
+    cached_blocks_.erase(begin);
+  }
+  mutex_.unlock();
+}
+
+void SampleAllocator::GetMemInfo(TotalBytes* info) { *info = cached_bytes_; }
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 031a5e2b97f17..557cb715fabe9 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -19,8 +19,8 @@
 #include <condition_variable>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
+#include <set>
 #include <utility>
-
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -45,9 +45,9 @@ class RetryAllocator : public Allocator {
   bool IsAllocThreadSafe() const override { return true; }
 
  protected:
-  void FreeImpl(Allocation* allocation) override;
-  Allocation* AllocateImpl(size_t size) override;
-  uint64_t ReleaseImpl(const platform::Place& place) override {
+  void FreeImpl(Allocation *allocation) override;
+  Allocation *AllocateImpl(size_t size) override;
+  uint64_t ReleaseImpl(const platform::Place &place) override {
     return underlying_allocator_->Release(place);
   }
 
@@ -60,6 +60,64 @@ class RetryAllocator : public Allocator {
   std::atomic<size_t> waited_allocate_size_{0};
 };
 
+class SampleAllocator : public Allocator {
+  /**
+   * Descriptor for device memory allocations
+   */
+  struct BlockDescriptor {
+    Allocation *d_ptr;  // Device pointer
+    size_t bytes;       // Size of allocation in bytes
+    size_t used;        // Real used
+    unsigned int bin;   // Bin enumeration
+
+    explicit BlockDescriptor(Allocation *ptr);
+    BlockDescriptor();
+    static bool ptrcompare(const BlockDescriptor &a, const BlockDescriptor &b);
+    static bool sizecompare(const BlockDescriptor &a, const BlockDescriptor &b);
+  };
+  // BlockDescriptor comparator function interface
+  typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+  /// Set type for cached blocks (ordered by size)
+  typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+  /// Set type for live blocks (ordered by ptr)
+  typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+ public:
+  // Total Bytes
+  struct TotalBytes {
+    size_t free = 0;
+    size_t live = 0;
+    size_t used = 0;
+  };
+  explicit SampleAllocator(std::shared_ptr<Allocator> allocator);
+  bool IsAllocThreadSafe() const override { return true; }
+  void GetMemInfo(TotalBytes *info);
+
+ protected:
+  void FreeImpl(Allocation *allocation) override;
+  Allocation *AllocateImpl(size_t size) override;
+  uint64_t ReleaseImpl(const platform::Place &place) override {
+    FreeAllCache();
+    return allocator_->Release(place);
+  }
+  void FreeAllCache(void);
+
+ private:
+  std::shared_ptr<Allocator> allocator_;
+  std::mutex mutex_;
+
+  unsigned int bin_growth_;  /// Geometric growth factor for bin-sizes
+  unsigned int min_bin_;     /// Minimum bin enumeration
+  size_t min_bin_bytes_;
+  size_t max_bin_bytes_;
+
+  TotalBytes cached_bytes_;  /// Map of device ordinal to aggregate cached bytes
+                             /// on that device
+  CachedBlocks
+      cached_blocks_;  /// Set of cached device allocations available for reuse
+  BusyBlocks live_blocks_;
+};
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle

From 30310f1c78571f18d8d6872467e916b20fe4115a Mon Sep 17 00:00:00 2001
From: qingshui <qshuihu@gmail.com>
Date: Tue, 14 Dec 2021 22:08:33 +0800
Subject: [PATCH 2/2] add sample gpu memory pool

---
 .../memory/allocation/allocator_facade.cc     |  24 ++-
 .../memory/allocation/allocator_strategy.cc   |   4 +-
 .../memory/allocation/allocator_strategy.h    |   7 +-
 .../memory/allocation/retry_allocator.cc      | 144 ++++++++++++++++++
 .../fluid/memory/allocation/retry_allocator.h |  66 +++++++-
 python/paddle/fluid/__init__.py               |   4 +
 6 files changed, 242 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 03c252909d923..5076fc19e078b 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -115,6 +115,23 @@ class AllocatorFacadePrivate {
         break;
       }
 
+      case AllocatorStrategy::kSamplePool: {
+        InitNaiveBestFitCPUAllocator();
+#ifdef PADDLE_WITH_XPU
+        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
+          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
+        }
+#endif
+#ifdef PADDLE_WITH_CUDA
+        for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
+             ++dev_id) {
+          InitSampleCUDAAllocator(platform::CUDAPlace(dev_id));
+        }
+        InitNaiveBestFitCUDAPinnedAllocator();
+#endif
+        break;
+      }
+
       default: {
         PADDLE_THROW(platform::errors::InvalidArgument(
             "Unsupported allocator strategy: %d", static_cast<int>(strategy)));
@@ -123,7 +140,8 @@ class AllocatorFacadePrivate {
     InitZeroSizeAllocators();
     InitSystemAllocators();
 
-    if (FLAGS_gpu_allocator_retry_time > 0) {
+    if (strategy != AllocatorStrategy::kSamplePool &&
+        FLAGS_gpu_allocator_retry_time > 0) {
       WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
     }
 
@@ -188,6 +206,10 @@ class AllocatorFacadePrivate {
     allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
         cuda_allocator, platform::GpuMinChunkSize());
   }
+  void InitSampleCUDAAllocator(platform::CUDAPlace p) {
+    auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
+    allocators_[p] = std::make_shared<SampleAllocator>(cuda_allocator);
+  }
 #endif
 
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc
index 518b31e943048..e6f03b49c4add 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.cc
+++ b/paddle/fluid/memory/allocation/allocator_strategy.cc
@@ -35,7 +35,9 @@ static AllocatorStrategy GetStrategyFromFlag() {
   if (FLAGS_allocator_strategy == "thread_local") {
     return AllocatorStrategy::kThreadLocal;
   }
-
+  if (FLAGS_allocator_strategy == "sample_pool") {
+    return AllocatorStrategy::kSamplePool;
+  }
   PADDLE_THROW(platform::errors::InvalidArgument(
       "Unsupported allocator strategy: %s, condicates are naive_best_fit, "
       "auto_growth or thread_local.",
diff --git a/paddle/fluid/memory/allocation/allocator_strategy.h b/paddle/fluid/memory/allocation/allocator_strategy.h
index 0db9d93e3e645..b1d9129cee3a5 100644
--- a/paddle/fluid/memory/allocation/allocator_strategy.h
+++ b/paddle/fluid/memory/allocation/allocator_strategy.h
@@ -18,7 +18,12 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth, kThreadLocal };
+enum class AllocatorStrategy {
+  kNaiveBestFit,
+  kAutoGrowth,
+  kThreadLocal,
+  kSamplePool
+};
 
 extern AllocatorStrategy GetAllocatorStrategy();
 
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index ae6af53241dfe..79e428b99ce30 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -14,6 +14,11 @@
 
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
 
+DEFINE_int32(sample_max_bin_bytes, 2048, "sample max bytes in pool MB");
+DEFINE_int32(sample_bin_growth, 2, "sample growth memory by bin");
+DEFINE_int32(sample_min_bin, 8, "sample min bin number");
+DEFINE_bool(sample_debug_info, false, "sample print debug info");
+
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -94,6 +99,145 @@ Allocation* RetryAllocator::AllocateImpl(size_t size) {
   }
 }
 
+static const unsigned int INVALID_BIN = (unsigned int)-1;
+SampleAllocator::BlockDescriptor::BlockDescriptor(Allocation* ptr)
+    : d_ptr(ptr), bytes(0), used(0), bin(INVALID_BIN) {}
+SampleAllocator::BlockDescriptor::BlockDescriptor()
+    : d_ptr(NULL), bytes(0), used(0), bin(INVALID_BIN) {}
+bool SampleAllocator::BlockDescriptor::ptrcompare(const BlockDescriptor& a,
+                                                  const BlockDescriptor& b) {
+  return (a.d_ptr < b.d_ptr);
+}
+bool SampleAllocator::BlockDescriptor::sizecompare(const BlockDescriptor& a,
+                                                   const BlockDescriptor& b) {
+  return (a.bytes < b.bytes);
+}
+SampleAllocator::SampleAllocator(std::shared_ptr<Allocator> allocator)
+    : allocator_(std::move(allocator)),
+      bin_growth_(FLAGS_sample_bin_growth),
+      min_bin_(FLAGS_sample_min_bin),
+      min_bin_bytes_(pow(bin_growth_, min_bin_)),
+      max_bin_bytes_(FLAGS_sample_max_bin_bytes * 1024 * 1024),
+      cached_blocks_(BlockDescriptor::sizecompare),
+      live_blocks_(BlockDescriptor::ptrcompare) {
+  PADDLE_ENFORCE_NOT_NULL(
+      allocator_, platform::errors::InvalidArgument(
+                      "Underlying allocator of SampleAllocator is NULL"));
+  VLOG(0) << "SampleAllocator init";
+}
+void SampleAllocator::FreeImpl(Allocation* allocation) {
+  if (allocation == NULL) {
+    return;
+  }
+  bool recached = false;
+  BlockDescriptor search_key(allocation);
+
+  mutex_.lock();
+  auto block_itr = live_blocks_.find(search_key);
+  if (block_itr != live_blocks_.end()) {
+    search_key = *block_itr;
+    live_blocks_.erase(block_itr);
+    cached_bytes_.live -= search_key.bytes;
+    cached_bytes_.used -= search_key.used;
+    if (search_key.bin != INVALID_BIN) {
+      recached = true;
+      cached_blocks_.insert(search_key);
+      cached_bytes_.free += search_key.bytes;
+    }
+  }
+  mutex_.unlock();
+
+  if (!recached) {
+    allocator_->Free(allocation);
+  }
+
+  if (FLAGS_sample_debug_info && search_key.bin == INVALID_BIN) {
+    VLOG(0) << "pool total: " << (cached_bytes_.live >> 20)
+            << "MB, used: " << (cached_bytes_.used >> 20) << "MB, free"
+            << (cached_bytes_.free >> 20)
+            << "MB, free big memory: " << search_key.bytes << " bytes";
+  }
+}
+// alloc memory
+Allocation* SampleAllocator::AllocateImpl(size_t bytes) {
+  // Create a block descriptor for the requested allocation
+  bool found = false;
+  BlockDescriptor search_key;
+  search_key.used = bytes;
+  if (bytes > max_bin_bytes_) {
+    search_key.bin = INVALID_BIN;
+    search_key.bytes = bytes;
+  } else {
+    if (bytes < min_bin_bytes_) {
+      search_key.bin = min_bin_;
+      search_key.bytes = min_bin_bytes_;
+    } else {
+      search_key.bin = 0;
+      search_key.bytes = 1;
+      while (search_key.bytes < bytes) {
+        search_key.bytes *= bin_growth_;
+        ++search_key.bin;
+      }
+    }
+    mutex_.lock();
+    auto block_itr = cached_blocks_.lower_bound(search_key);
+    if ((block_itr != cached_blocks_.end()) &&
+        (block_itr->bin == search_key.bin)) {
+      found = true;
+      search_key = *block_itr;
+      search_key.used = bytes;
+      live_blocks_.insert(search_key);
+      // Remove from free blocks
+      cached_bytes_.free -= search_key.bytes;
+      cached_bytes_.live += search_key.bytes;
+      cached_bytes_.used += search_key.used;
+      cached_blocks_.erase(block_itr);
+    }
+    mutex_.unlock();
+  }
+  if (!found) {
+    try {
+      search_key.d_ptr = allocator_->Allocate(search_key.bytes).release();
+    } catch (BadAlloc&) {
+      // release all free cache
+      FreeAllCache();
+      // cuda malloc
+      search_key.d_ptr = allocator_->Allocate(search_key.bytes).release();
+    } catch (...) {
+      throw;
+    }
+    mutex_.lock();
+    live_blocks_.insert(search_key);
+    cached_bytes_.live += search_key.bytes;
+    cached_bytes_.used += search_key.used;
+    mutex_.unlock();
+
+    if (FLAGS_sample_debug_info && search_key.bin == INVALID_BIN) {
+      VLOG(0) << "pool total: " << (cached_bytes_.live >> 20)
+              << "MB, used: " << (cached_bytes_.used >> 20) << "MB, free"
+              << (cached_bytes_.free >> 20)
+              << "MB, cuda alloc big memory: " << bytes << " bytes";
+    }
+  }
+  return search_key.d_ptr;
+}
+void SampleAllocator::FreeAllCache(void) {
+  mutex_.lock();
+  if (cached_blocks_.empty()) {
+    mutex_.unlock();
+    return;
+  }
+  while (!cached_blocks_.empty()) {
+    auto begin = cached_blocks_.begin();
+    allocator_->Free(begin->d_ptr);
+    cached_bytes_.free -= begin->bytes;
+    cached_blocks_.erase(begin);
+  }
+  mutex_.unlock();
+}
+
+void SampleAllocator::GetMemInfo(TotalBytes* info) { *info = cached_bytes_; }
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 031a5e2b97f17..557cb715fabe9 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -19,8 +19,8 @@
 #include <condition_variable>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
+#include <set>
 #include <utility>
-
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -45,9 +45,9 @@ class RetryAllocator : public Allocator {
   bool IsAllocThreadSafe() const override { return true; }
 
  protected:
-  void FreeImpl(Allocation* allocation) override;
-  Allocation* AllocateImpl(size_t size) override;
-  uint64_t ReleaseImpl(const platform::Place& place) override {
+  void FreeImpl(Allocation *allocation) override;
+  Allocation *AllocateImpl(size_t size) override;
+  uint64_t ReleaseImpl(const platform::Place &place) override {
     return underlying_allocator_->Release(place);
   }
 
@@ -60,6 +60,64 @@ class RetryAllocator : public Allocator {
   std::atomic<size_t> waited_allocate_size_{0};
 };
 
+class SampleAllocator : public Allocator {
+  /**
+   * Descriptor for device memory allocations
+   */
+  struct BlockDescriptor {
+    Allocation *d_ptr;  // Device pointer
+    size_t bytes;       // Size of allocation in bytes
+    size_t used;        // Real used
+    unsigned int bin;   // Bin enumeration
+
+    explicit BlockDescriptor(Allocation *ptr);
+    BlockDescriptor();
+    static bool ptrcompare(const BlockDescriptor &a, const BlockDescriptor &b);
+    static bool sizecompare(const BlockDescriptor &a, const BlockDescriptor &b);
+  };
+  // BlockDescriptor comparator function interface
+  typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+  /// Set type for cached blocks (ordered by size)
+  typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+  /// Set type for live blocks (ordered by ptr)
+  typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+ public:
+  // Total Bytes
+  struct TotalBytes {
+    size_t free = 0;
+    size_t live = 0;
+    size_t used = 0;
+  };
+  explicit SampleAllocator(std::shared_ptr<Allocator> allocator);
+  bool IsAllocThreadSafe() const override { return true; }
+  void GetMemInfo(TotalBytes *info);
+
+ protected:
+  void FreeImpl(Allocation *allocation) override;
+  Allocation *AllocateImpl(size_t size) override;
+  uint64_t ReleaseImpl(const platform::Place &place) override {
+    FreeAllCache();
+    return allocator_->Release(place);
+  }
+  void FreeAllCache(void);
+
+ private:
+  std::shared_ptr<Allocator> allocator_;
+  std::mutex mutex_;
+
+  unsigned int bin_growth_;  /// Geometric growth factor for bin-sizes
+  unsigned int min_bin_;     /// Minimum bin enumeration
+  size_t min_bin_bytes_;
+  size_t max_bin_bytes_;
+
+  TotalBytes cached_bytes_;  /// Map of device ordinal to aggregate cached bytes
+                             /// on that device
+  CachedBlocks
+      cached_blocks_;  /// Set of cached device allocations available for reuse
+  BusyBlocks live_blocks_;
+};
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 1702c7cbc84d8..b0d18e54139e3 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -204,6 +204,10 @@ def __bootstrap__():
         'call_stack_level',
         'sort_sum_gradient',
         'max_inplace_grad_add',
+        'sample_max_bin_bytes',
+        'sample_bin_growth',
+        'sample_min_bin',
+        'sample_debug_info',
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')