From c514ea5f87c64f0180126d1ef2d370a250c15bdc Mon Sep 17 00:00:00 2001
From: chenenquan
Date: Tue, 14 Sep 2021 11:22:32 +0800
Subject: [PATCH] Add API paddle.device.cuda.empty_cache to release idle GPU
 memory held by the allocator (#35427)

* Add empty_cache API to release idle GPU memory held by the allocator, test=develop

* Add empty_cache API to release idle GPU memory held by the allocator, test=develop

* Add empty_cache API to release idle GPU memory held by the allocator, test=develop

* Fix test coverage problem for empty_cache

* Delete redundant check for empty_cache

* Fix the problem of empty_cache's doc

* Delete the nvidia-smi comment in the doc of empty_cache, test=document_fix
---
 paddle/fluid/platform/gpu_info.cc             |  9 +++++++
 paddle/fluid/platform/gpu_info.h              |  3 +++
 paddle/fluid/pybind/pybind.cc                 |  1 +
 python/paddle/device/cuda/__init__.py         | 24 +++++++++++++++++
 .../tests/unittests/test_cuda_empty_cache.py  | 27 +++++++++++++++++++
 5 files changed, 64 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_empty_cache.py

diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index fda233b3a016b..76edb3910ccce 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -22,10 +22,12 @@ limitations under the License. */
 #else
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #endif
+#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/monitor.h"
+#include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -630,5 +632,12 @@ bool IsCudaMallocRecorded(int dev_id) {
   return RecordedCudaMallocHelper::Instance(dev_id)->NeedRecord();
 }
 
+void EmptyCache(void) {
+  std::vector<int> devices = GetSelectedDevices();
+  for (auto device : devices) {
+    memory::Release(CUDAPlace(device));
+  }
+}
+
 }  // namespace platform
 }  // namespace paddle

diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index b5800ef083885..ef7f93a61dbfb 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -137,6 +137,9 @@ uint64_t RecordedCudaMallocSize(int dev_id);
 
 bool IsCudaMallocRecorded(int dev_id);
 
+//! Empty idle cached memory held by the allocator.
+void EmptyCache(void);
+
 }  // namespace platform
 }  // namespace paddle
 

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b0148e50afc54..f797ed5142c3d 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -2254,6 +2254,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetCUDADeviceCount); + m.def("cuda_empty_cache", platform::EmptyCache); #if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) m.def("nvprof_init", platform::CudaProfilerInit); diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 834cda71fdc5f..be2e2488a3049 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -23,6 +23,7 @@ 'current_stream', 'synchronize', 'device_count', + 'empty_cache', ] @@ -117,3 +118,26 @@ def device_count(): core, 'get_cuda_device_count') else 0 return num_gpus + + +def empty_cache(): + """ + Releases idle cached memory held by the allocator so that those can be used in other GPU + application and visible in `nvidia-smi`. In most cases you don't need to use this function, + Paddle does not release the memory back to the OS when you remove Tensors on the GPU, + Because it keeps gpu memory in a pool so that next allocations can be done much faster. + + Examples: + .. code-block:: python + + import paddle + + # required: gpu + paddle.set_device("gpu") + tensor = paddle.randn([512, 512, 512], "float") + del tensor + paddle.device.cuda.empty_cache() + """ + + if core.is_compiled_with_cuda(): + core.cuda_empty_cache() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_empty_cache.py b/python/paddle/fluid/tests/unittests/test_cuda_empty_cache.py new file mode 100644 index 0000000000000..4aefb234bbfc1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_empty_cache.py @@ -0,0 +1,27 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest + + +class TestEmptyCache(unittest.TestCase): + def test_empty_cache(self): + x = paddle.randn((2, 10, 12)).astype('float32') + del x + self.assertIsNone(paddle.device.cuda.empty_cache()) + + +if __name__ == '__main__': + unittest.main()