From a4d9851b89d8c4f3c33d556dd93ded40c254eb3a Mon Sep 17 00:00:00 2001
From: huangjiyi <43315610+huangjiyi@users.noreply.github.com>
Date: Thu, 8 Dec 2022 14:18:52 +0800
Subject: [PATCH] [PHI decoupling] move cuda_graph from fluid to phi (#48686)

* move cuda_graph from fluid to phi

* move device_memory_aligment from fluid to phi

* Revert "move device_memory_aligment from fluid to phi"

This reverts commit b92fcd39a0a50fdac13278f49be0237a85f3a13f.

* update xpu cmake
---
 paddle/fluid/memory/allocation/CMakeLists.txt |   2 +-
 paddle/fluid/platform/CMakeLists.txt          |   2 +-
 .../platform/device/gpu/cuda/CMakeLists.txt   |   4 -
 .../platform/device/gpu/cuda/cuda_graph.h     | 224 +---------------
 .../fluid/platform/device/xpu/CMakeLists.txt  |   2 +-
 paddle/phi/backends/CMakeLists.txt            |   5 +-
 .../backends}/gpu/cuda/cuda_graph.cc          |  57 +++--
 paddle/phi/backends/gpu/cuda/cuda_graph.h     | 241 ++++++++++++++++++
 paddle/phi/backends/xpu/CMakeLists.txt        |   4 -
 paddle/phi/core/CMakeLists.txt                |   2 +-
 paddle/phi/core/device_context.cc             |   7 +-
 paddle/phi/kernels/gpudnn/conv_cudnn_v7.h     |   2 +-
 12 files changed, 291 insertions(+), 261 deletions(-)
 rename paddle/{fluid/platform/device => phi/backends}/gpu/cuda/cuda_graph.cc (90%)
 create mode 100644 paddle/phi/backends/gpu/cuda/cuda_graph.h
 delete mode 100644 paddle/phi/backends/xpu/CMakeLists.txt
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 20a922b406745..f7c57fa2b02d6 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -32,7 +32,7 @@ if(WITH_GPU OR WITH_ROCM)
 endif()
 
 if(WITH_GPU)
-  list(APPEND ALLOCATOR_DEPS cuda_graph)
+  list(APPEND ALLOCATOR_DEPS phi_backends)
 endif()
 
 if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2)
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 8c93eaf2469d1..2db144f423fc7 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -85,7 +85,7 @@ if(WITH_GPU)
   nv_library(
     cuda_graph_with_memory_pool
     SRCS cuda_graph_with_memory_pool.cc
-    DEPS device_context allocator cuda_graph)
+    DEPS device_context allocator phi_backends)
 else()
   cc_library(
     cuda_graph_with_memory_pool
diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt
index 64a2f891c21cd..07901054b3b33 100644
--- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt
+++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt
@@ -1,7 +1,3 @@
-nv_library(
-  cuda_graph
-  SRCS cuda_graph.cc
-  DEPS enforce)
 nv_library(
   cuda_profiler
   SRCS cuda_profiler.cc
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
index 5b5151ea822e8..1c0843a0eb645 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_graph.h
@@ -14,45 +14,23 @@
 
 #pragma once
 
-#include <atomic>
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <thread>
-#include <vector>
-
-#include "cuda.h"          // NOLINT
-#include "cuda_runtime.h"  // NOLINT
-#include "paddle/fluid/platform/device/gpu/gpu_types.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/macros.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/utils/optional.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
 
 namespace paddle {
 namespace platform {
 
+using CUDAKernelParams = phi::backends::gpu::CUDAKernelParams;
+#if CUDA_VERSION < 10010
+using cudaStreamCaptureMode = phi::backends::gpu::cudaStreamCaptureMode;
+#endif
+using CUDAGraph = phi::backends::gpu::CUDAGraph;
+using CUDAGraphCaptureModeGuard = phi::backends::gpu::CUDAGraphCaptureModeGuard;
+
 template <typename T>
 static bool IsBitwiseEqual(const T &x, const T &y) {
   return std::memcmp(&x, &y, sizeof(T)) == 0;
 }
 
-class CUDAKernelParams {
- public:
-  explicit CUDAKernelParams(const cudaKernelNodeParams *params)
-      : params_(params) {}
-
-  const void *func() const { return params_->func; }
-
-  template <typename T>
-  T &As(size_t idx) const {
-    return *reinterpret_cast<T *>(params_->kernelParams[idx]);
-  }
-
- private:
-  const cudaKernelNodeParams *params_;
-};
-
 template <typename F, F f>
 struct IsSameKernelHelper;
 
@@ -96,191 +74,5 @@ struct IsSameKernelHelper<Return (*)(FuncArgs...), kernel_fn> {
   }
 };
 
-#if CUDA_VERSION >= 10010
-static void ThrowErrorIfNotSupportCUDAGraph() {}
-#else
-enum cudaStreamCaptureMode {
-  cudaStreamCaptureModeGlobal = 0,
-  cudaStreamCaptureModeThreadLocal = 1,
-  cudaStreamCaptureModeRelaxed = 2
-};
-static void ThrowErrorIfNotSupportCUDAGraph() {
-  PADDLE_THROW(platform::errors::Unimplemented(
-      "CUDA Graph is only supported when CUDA version >= 10.1"));
-}
-#endif
-
-// NOTE: Currently, we do not support to capture CUDA graph in parallel
-// NOTE: Do not use this class directly because it should be used with
-//       the memory pool.
-class CUDAGraph {
-  DISABLE_COPY_AND_ASSIGN(CUDAGraph);
-
-  // Since the constructor would throw error is CUDA_VERSION < 10010.
-  // The non-static method of CUDAGraph need not check CUDA_VERSION
-  // again.
-  CUDAGraph() {
-    ThrowErrorIfNotSupportCUDAGraph();
-    id_ = UniqueID();
-  }
-
- public:
-  static constexpr int64_t kDefaultPoolID = 0;
-  static constexpr int64_t kInvalidPoolID = -1;
-
-  ~CUDAGraph() { Reset(); }
-
-  CUDAGraphID ID() const { return id_; }
-
-  static int64_t SetMemoryPoolID(int64_t pool_id) {
-    auto &pool_id_ = capturing_graph_->pool_id_;
-    PADDLE_ENFORCE_EQ(
-        pool_id_,
-        kInvalidPoolID,
-        phi::errors::InvalidArgument("Cannot reset memory pool id twice, the "
-                                     "former memory pool id is %d.",
-                                     pool_id_));
-    if (pool_id <= kInvalidPoolID) {
-      pool_id_ = UniqueMemoryPoolID();
-    } else {
-      PADDLE_ENFORCE_GE(
-          pool_id,
-          kDefaultPoolID,
-          phi::errors::InvalidArgument("Invalid memory pool id %d.", pool_id));
-      pool_id_ = pool_id;
-    }
-    return pool_id_;
-  }
-
-  int64_t PoolID() const { return pool_id_; }
-
-  static int64_t CapturingPoolID() { return capturing_graph_->pool_id_; }
-
-  void Replay();
-
-  void Reset();
-
-  void AddResetCallback(std::function<void()> callback) {
-    std::lock_guard<std::mutex> guard(mtx_);
-    callbacks_.push_back(std::move(callback));
-  }
-
-  void PrintToDotFiles(const std::string &dirname, unsigned int flags);
-
-  static void BeginCapture(platform::CUDAPlace place,
-                           cudaStream_t stream,
-                           cudaStreamCaptureMode mode);
-  static std::unique_ptr<CUDAGraph> EndCapture();
-
-  static void BeginSegmentCapture();
-  static void EndSegmentCapture();
-
-  static void AddResetCallbackDuringCapturing(std::function<void()> callback) {
-    capturing_graph_->AddResetCallback(std::move(callback));
-  }
-
-  // No need to add CUDA_VERSION macro because capturing_graph_ would
-  // always be nullptr (constructor throws error)
-  static bool IsCapturing() { return capturing_graph_ != nullptr; }
-
-  static CUDAGraphID CapturingID() { return capturing_graph_->id_; }
-
-  static platform::CUDAPlace CapturingPlace() {
-    return capturing_graph_->place_;
-  }
-
-  // This API can be used to debug which GPU operation is not
-  // supported during capturing CUDA Graph.
-  static bool IsValidCapturing();
-
-  static bool IsThreadLocalCapturing() {
-#if CUDA_VERSION >= 10010
-    return IsCapturing() &&
-           capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal;
-#else
-    return false;
-#endif
-  }
-
-  static bool IsThisThreadCapturing() {
-    if (UNLIKELY(IsCapturing())) {
-      return IsThreadLocalCapturing()
-                 ? capturing_thread_id_.get() == std::this_thread::get_id()
-                 : true;
-    } else {
-      return false;
-    }
-  }
-
-  using SetSeedFunc = std::function<bool(CUDAKernelParams *, bool)>;
-  static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) {
-    std::lock_guard<std::mutex> guard(capturing_graph_->func_mtx_);
-    capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func));
-  }
-
-  static int64_t UniqueMemoryPoolID();
-
- private:
-  static CUDAGraphID UniqueID();
-
- private:
-#if CUDA_VERSION >= 10010
-  std::vector<cudaGraph_t> graphs_;
-  std::vector<cudaGraphExec_t> exec_graphs_;
-  cudaStreamCaptureMode capture_mode_;
-#endif
-  cudaStream_t stream_{nullptr};
-  platform::CUDAPlace place_;
-  CUDAGraphID id_;
-  int64_t pool_id_{kInvalidPoolID};
-  std::vector<std::function<void()>> callbacks_;
-  bool is_reset_{false};
-  std::mutex mtx_;
-
-  std::vector<SetSeedFunc> set_seed_funcs_;
-  std::vector<std::vector<std::function<void(cudaGraphExec_t)>>> pre_hooks_;
-  std::mutex func_mtx_;
-
-  bool is_first_run_{true};
-
-  static paddle::optional<std::thread::id> capturing_thread_id_;
-  static std::unique_ptr<CUDAGraph> capturing_graph_;
-};
-
-#if CUDA_VERSION >= 10010
-class CUDAGraphCaptureModeGuard {
-  DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);
-
- public:
-  explicit CUDAGraphCaptureModeGuard(
-      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {
-    if (UNLIKELY(CUDAGraph::IsCapturing())) {
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode));
-      // After cudaThreadExchangeStreamCaptureMode is called,
-      // the variable "mode" would be set to the old capturing mode.
-      old_mode_ = mode;
-    }
-  }
-
-  ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW {
-    if (UNLIKELY(CUDAGraph::IsCapturing())) {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          cudaThreadExchangeStreamCaptureMode(&old_mode_));
-    }
-  }
-
- private:
-  cudaStreamCaptureMode old_mode_;
-};
-#else
-class CUDAGraphCaptureModeGuard {
-  DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);
-
- public:
-  explicit CUDAGraphCaptureModeGuard(
-      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {}
-};
-#endif
-
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt
index 31ac51050b87b..242f2a8e26002 100644
--- a/paddle/fluid/platform/device/xpu/CMakeLists.txt
+++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt
@@ -30,7 +30,7 @@ cc_library(
        xpulib
        device_context
        op_kernel_type
-       phi_xpu_op_list)
+       phi_backends)
 cc_library(
   xpu_resource_pool
   SRCS xpu_resource_pool.cc
diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt
index a4c76ab0e68a0..f8a6b2174a830 100644
--- a/paddle/phi/backends/CMakeLists.txt
+++ b/paddle/phi/backends/CMakeLists.txt
@@ -7,7 +7,7 @@ if(WITH_GPU OR WITH_ROCM)
   list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
        gpu/gpu_resources.cc)
   if(WITH_GPU)
-    list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc)
+    list(APPEND BACKENDS_SRCS gpu/cuda/cuda_info.cc gpu/cuda/cuda_graph.cc)
   endif()
   if(WITH_ROCM)
     list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc)
@@ -16,8 +16,9 @@ if(WITH_GPU OR WITH_ROCM)
 endif()
 
 if(WITH_XPU)
-  add_subdirectory(xpu)
   list(APPEND BACKENDS_SRCS xpu/xpu_context.cc xpu/xpu_info.cc)
+  list(APPEND BACKENDS_SRCS xpu/xpu_op_list.cc xpu/xpu1_op_list.cc
+       xpu/xpu2_op_list.cc)
 endif()
 
 if(WITH_MKLDNN)
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc b/paddle/phi/backends/gpu/cuda/cuda_graph.cc
similarity index 90%
rename from paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc
rename to paddle/phi/backends/gpu/cuda/cuda_graph.cc
index 61c8fe4f4c5fd..5fc39a5319945 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_graph.cc
+++ b/paddle/phi/backends/gpu/cuda/cuda_graph.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,14 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
 
 #include <queue>
 #include <unordered_map>
 #include <unordered_set>
 
-namespace paddle {
-namespace platform {
+namespace phi {
+namespace backends {
+namespace gpu {
 
 std::unique_ptr<CUDAGraph> CUDAGraph::capturing_graph_{nullptr};
 paddle::optional<std::thread::id> CUDAGraph::capturing_thread_id_{paddle::none};
@@ -113,7 +114,7 @@ void CUDAGraph::Replay() {
 #if CUDA_VERSION >= 10010
   PADDLE_ENFORCE_EQ(is_reset_,
                     false,
-                    errors::PermissionDenied(
+                    phi::errors::PermissionDenied(
                         "Cannot replay the CUDA Graph after reset is called."));
   size_t n = exec_graphs_.size();
   for (size_t i = 0; i < n; ++i) {
@@ -131,43 +132,43 @@ void CUDAGraph::Replay() {
 void CUDAGraph::BeginSegmentCapture() {
   ThrowErrorIfNotSupportCUDAGraph();
 #if CUDA_VERSION >= 10010
-  PADDLE_ENFORCE_EQ(
-      IsCapturing(),
-      true,
-      errors::PermissionDenied("BeginSegmentCapture should be called when CUDA "
-                               "Graph is capturing."));
+  PADDLE_ENFORCE_EQ(IsCapturing(),
+                    true,
+                    phi::errors::PermissionDenied(
+                        "BeginSegmentCapture should be called when CUDA "
+                        "Graph is capturing."));
   if (IsThreadLocalCapturing()) {
     PADDLE_ENFORCE_EQ(IsThisThreadCapturing(),
                       true,
-                      platform::errors::PermissionDenied(
+                      phi::errors::PermissionDenied(
                           "When capturing CUDA Graph in the thread local mode, "
                           "you cannot begin segmented capturing in the thread "
                           "which is not the one that starts the capturing."));
   }
   PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamBeginCapture(
       capturing_graph_->stream_, capturing_graph_->capture_mode_));
-  PADDLE_ENFORCE_EQ(IsValidCapturing(),
-                    true,
-                    platform::errors::PermissionDenied(
-                        "CUDA Graph should not be invalidated."));
+  PADDLE_ENFORCE_EQ(
+      IsValidCapturing(),
+      true,
+      phi::errors::PermissionDenied("CUDA Graph should not be invalidated."));
   VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_
            << ", segment id " << capturing_graph_->graphs_.size()
            << ", memory pool id " << capturing_graph_->pool_id_;
 #endif
 }
 
-void CUDAGraph::BeginCapture(platform::CUDAPlace place,
+void CUDAGraph::BeginCapture(phi::GPUPlace place,
                              cudaStream_t stream,
                              cudaStreamCaptureMode mode) {
   ThrowErrorIfNotSupportCUDAGraph();
 #if CUDA_VERSION >= 10010
-  PADDLE_ENFORCE_EQ(
-      IsCapturing(),
-      false,
-      errors::PermissionDenied("CUDA Graph can only captured one by one."));
+  PADDLE_ENFORCE_EQ(IsCapturing(),
+                    false,
+                    phi::errors::PermissionDenied(
+                        "CUDA Graph can only captured one by one."));
   PADDLE_ENFORCE_NOT_NULL(
       stream,
-      errors::PermissionDenied(
+      phi::errors::PermissionDenied(
           "CUDA Graph cannot be captured in default CUDA stream 0."));
   capturing_graph_.reset(new CUDAGraph());
   capturing_graph_->place_ = place;
@@ -185,9 +186,10 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place,
 void CUDAGraph::EndSegmentCapture() {
   ThrowErrorIfNotSupportCUDAGraph();
 #if CUDA_VERSION >= 10010
-  PADDLE_ENFORCE_EQ(IsCapturing(),
-                    true,
-                    errors::PermissionDenied("No CUDA Graph is capturing."));
+  PADDLE_ENFORCE_EQ(
+      IsCapturing(),
+      true,
+      phi::errors::PermissionDenied("No CUDA Graph is capturing."));
   cudaGraph_t graph;
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaStreamEndCapture(capturing_graph_->stream_, &graph));
@@ -299,11 +301,12 @@ void CUDAGraph::PrintToDotFiles(const std::string &dirname,
         cudaGraphDebugDotPrint(graphs_[i], filename.c_str(), flags));
   }
 #else
-  PADDLE_THROW(platform::errors::Unimplemented(
+  PADDLE_THROW(phi::errors::Unimplemented(
       "The print_to_dot_files() method is only supported when CUDA version >= "
       "11.3."));
 #endif
 }
 
-}  // namespace platform
-}  // namespace paddle
+}  // namespace gpu
+}  // namespace backends
+}  // namespace phi
diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph.h b/paddle/phi/backends/gpu/cuda/cuda_graph.h
new file mode 100644
index 0000000000000..f2004eb6c7da0
--- /dev/null
+++ b/paddle/phi/backends/gpu/cuda/cuda_graph.h
@@ -0,0 +1,241 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#include "cuda.h"          // NOLINT
+#include "cuda_runtime.h"  // NOLINT
+
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+#include "paddle/phi/core/macros.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+namespace backends {
+namespace gpu {
+
+class CUDAKernelParams {
+ public:
+  explicit CUDAKernelParams(const cudaKernelNodeParams *params)
+      : params_(params) {}
+
+  const void *func() const { return params_->func; }
+
+  template <typename T>
+  T &As(size_t idx) const {
+    return *reinterpret_cast<T *>(params_->kernelParams[idx]);
+  }
+
+ private:
+  const cudaKernelNodeParams *params_;
+};
+
+#if CUDA_VERSION >= 10010
+static void ThrowErrorIfNotSupportCUDAGraph() {}
+#else
+enum cudaStreamCaptureMode {
+  cudaStreamCaptureModeGlobal = 0,
+  cudaStreamCaptureModeThreadLocal = 1,
+  cudaStreamCaptureModeRelaxed = 2
+};
+static void ThrowErrorIfNotSupportCUDAGraph() {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "CUDA Graph is only supported when CUDA version >= 10.1"));
+}
+#endif
+
+using CUDAGraphID = unsigned long long;  // NOLINT
+
+// NOTE: Currently, we do not support to capture CUDA graph in parallel
+// NOTE: Do not use this class directly because it should be used with
+//       the memory pool.
+class CUDAGraph {
+  DISABLE_COPY_AND_ASSIGN(CUDAGraph);
+
+  // Since the constructor would throw error is CUDA_VERSION < 10010.
+  // The non-static method of CUDAGraph need not check CUDA_VERSION
+  // again.
+  CUDAGraph() {
+    ThrowErrorIfNotSupportCUDAGraph();
+    id_ = UniqueID();
+  }
+
+ public:
+  static constexpr int64_t kDefaultPoolID = 0;
+  static constexpr int64_t kInvalidPoolID = -1;
+
+  ~CUDAGraph() { Reset(); }
+
+  CUDAGraphID ID() const { return id_; }
+
+  static int64_t SetMemoryPoolID(int64_t pool_id) {
+    auto &pool_id_ = capturing_graph_->pool_id_;
+    PADDLE_ENFORCE_EQ(
+        pool_id_,
+        kInvalidPoolID,
+        phi::errors::InvalidArgument("Cannot reset memory pool id twice, the "
+                                     "former memory pool id is %d.",
+                                     pool_id_));
+    if (pool_id <= kInvalidPoolID) {
+      pool_id_ = UniqueMemoryPoolID();
+    } else {
+      PADDLE_ENFORCE_GE(
+          pool_id,
+          kDefaultPoolID,
+          phi::errors::InvalidArgument("Invalid memory pool id %d.", pool_id));
+      pool_id_ = pool_id;
+    }
+    return pool_id_;
+  }
+
+  int64_t PoolID() const { return pool_id_; }
+
+  static int64_t CapturingPoolID() { return capturing_graph_->pool_id_; }
+
+  void Replay();
+
+  void Reset();
+
+  void AddResetCallback(std::function<void()> callback) {
+    std::lock_guard<std::mutex> guard(mtx_);
+    callbacks_.push_back(std::move(callback));
+  }
+
+  void PrintToDotFiles(const std::string &dirname, unsigned int flags);
+
+  static void BeginCapture(phi::GPUPlace place,
+                           cudaStream_t stream,
+                           cudaStreamCaptureMode mode);
+  static std::unique_ptr<CUDAGraph> EndCapture();
+
+  static void BeginSegmentCapture();
+  static void EndSegmentCapture();
+
+  static void AddResetCallbackDuringCapturing(std::function<void()> callback) {
+    capturing_graph_->AddResetCallback(std::move(callback));
+  }
+
+  // No need to add CUDA_VERSION macro because capturing_graph_ would
+  // always be nullptr (constructor throws error)
+  static bool IsCapturing() { return capturing_graph_ != nullptr; }
+
+  static CUDAGraphID CapturingID() { return capturing_graph_->id_; }
+
+  static phi::GPUPlace CapturingPlace() { return capturing_graph_->place_; }
+
+  // This API can be used to debug which GPU operation is not
+  // supported during capturing CUDA Graph.
+  static bool IsValidCapturing();
+
+  static bool IsThreadLocalCapturing() {
+#if CUDA_VERSION >= 10010
+    return IsCapturing() &&
+           capturing_graph_->capture_mode_ == cudaStreamCaptureModeThreadLocal;
+#else
+    return false;
+#endif
+  }
+
+  static bool IsThisThreadCapturing() {
+    if (UNLIKELY(IsCapturing())) {
+      return IsThreadLocalCapturing()
+                 ? capturing_thread_id_.get() == std::this_thread::get_id()
+                 : true;
+    } else {
+      return false;
+    }
+  }
+
+  using SetSeedFunc = std::function<bool(CUDAKernelParams *, bool)>;
+  static void RecordRandomKernelInfo(SetSeedFunc set_seed_func) {
+    std::lock_guard<std::mutex> guard(capturing_graph_->func_mtx_);
+    capturing_graph_->set_seed_funcs_.emplace_back(std::move(set_seed_func));
+  }
+
+  static int64_t UniqueMemoryPoolID();
+
+ private:
+  static CUDAGraphID UniqueID();
+
+ private:
+#if CUDA_VERSION >= 10010
+  std::vector<cudaGraph_t> graphs_;
+  std::vector<cudaGraphExec_t> exec_graphs_;
+  cudaStreamCaptureMode capture_mode_;
+#endif
+  cudaStream_t stream_{nullptr};
+  phi::GPUPlace place_;
+  CUDAGraphID id_;
+  int64_t pool_id_{kInvalidPoolID};
+  std::vector<std::function<void()>> callbacks_;
+  bool is_reset_{false};
+  std::mutex mtx_;
+
+  std::vector<SetSeedFunc> set_seed_funcs_;
+  std::vector<std::vector<std::function<void(cudaGraphExec_t)>>> pre_hooks_;
+  std::mutex func_mtx_;
+
+  bool is_first_run_{true};
+
+  static paddle::optional<std::thread::id> capturing_thread_id_;
+  static std::unique_ptr<CUDAGraph> capturing_graph_;
+};
+
+#if CUDA_VERSION >= 10010
+class CUDAGraphCaptureModeGuard {
+  DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);
+
+ public:
+  explicit CUDAGraphCaptureModeGuard(
+      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {
+    if (UNLIKELY(CUDAGraph::IsCapturing())) {
+      PADDLE_ENFORCE_GPU_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode));
+      // After cudaThreadExchangeStreamCaptureMode is called,
+      // the variable "mode" would be set to the old capturing mode.
+      old_mode_ = mode;
+    }
+  }
+
+  ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW {
+    if (UNLIKELY(CUDAGraph::IsCapturing())) {
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          cudaThreadExchangeStreamCaptureMode(&old_mode_));
+    }
+  }
+
+ private:
+  cudaStreamCaptureMode old_mode_;
+};
+#else
+class CUDAGraphCaptureModeGuard {
+  DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard);
+
+ public:
+  explicit CUDAGraphCaptureModeGuard(
+      cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {}
+};
+#endif
+
+}  // namespace gpu
+}  // namespace backends
+}  // namespace phi
diff --git a/paddle/phi/backends/xpu/CMakeLists.txt b/paddle/phi/backends/xpu/CMakeLists.txt
deleted file mode 100644
index d84e6a63e058a..0000000000000
--- a/paddle/phi/backends/xpu/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-cc_library(
-  phi_xpu_op_list
-  SRCS xpu_op_list.cc xpu1_op_list.cc xpu2_op_list.cc
-  DEPS glog)
diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt
index 8911da82b5480..6dc43ff633f19 100644
--- a/paddle/phi/core/CMakeLists.txt
+++ b/paddle/phi/core/CMakeLists.txt
@@ -19,7 +19,7 @@ if(WITH_XPU)
   cc_library(
     kernel_factory
     SRCS kernel_factory.cc
-    DEPS phi_enforce convert_utils phi_xpu_op_list)
+    DEPS phi_enforce convert_utils phi_backends)
 else()
   cc_library(
     kernel_factory
diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc
index a18e695cce4d8..60747e36185a5 100644
--- a/paddle/phi/core/device_context.cc
+++ b/paddle/phi/core/device_context.cc
@@ -15,7 +15,7 @@
 #include "paddle/phi/core/device_context.h"
 
 #ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
+#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
 #endif
 
 #include "paddle/phi/core/dense_tensor.h"
@@ -153,8 +153,9 @@ struct DeviceContext::Impl {
                           : (pinned ? pinned_allocator_ : device_allocator_);
 #ifdef PADDLE_WITH_CUDA
     bool must_cuda_graph_allocator = (tensor->numel() != 0) && !pinned;
-    if (must_cuda_graph_allocator && paddle::platform::is_gpu_place(place) &&
-        paddle::platform::CUDAGraph::IsThisThreadCapturing()) {
+    if (must_cuda_graph_allocator &&
+        place.GetType() == phi::AllocationType::GPU &&
+        phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()) {
       PADDLE_ENFORCE_NOT_NULL(cuda_graph_allocator_,
                               phi::errors::InvalidArgument(
                                   "Required cuda_graph_allocator_ shall not be "
diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h
index ac4a60384af19..cc32759b5f044 100644
--- a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h
+++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h
@@ -465,7 +465,7 @@ struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
 
   static size_t GetWorkspaceSize(const ConvArgs& args,
                                  cudnnConvolutionBwdFilterAlgo_t algo) {
-    paddle::platform::CUDAGraphCaptureModeGuard guard;
+    phi::backends::gpu::CUDAGraphCaptureModeGuard guard;
     size_t workspace_size = 0;
     PADDLE_ENFORCE_GPU_SUCCESS(
         phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(