diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index c8f3dc0d673f1..ff0c28c00716d 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -882,21 +882,22 @@ target_link_libraries(
   conditional_block_op_helper
   pylayer_op_helper)
 
-cc_library(
-  parallel_executor
-  SRCS parallel_executor.cc
-  DEPS ssa_graph_executor graph build_strategy collective_helper
-       variable_helper)
-
 cc_library(
   compiled_program
   SRCS compiled_program.cc
-  DEPS graph build_strategy)
+  DEPS graph
+       build_strategy
+       reference_count_pass
+       eager_deletion_pass
+       buffer_shared_inplace_op_pass
+       buffer_shared_cross_op_memory_reuse_pass
+       inplace_addto_op_pass
+       set_reader_device_info_utils)
 
 cc_library(
   executor_cache
   SRCS executor_cache.cc
-  DEPS parallel_executor pir_transforms pir)
+  DEPS pir_transforms pir)
 cc_library(
   prune
   SRCS prune.cc
@@ -962,7 +963,8 @@ cc_library(
     phi
     common
     imperative_flag
-    layer)
+    layer
+    op_dialect_vjp)
 
 cc_library(
   type_info
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 20c1444f238eb..3a22cb8131c90 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -7,12 +7,6 @@ if(WITH_PSCORE)
   endif()
   set_source_files_properties(
     reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(
-    threaded_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS
-                                              ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(
-    async_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS
-                                           ${DISTRIBUTE_COMPILE_FLAGS})
 endif()
 
 set(op_handle_srcs
@@ -81,36 +75,6 @@ endif()
 
 add_dependencies(detail_op_handle framework_proto auto_parallel_proto xxhash)
 
-set(ssa_graph_executor_srcs
-    ssa_graph_executor.cc
-    threaded_ssa_graph_executor.cc
-    parallel_ssa_graph_executor.cc
-    async_ssa_graph_executor.cc
-    bind_threaded_ssa_graph_executor.cc
-    fast_threaded_ssa_graph_executor.cc
-    scope_buffered_ssa_graph_executor.cc
-    scope_buffered_monitor.cc)
-set(SSA_GRAPH_EXECUTOR_DEPS
-    graph
-    framework_proto
-    detail_op_handle
-    reference_count_pass
-    eager_deletion_pass
-    buffer_shared_inplace_op_pass
-    buffer_shared_cross_op_memory_reuse_pass
-    inplace_addto_op_pass
-    set_reader_device_info_utils
-    scope
-    simple_threadpool
-    device_context
-    profiler
-    selected_rows_utils)
-
-cc_library(
-  ssa_graph_executor
-  SRCS ${ssa_graph_executor_srcs}
-  DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
-
 set(IR_PASS_DEPS
     graph_viz_pass
     multi_devices_graph_pass
diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
deleted file mode 100644
index a93e59b27aebb..0000000000000
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" - -#include "paddle/fluid/framework/variable_helper.h" - -#if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" -#endif - -namespace paddle { -namespace framework { -namespace details { - -inline void InitVarsInScope(const std::vector &var_infos, - Scope *scope, - Scope *local_scope) { - VLOG(3) << "InitVarsInScope"; - for (auto &info : var_infos) { - if (info.persistable_) { // Persistable - auto *var = scope->FindVar(info.name_); - if (var != nullptr) { - VLOG(2) << info.name_ - << " has been initialized beforehand in global scope, skipped"; - continue; - } - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - InitializeVariable(local_scope->Var(info.name_), info.type_); - } - } -} - -// get CommContext and remote send and recv op -void ProcessGraph(std::vector graphs, Scope *scope) { return; } - -AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - std::vector graphs) - : strategy_(strategy), - local_scopes_(local_scopes), - local_exec_scopes_(local_exec_scopes), - pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), - places_(places), - graphs_(std::move(graphs)), - executors_(), - run_futures_(), - var_infos_() { - VLOG(3) << "build AsyncSSAGraphExecutor"; - PADDLE_ENFORCE_EQ(places_.size(), - local_scopes_.size(), - platform::errors::InvalidArgument( - "The number of places and the number of local scopes " - "should be equal, but got number of places is %d and " - "number of local scopes is %d.", - places_.size(), - local_scopes_.size())); - PADDLE_ENFORCE_EQ( - local_scopes_.size(), - local_exec_scopes_.size(), - platform::errors::InvalidArgument( - "The number of local scopes and the number of local execution scopes " - "should be equal, but got number of local scopes is %d and " - "number of local execution scopes is %d.", - local_scopes_.size(), - local_exec_scopes_.size())); - - // set the correct size of thread pool to each device. - strategy_.num_threads_ = strategy_.num_threads_ < places_.size() - ? 1UL - : strategy_.num_threads_ / places_.size(); - VLOG(1) << "set num_threads: " << strategy_.num_threads_ - << " to run the operators of the graph on each device."; - for (size_t i = 0; i < places.size(); ++i) { - executors_.emplace_back( - new details::ThreadedSSAGraphExecutor(strategy_, - {local_scopes_[i]}, - {local_exec_scopes_[i]}, - {places_[i]}, - graphs_[i])); - } - - for (auto &node : graphs_[0]->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos_.emplace_back(); - var_infos_.back().name_ = node->Var()->Name(); - var_infos_.back().type_ = node->Var()->GetType(); - var_infos_.back().persistable_ = node->Var()->Persistable(); - } - } - - for (size_t i = local_scopes_.size(); i >= 1; --i) { - InitVarsInScope( - var_infos_, local_scopes_[i - 1], local_exec_scopes_[i - 1]); - } - ProcessGraph(graphs_, local_scopes_[0]); -} - -void AsyncSSAGraphExecutor::StartOffPythonTrainLoop(bool return_merged) { - VLOG(3) << "StartOffPythonTrainLoop size = " << places_.size(); - for (size_t i = 1; i < places_.size(); ++i) { - auto call = [this, i, return_merged]() -> void { - VLOG(3) << "start off python thread " << i; - try { - while (true) { - executors_[i]->Run({}, return_merged); - } - } catch (...) 
{ - exception_holder_.Catch(std::current_exception()); - VLOG(3) << "get exception type = " << exception_holder_.Type(); - } - VLOG(3) << "thread " << i << " exited!"; - }; - run_futures_.emplace_back(pool_->enqueue(std::move(call))); - } -} - -void AsyncSSAGraphExecutor::HandleException() { - if (exception_holder_.IsCaught()) { - for (auto &f : run_futures_) { - VLOG(3) << "wait future"; - f.wait(); - } - VLOG(3) << "caught exception " << exception_holder_.Type() - << ", rethrow it"; - run_futures_.clear(); - exception_holder_.ReThrow(); - } -} - -FetchResultType AsyncSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - PADDLE_ENFORCE_EQ(return_merged, - true, - platform::errors::InvalidArgument( - "AsyncSSAGraphExecutor does not support unmerged " - "results to be fetched!")); - // init once - if (run_futures_.empty() && places_.size() > 1) { -#if defined PADDLE_WITH_PSCORE - if (strategy_.thread_barrier_) { - paddle::distributed::Communicator::GetInstance()->BarrierTriggerReset( - places_.size()); - } -#endif - exception_holder_.Clear(); - StartOffPythonTrainLoop(return_merged); - } - - if (places_.size() == 1) { - exception_holder_.Clear(); - } - - FetchResultType fetch_data; - - try { - fetch_data = executors_[0]->Run(fetch_tensors, return_merged); - } catch (...) { - exception_holder_.Catch(std::current_exception()); - } - - HandleException(); - - FetchList ret; - auto &val = PADDLE_GET(FetchList, fetch_data); - for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { - if (data_is_lod_tensor(val.at(fetch_idx))) { - std::vector lodtensor_ptrs; - lodtensor_ptrs.push_back( - &(PADDLE_GET(phi::DenseTensor, val.at(fetch_idx)))); - phi::DenseTensor var; - MergeLoDTensor(&var, lodtensor_ptrs, platform::CPUPlace()); - ret.emplace_back(var); - } else { - auto array = PADDLE_GET(LoDTensorArray, val.at(fetch_idx)); - LoDTensorArray item_array; - item_array.reserve(array.size()); - for (auto &item : array) { - std::vector lodtensor_ptrs; - lodtensor_ptrs.push_back(&item); - item_array.emplace_back(); - MergeLoDTensor( - &(item_array.back()), lodtensor_ptrs, platform::CPUPlace()); - } - ret.emplace_back(item_array); - } - } - return ret; -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h deleted file mode 100644 index bca1f0b460ff4..0000000000000 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
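Note: the AsyncSSAGraphExecutor deleted above funnels exceptions raised on its per-device worker threads into an ExceptionHolder and rethrows them on the calling thread in HandleException(). A minimal standalone sketch of that pattern, using plain std::exception_ptr and std::async instead of Paddle's ExceptionHolder and ThreadPool (all names below are illustrative only):

```cpp
// Capture the first exception thrown by any worker and rethrow it on the
// calling thread, mirroring the catch-into-holder / HandleException() flow.
#include <exception>
#include <future>
#include <mutex>
#include <stdexcept>
#include <vector>

class ExceptionCollector {
 public:
  void Catch(std::exception_ptr e) {
    std::lock_guard<std::mutex> lock(mu_);
    if (!first_) first_ = e;  // keep only the first exception
  }
  void RethrowIfCaught() {
    std::lock_guard<std::mutex> lock(mu_);
    if (first_) std::rethrow_exception(first_);
  }

 private:
  std::mutex mu_;
  std::exception_ptr first_;
};

int main() {
  ExceptionCollector holder;
  std::vector<std::future<void>> futures;
  for (int i = 0; i < 4; ++i) {
    futures.emplace_back(std::async(std::launch::async, [&holder, i] {
      try {
        if (i == 2) throw std::runtime_error("worker failed");
      } catch (...) {
        holder.Catch(std::current_exception());
      }
    }));
  }
  for (auto &f : futures) f.wait();  // join all workers first
  holder.RethrowIfCaught();          // then surface the failure to the caller
}
```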
- -#pragma once - -#include -#include -#include -#include - -#include "ThreadPool.h" -#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" - -namespace paddle { -namespace framework { -namespace details { - -struct VarInfo { - std::string name_; - proto::VarType::Type type_; - bool persistable_; -}; - -class AsyncSSAGraphExecutor final : public SSAGraphExecutor { - public: - AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - std::vector graphs); - ~AsyncSSAGraphExecutor() final = default; - const ir::Graph &Graph() const override { return *graphs_[0]; } - - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged) override; - - private: - void StartOffPythonTrainLoop(bool return_merged); - void HandleException(); - - private: - ExecutionStrategy strategy_; - std::vector local_scopes_; - std::vector local_exec_scopes_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; - std::vector places_; - std::vector graphs_; - - std::vector> executors_; - ExceptionHolder exception_holder_; - std::vector> run_futures_; - std::vector var_infos_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc deleted file mode 100644 index f36f29a5b0217..0000000000000 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc +++ /dev/null @@ -1,350 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
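The bind-threaded executor deleted below pins every XPU device to its own single-thread pool (pool_ together with place_to_index_), so operators targeting one device run strictly in submission order while different devices proceed in parallel. A hedged, self-contained sketch of that per-device serialization; SerialWorker is a made-up type, not Paddle API:

```cpp
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class SerialWorker {
 public:
  SerialWorker() : worker_([this] { Loop(); }) {}
  ~SerialWorker() {
    {
      std::lock_guard<std::mutex> lock(mu_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();  // drain remaining tasks, then stop
  }
  void Enqueue(std::function<void()> task) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      tasks_.push(std::move(task));
    }
    cv_.notify_one();
  }

 private:
  void Loop() {
    for (;;) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mu_);
        cv_.wait(lock, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) return;
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();  // tasks bound to this device run one at a time, in order
    }
  }

  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> tasks_;
  bool stop_ = false;
  std::thread worker_;  // declared last: started after, stopped before, the rest
};

int main() {
  std::vector<SerialWorker> per_device(2);      // e.g. one worker per device
  per_device[0].Enqueue([] { /* run an op on device 0 */ });
  per_device[1].Enqueue([] { /* run an op on device 1 */ });
}
```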
-#include "paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h" - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/fetch_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -#if defined(PADDLE_WITH_XPU) -namespace paddle { -namespace framework { -namespace details { - -BindThreadedSSAGraphExecutor::BindThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph) - : strategy_(strategy), - local_scopes_(local_scopes), - local_exec_scopes_(local_exec_scopes), - places_(places), - graph_(graph), - prepare_pool_(1), - multi_device_op_pool_(1) { - for (uint32_t i = 0; i < places.size(); i++) { - pool_.emplace_back(std::unique_ptr<::ThreadPool>(new ::ThreadPool(1))); - } - int index = 0; - for (uint32_t i = 0; i < places.size(); i++) { - int id = places_[i].device; - if (place_to_index_.find(id) == place_to_index_.end()) { - place_to_index_[id] = index; - index++; - } - } - for (auto &op : ir::FilterByNodeWrapper(*graph_)) { - int dep = static_cast(op->NotReadyInputSize()); - op_deps_.emplace(op, dep); - if (dep == 0) { - bootstrap_ops_.emplace_back(op); - } - } - PADDLE_ENFORCE_GT(op_deps_.size(), - 0, - platform::errors::PreconditionNotMet( - "The graph doesn't have operators.")); - PrepareAtomicOpDeps(); -} - -static std::vector get_children(OpHandleBase *op) { - auto &outputs = op->Outputs(); - std::vector ret; - for (auto &output : outputs) { - ret.insert( - ret.end(), output->PendingOps().begin(), output->PendingOps().end()); - } - return ret; -} - -static std::vector get_parents(OpHandleBase *op) { - auto &inputs = op->Inputs(); - std::vector ret; - for (auto &input : inputs) { - if (input->GeneratedOp() != nullptr) { - ret.push_back(input->GeneratedOp()); - } - } - return ret; -} - -FetchResultType BindThreadedSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - VLOG(3) << "enter BindThreadedSSAGraphExecutor Run"; - return RunMainStream(fetch_tensors, return_merged); -} - -// use 2 streams to run op. 
The first stream is main stream and will run -// most op exclude op depending on multi device(e.g., all_reduce, fetch op) -FetchResultType BindThreadedSSAGraphExecutor::RunMainStream( - const std::vector &fetch_tensors, bool return_merged) { - VLOG(3) << "enter MainStream Run"; - std::unique_ptr> - op_deps = atomic_op_deps_.get(); - PrepareAtomicOpDeps(); - - error_state = 0; - paddle::framework::FetchResultType fetches; - if (return_merged) { - fetches = FetchList(fetch_tensors.size()); - } else { - fetches = FetchUnmergedList(fetch_tensors.size()); - } - std::unordered_map> fetched_vars; - std::vector fetch_ops; - std::vector ready_fetch_ops; - auto ready_ops = std::make_shared>(); - exception_.Clear(); - - InsertFetchOps(fetch_tensors, - &fetches, - &fetched_vars, - op_deps.get(), - &fetch_ops, - &ready_fetch_ops, - return_merged); - for (auto cur_op : bootstrap_ops_) { - ready_ops->Push(cur_op); - } - for (auto cur_op : ready_fetch_ops) { - ready_ops->Push(cur_op); - } - - { - std::lock_guard lock(mutex_); - exec_op_count_ = 0; - } - - platform::XPUPlace cur_place; - std::size_t cur_count = 0; - - while (cur_count < op_deps->size()) { - cur_count++; - auto cur_op = ready_ops->Pop(); - // when exception, get cur_op == nullptr - if (cur_op == nullptr) { - std::lock_guard lock(mutex_); - exec_op_count_ = op_deps->size(); - break; - } - auto dev_ctxes_ = cur_op->DeviceContext(); - if (cur_op->IsMultiDeviceTransfer()) { - RunMultiDeviceOpAsync(cur_op, op_deps.get(), ready_ops); - continue; - } else { - cur_place = dev_ctxes_.begin()->first; - int cur_index = place_to_index_[cur_place.device]; - RunOpAsyncMainStream(cur_op, op_deps.get(), ready_ops, cur_index); - } - } - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return exec_op_count_ >= op_deps->size(); }); - } - - if (exception_.IsCaught()) { - ExecutionFinal(&fetch_ops); - } - - // Wait FetchOps. - ClearFetchOp(graph_, &fetch_ops); - return fetches; -} - -void BindThreadedSSAGraphExecutor::InsertFetchOps( - const std::vector &fetch_tensors, - FetchResultType *fetches, - std::unordered_map> *fetched_vars, - std::unordered_map *op_deps, - std::vector *fetch_ops, - std::vector *ready_fetch_ops, - bool return_merged) { - std::unordered_set fetch_tensor_set(fetch_tensors.begin(), - fetch_tensors.end()); - for (auto &fetch_var_name : fetch_tensor_set) { - for (auto &var_map : graph_->Get(kGraphVars)) { - auto it = var_map.find(fetch_var_name); - if (it != var_map.end()) { - (*fetched_vars)[fetch_var_name].push_back(*it->second.rbegin()); - } - } - } - - for (size_t i = 0; i < fetch_tensors.size(); ++i) { - auto &var_name = fetch_tensors.at(i); - auto fetched_var_it = fetched_vars->find(var_name); - PADDLE_ENFORCE_NE( - fetched_var_it, - fetched_vars->end(), - platform::errors::PreconditionNotMet( - "Cannot find fetched variable(%s) in current computation graph. " - "Possible reasons are:\n" - " 1. The variable to be fetched is not defined in main program.\n" - " 2. The variable to be fetched is not an input or output of any " - "operator.\n" - " 3. Confirm that you have used the fetch `Variable` format " - "instead of the string literal('%s') in `fetch_list` parameter " - "when using `executor.run` method. 
In other words, the format of " - "`executor.run(fetch_list=[fetch_var])`(fetch_var is a Variable) " - "is recommended.", - var_name, - var_name)); - - auto &vars = fetched_var_it->second; - - ir::Node *fetch_node = - graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); - auto *op = new FetchOpHandle(fetch_node, - fetches, - i, - &local_scopes_, - &local_exec_scopes_, - return_merged); - fetch_ops->emplace_back(op); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - for (auto &p : places_) { - op->SetDeviceContext(p, pool.Get(p)); - } - - for (auto *var : vars) { - op->AddInput(var); - } - - int dep = static_cast(op->NotReadyInputSize()); - (*op_deps)[op].dep_num = dep; - (*op_deps)[op].op = op; - if (dep == 0) { - ready_fetch_ops->emplace_back(op); - } - } -} -// RunMultiDeviceOpAsync function is used for Communicated OPs -// like all_reduce\broadcast among multicards. -void BindThreadedSSAGraphExecutor::RunMultiDeviceOpAsync( - OpHandleBase *op, - std::unordered_map *op_deps, - std::shared_ptr> ready_ops) { - multi_device_op_pool_.enqueue([=] { - try { - if (error_state == 0 && LIKELY(!strategy_.dry_run_)) { - auto dev_ctxes = op->DeviceContext(); - auto &inputs = op->Inputs(); - for (auto &input : inputs) { - if (input && input->GeneratedOp() != nullptr) { - auto dev_ctxes = input->GeneratedOp()->DeviceContext(); - for (auto &item : dev_ctxes) { - ((platform::XPUDeviceContext *)(item.second))->Wait(); - } - } else { - VLOG(3) << "No generated op:" << op->Name(); - } - } - op->Run(strategy_.use_device_); - auto &outputs = op->Outputs(); - for (auto &output : outputs) { - for (auto &pending_op : output->PendingOps()) { - std::atomic &deps = op_deps->at(pending_op).dep_num; - if (deps.fetch_sub(1) == 1) { - ready_ops->Push(pending_op); - } - } - } - } else if (error_state) { - ready_ops->Push(nullptr); - } - } catch (...) { - error_state = 1; - exception_.Catch(std::current_exception()); - ready_ops->Push(nullptr); - } - { - std::lock_guard lock(mutex_); - exec_op_count_++; - cv_.notify_all(); - } - }); -} -// RunOpAsyncMainStream function is used for computed OPs -void BindThreadedSSAGraphExecutor::RunOpAsyncMainStream( - OpHandleBase *op, - std::unordered_map *op_deps, - std::shared_ptr> ready_ops, - int index) { - pool_[index]->enqueue([=] { - try { - if (error_state == 0 && LIKELY(!strategy_.dry_run_)) { - op->Run(strategy_.use_device_); - auto &outputs = op->Outputs(); - for (auto &output : outputs) { - for (auto &pending_op : output->PendingOps()) { - std::atomic &deps = op_deps->at(pending_op).dep_num; - if (deps.fetch_sub(1) == 1) { - ready_ops->Push(pending_op); - } - } - } - } else if (error_state) { - ready_ops->Push(nullptr); - } - } catch (...) 
{ - error_state = 1; - exception_.Catch(std::current_exception()); - ready_ops->Push(nullptr); - } - { - std::lock_guard lock(mutex_); - exec_op_count_++; - cv_.notify_all(); - } - }); -} - -void BindThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { - atomic_op_deps_ = prepare_pool_.enqueue([&] { - auto *op_deps = new std::unordered_map; - for (auto &pair : op_deps_) { - (*op_deps)[pair.first].dep_num = pair.second; - (*op_deps)[pair.first].op = pair.first; - } - return std::unique_ptr< - std::unordered_map>(op_deps); - }); -} - -const ir::Graph &BindThreadedSSAGraphExecutor::Graph() const { return *graph_; } - -void BindThreadedSSAGraphExecutor::ExecutionFinal( - std::vector *fetch_ops) { - VLOG(3) << "caught exception " << exception_.Type() << ", rethrow it"; - ClearFetchOp(graph_, fetch_ops); - exception_.ReThrow(); -} - -} // namespace details -} // namespace framework -} // namespace paddle -#endif diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h deleted file mode 100644 index ac07eb9fa5d1b..0000000000000 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -#include // NOLINT -#include -#include // NOLINT -#include -#include -#include - -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/details/exception_holder.h" -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_executor.h" - -#if defined(PADDLE_WITH_XPU) -namespace paddle { -namespace framework { -class Scope; -namespace details { - -struct RunningItem { - std::atomic dep_num; - OpHandleBase *op; -}; - -class OpHandleBase; -class BindThreadedSSAGraphExecutor : public SSAGraphExecutor { - public: - BindThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph); - // FeedFetchList Run(const std::vector &fetch_tensors) override; - // Run a SSAGraph by a thread pool - // Use topological sort algorithm - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged) override; - const ir::Graph &Graph() const override; - - private: - FetchResultType RunMainStream(const std::vector &fetch_tensors, - bool return_merged); - - // Note(zcd): the ThreadPool should be placed last so that ThreadPool should - // be destroyed first. 
- ExecutionStrategy strategy_; - std::vector local_scopes_; - std::vector local_exec_scopes_; - std::vector places_; - ir::Graph *graph_; - - std::unordered_map op_deps_; - std::unordered_map place_to_index_; - std::vector bootstrap_ops_; - - std::unique_ptr stream_op_count_; - - std::future< - std::unique_ptr>> - atomic_op_deps_; - ExceptionHolder exception_; - - std::vector> pool_; - ::ThreadPool prepare_pool_; - ::ThreadPool multi_device_op_pool_; - - std::mutex mutex_; - std::condition_variable cv_; - uint32_t exec_op_count_; - std::atomic error_state; - - void RunOpAsyncMainStream( - OpHandleBase *op, - std::unordered_map *op_deps, - std::shared_ptr> ready_ops, - int index); - - void RunMultiDeviceOpAsync( - OpHandleBase *op, - std::unordered_map *op_deps, - std::shared_ptr> ready_ops); - - void PrepareAtomicOpDeps(); - - int get_pool_thread_index(int device_id); - - inline void ExecutionFinal(std::vector *fetch_ops); - - void InsertFetchOps( - const std::vector &fetch_tensors, - FetchResultType *fetches, - std::unordered_map> - *fetched_vars, - std::unordered_map *op_deps, - std::vector *fetch_ops, - std::vector *ready_fetch_ops, - bool return_merged); -}; -} // namespace details -} // namespace framework -} // namespace paddle - -#endif diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc deleted file mode 100644 index f947794ccdd05..0000000000000 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ /dev/null @@ -1,391 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
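The Note(zcd) comment above relies on the C++ guarantee that non-static data members are destroyed in reverse declaration order: declaring the thread pools after the state their tasks touch means the pools (and the worker threads they join) are torn down first, while that state is still alive. A small illustration with hypothetical types, not Paddle classes:

```cpp
#include <iostream>

struct Data {
  ~Data() { std::cout << "Data destroyed\n"; }
};

struct Worker {
  ~Worker() { std::cout << "Worker destroyed\n"; }
};

struct Executor {
  Data data;      // declared first -> destroyed last
  Worker worker;  // declared last  -> destroyed first, while data is still valid
};

int main() {
  Executor e;
  // Prints "Worker destroyed" then "Data destroyed" when e goes out of scope.
}
```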
-#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/fetch_async_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle::framework::details { - -FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph) - : strategy_(strategy), - local_scopes_(local_scopes), - local_exec_scopes_(local_exec_scopes), - places_(places), - graph_(graph), - op_deps_(), - bootstrap_ops_(), - fetch_ctxs_(), - remaining_(0), - atomic_op_deps_(), - pool_(nullptr), - // add one more thread for generate op_deps - prepare_pool_(1), - traced_ops_() { - platform::EmplaceDeviceContexts( - &fetch_ctxs_, - places, - /*disable_setting_default_stream_for_allocator=*/true, - /*stream_priority=*/0); - if (ir::IsTopologySortOperationsUnique(*graph_)) { - VLOG(10) << "Change thread number to 1 because the topology sort order is " - "unique"; - strategy_.num_threads_ = 1; - traced_ops_.clear(); - for (auto *op_node : TopologySortOperations(*graph_)) { - if (op_node->IsWrappedBy()) { - traced_ops_.emplace_back(&(op_node->Wrapper())); - } - } - } - pool_ = std::make_unique<::ThreadPool>(strategy.num_threads_); - for (auto &op : ir::FilterByNodeWrapper(*graph_)) { - int dep = static_cast(op->NotReadyInputSize()); - op_deps_.emplace(op, dep); - if (dep == 0) { - bootstrap_ops_.emplace_back(op); - } - } - PADDLE_ENFORCE_GT(op_deps_.size(), - 0, - platform::errors::PreconditionNotMet( - "The graph doesn't have operators.")); - PrepareAtomicOpDeps(); -} - -FetchResultType FastThreadedSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - VLOG(3) << "enter FastThreadedSSAGraphExecutor Run"; - std::unique_ptr event( - new platform::RecordEvent("FastThreadedSSAGraphExecutorPrepare", - platform::TracerEventType::UserDefined, - 2)); - std::unique_ptr>> - op_deps = atomic_op_deps_.get(); - PrepareAtomicOpDeps(); - size_t num_ops = op_deps->size(); - - FetchResultType fetches; - if (return_merged) { - fetches = FetchList(fetch_tensors.size()); - } else { - fetches = FetchUnmergedList(fetch_tensors.size()); - } - std::unordered_map> fetched_vars; - std::vector fetch_ops; - std::vector ready_fetch_ops; - exception_.Clear(); - InsertFetchOps(fetch_tensors, - &fetches, - &fetched_vars, - op_deps.get(), - &fetch_ops, - &ready_fetch_ops, - return_merged); - event.reset(nullptr); - if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) { - // If the num_threads is 1, we can record the order of operator's - // execution in the first iteration, and in subsequent iterations, - // run the recorded operators directly. This strategy could make the - // execution faster. 
- VLOG(3) << "Run the traced ops."; - bool is_exception_free = - RunTracedOps(traced_ops_) && RunTracedOps(fetch_ops); - if (!is_exception_free) { - ExecutionFinal(&fetch_ops); - } - } else { - traced_ops_.clear(); - remaining_ = 0; - auto complete_q = std::make_shared>(); - VLOG(3) << "number of bootstrap_ops_: " << bootstrap_ops_.size(); - VLOG(3) << "number of ready_fetch_ops: " << ready_fetch_ops.size(); - for (auto op : bootstrap_ops_) { - RunOpAsync(op_deps.get(), op, complete_q); - } - for (auto op : ready_fetch_ops) { - RunOpAsync(op_deps.get(), op, complete_q); - } - - size_t num_complete = 0; - while (num_complete != op_deps->size()) { - size_t num_comp = complete_q->Pop(); - if (num_comp == -1UL) { - int remaining = 0; - while (true) { - remaining = remaining_; - if (remaining == 0) { - break; - } - for (int i = 0; i < remaining; ++i) { - complete_q->Pop(); - } - } - if (exception_.IsCaught()) { - ExecutionFinal(&fetch_ops); - } - } - num_complete += num_comp; - } - } - // Wait FetchOps. - if (!fetch_ops.empty()) { - platform::RecordEvent record_wait( - "FastThreadedSSAGraphExecutor::WaitFetchOps", - platform::TracerEventType::Operator, - 1); - ClearFetchOp(graph_, &fetch_ops); - - for (auto &place : places_) { - fetch_ctxs_[place].get().get()->Wait(); - } - } - - return fetches; -} - -void FastThreadedSSAGraphExecutor::InsertFetchOps( - const std::vector &fetch_tensors, - FetchResultType *fetches, - std::unordered_map> *fetched_vars, - std::unordered_map> *op_deps, - std::vector *fetch_ops, - std::vector *ready_fetch_ops, - bool return_merged) { - std::unordered_set fetch_tensor_set(fetch_tensors.begin(), - fetch_tensors.end()); - for (auto &fetch_var_name : fetch_tensor_set) { - for (auto &var_map : graph_->Get(kGraphVars)) { - auto it = var_map.find(fetch_var_name); - if (it != var_map.end()) { - (*fetched_vars)[fetch_var_name].push_back(*it->second.rbegin()); - } - } - } - - for (size_t i = 0; i < fetch_tensors.size(); ++i) { - auto &var_name = fetch_tensors.at(i); - auto fetched_var_it = fetched_vars->find(var_name); - PADDLE_ENFORCE_NE( - fetched_var_it, - fetched_vars->end(), - platform::errors::PreconditionNotMet( - "Cannot find fetched variable(%s) in current computation graph. " - "Possible reasons are:\n" - " 1. The variable to be fetched is not defined in main program.\n" - " 2. The variable to be fetched is not an input or output of any " - "operator.\n" - " 3. Confirm that you have used the fetch `Variable` format " - "instead of the string literal('%s') in `fetch_list` parameter " - "when using `executor.run` method. 
In other words, the format of " - "`executor.run(fetch_list=[fetch_var])`(fetch_var is a Variable) " - "is recommended.", - var_name, - var_name)); - - auto &vars = fetched_var_it->second; - - ir::Node *fetch_node = - graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); - auto *op = new FetchAsyncOpHandle(fetch_node, - fetches, - i, - &local_scopes_, - &local_exec_scopes_, - return_merged); - fetch_ops->emplace_back(op); - - for (auto &p : places_) { - op->SetDeviceContext(p, fetch_ctxs_[p].get().get()); - } - - for (auto *var : vars) { - op->AddInput(var); - } - - for (auto *var : vars) { - auto *op = var->GeneratedOp(); - auto *compute_op = dynamic_cast(op); - if (compute_op) { - compute_op->SetLockAndRecordEventFree(false); - } - } - - int dep = static_cast(op->NotReadyInputSize()); - (*op_deps)[op] = dep; - if (dep == 0) { - ready_fetch_ops->emplace_back(op); - } - } -} - -bool FastThreadedSSAGraphExecutor::RunOp( - OpHandleBase *op, - const std::shared_ptr> &complete_q, - size_t *complete) { - RunOpSync(op); - if (LIKELY(!exception_.IsCaught())) { - if (LIKELY(!strategy_.dry_run_)) { - RecordOps(op); - } - ++(*complete); - return true; - } else { - --remaining_; - complete_q->Push(-1UL); - return false; - } -} - -void FastThreadedSSAGraphExecutor::RunOpAsync( - std::unordered_map> *op_deps, - OpHandleBase *op, - const std::shared_ptr> &complete_q) { - ++remaining_; - platform::RecordEvent record("WorkQueue::AddTask", - platform::TracerEventType::UserDefined, - 10 /*level*/); - this->pool_->enqueue([=] { - std::deque op_queue; - op_queue.push_front(op); - - size_t complete = 0; - while (!op_queue.empty()) { - OpHandleBase *op_to_run = op_queue.back(); - op_queue.pop_back(); - - // The Op involves data transfer of multiple devices may block other - // computations emit. For example: - // 1 step, queue=[Share, Allreduce], which Share is high priority - // 2 step, Share exec, pending_op=Grad, queue=[Allreduce, Grad] - // 3 step, Allreduce run with sync. Although Allreduce and Grad do not - // have topo dependency, but Grad must wait for Allreduce to complete - // before scheduling. - // In this scenario, calculation and communication may not overlap. - // Therefore, emit the op in the queue before running multi device op. - if (op_to_run->IsMultiDeviceTransfer()) { - while (!op_queue.empty()) { - OpHandleBase *post_op = op_queue.back(); - op_queue.pop_back(); - RunOpAsync(op_deps, post_op, complete_q); - } - } - VLOG(3) << "start to run op: " << op_to_run->Name(); - if (!RunOp(op_to_run, complete_q, &complete)) { - return; - } - auto &outputs = op_to_run->Outputs(); - op_to_run = nullptr; - for (auto &output : outputs) { - for (auto &pending_op : output->PendingOps()) { - std::atomic &deps = op_deps->at(pending_op); - if (deps.fetch_sub(1) != 1) continue; - - // NOTE(zjl): op with highest priority should run - // first without switching to another thread. 
- if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) { - op_queue.push_back(pending_op); - } else if (pending_op->IsMultiDeviceTransfer()) { - // multi device ops should be scheduled prior to computing ops - op_queue.push_front(pending_op); - } else { - if (op_to_run == nullptr) { - op_to_run = pending_op; - } else { - RunOpAsync(op_deps, pending_op, complete_q); - } - } - } - } - - if (op_to_run != nullptr) { - op_queue.push_front(op_to_run); - } - } - --remaining_; - complete_q->Push(complete); - }); -} - -void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { - atomic_op_deps_ = prepare_pool_.enqueue([&] { - auto *op_deps = new std::unordered_map>; - for (auto &pair : op_deps_) { - (*op_deps)[pair.first] = pair.second; - } - return std::unique_ptr< - std::unordered_map>>(op_deps); - }); -} - -const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; } - -void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { - if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) { - traced_ops_.emplace_back(op); - } -} - -void FastThreadedSSAGraphExecutor::ExecutionFinal( - std::vector *fetch_ops) { - VLOG(3) << "caught exception " << exception_.Type() << ", rethrow it"; - // NOTE: If a new exception occurs in this ClearFetchOp operation, it will - // cause the loss of exception triggered firstly not thrown. - // Instead, the cleanup operation should only be performed when an EOF - // exception is caught. If other exceptions are triggered, the ClearFetchOp - // should not be continued. - if (exception_.Type() == "EOF") { - ClearFetchOp(graph_, fetch_ops); - } - exception_.ReThrow(); -} - -bool FastThreadedSSAGraphExecutor::RunTracedOps( - const std::vector &traced_ops) { - for (auto &op : traced_ops) { - if (!RunOpSync(op)) return false; - } - return true; -} - -bool FastThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) { - try { - VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); - if (LIKELY(!strategy_.dry_run_)) { - op->Run(strategy_.use_device_); - } - VLOG(10) << op << " " << op->Name() << " Done "; - return true; - } catch (...) { - exception_.Catch(std::current_exception()); - return false; - } -} - -} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h deleted file mode 100644 index 11e137ba9637d..0000000000000 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
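Throughout these executors, scheduling is driven by a per-op atomic count of not-yet-ready inputs: when an op finishes, it decrements the count of each consumer, and whichever thread drops a count to zero (the fetch_sub(1) == 1 test above) enqueues that consumer. A single-threaded skeleton of that bookkeeping, with hypothetical types; in the real executors the decrement happens concurrently on worker threads:

```cpp
#include <atomic>
#include <queue>
#include <unordered_map>
#include <vector>

struct Node {
  std::vector<Node*> consumers;
};

void RunGraph(const std::vector<Node*>& nodes,
              std::unordered_map<Node*, std::atomic<int>>& deps) {
  std::queue<Node*> ready;
  for (Node* n : nodes) {
    if (deps.at(n).load() == 0) ready.push(n);  // bootstrap ops with no inputs
  }

  while (!ready.empty()) {
    Node* n = ready.front();
    ready.pop();
    // ... run the op here ...
    for (Node* consumer : n->consumers) {
      if (deps.at(consumer).fetch_sub(1) == 1) {
        ready.push(consumer);  // its last missing input just finished
      }
    }
  }
}
```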
- -#pragma once -#include - -#include -#include -#include -#include - -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/details/exception_holder.h" -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_executor.h" - -namespace paddle { -namespace framework { -class Scope; -namespace details { - -class OpHandleBase; -class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { - public: - FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph); - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged) override; - const ir::Graph &Graph() const override; - - private: - // Note(zcd): the ThreadPool should be placed last so that ThreadPool should - // be destroyed first. - ExecutionStrategy strategy_; - std::vector local_scopes_; - std::vector local_exec_scopes_; - std::vector places_; - ir::Graph *graph_; - - std::unordered_map op_deps_; - std::vector bootstrap_ops_; - - std::map>> - fetch_ctxs_; - std::atomic remaining_; - - std::future< - std::unique_ptr>>> - atomic_op_deps_; - ExceptionHolder exception_; - - std::unique_ptr<::ThreadPool> pool_; - ::ThreadPool prepare_pool_; - - std::vector traced_ops_; - - bool RunOp(OpHandleBase *op, - const std::shared_ptr> &complete_q, - size_t *complete); - - void RunOpAsync(std::unordered_map> *op_deps, - OpHandleBase *op, - const std::shared_ptr> &complete_q); - - void PrepareAtomicOpDeps(); - - inline void RecordOps(OpHandleBase *op); - - inline void ExecutionFinal(std::vector *fetch_ops); - - inline bool RunOpSync(OpHandleBase *op); - - bool RunTracedOps(const std::vector &traced_ops); - - void InsertFetchOps( - const std::vector &fetch_tensors, - FetchResultType *fetches, - std::unordered_map> - *fetched_vars, - std::unordered_map> *op_deps, - std::vector *fetch_ops, - std::vector *ready_fetch_ops, - bool return_merged); -}; -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 60c3f35a4f7f7..cca7d203df5da 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -21,8 +21,8 @@ #include #include +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -41,6 +41,12 @@ namespace paddle { namespace framework { namespace details { +struct VariableInfo { + std::string name_; + proto::VarType::Type type_; + bool persistable_; +}; + // all variable in each devices. // The outside vector is the device vector. Each element of this vector is a // map from variable name to variables. 
The variables, who have the same name, @@ -63,7 +69,7 @@ constexpr char kUseHierarchicalAllReduce[] = "use_hierarchical_allreduce"; typedef std::unordered_set GraphDepVars; constexpr char kGraphDepVars[] = "dep_vars"; -typedef std::unordered_map FusedVars; +typedef std::unordered_map FusedVars; constexpr char kFusedVars[] = "fused_vars"; constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@"; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc deleted file mode 100644 index 45660331c1202..0000000000000 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" - -#include -#include -#include - -#include "paddle/fluid/framework/ir/graph_helper.h" - -namespace paddle::framework::details { - -static std::vector> SeparateMultiDevicesGraph( - ir::Graph *graph, size_t place_num) { - std::vector> graphs; - graphs.reserve(place_num); - for (size_t i = 0; i < place_num; ++i) { - ProgramDesc empty; - graphs.emplace_back(std::make_unique(empty)); - auto &g = graphs.back(); - g->Set(kGraphVars, new GraphVars(1UL)); - g->Set(kGraphDepVars, new GraphDepVars); - auto &stale_ops = - graph->Get>(details::kStaleProgramOpDescs); - g->Erase(details::kStaleProgramOpDescs); - g->Set>(details::kStaleProgramOpDescs, - new std::vector(stale_ops)); - } - auto op_handles = ir::FilterByNodeWrapper(*graph); - - for (auto &op : op_handles) { - auto &dev_ctx = op->DeviceContext(); - auto &p = dev_ctx.begin()->first; - int dev_id = p.device; // NOLINT - auto &dev_dummys = graphs[dev_id]->Get(kGraphDepVars); - graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release()); - - for (auto &var : op->Inputs()) { - auto dummy_ptr = dynamic_cast(var); - if (dummy_ptr) { - dev_dummys.insert(var); - if (graph->Nodes().count(var->Node())) - graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release()); - } - } - for (auto &var : op->Outputs()) { - auto dummy_ptr = dynamic_cast(var); - if (dummy_ptr) { - dev_dummys.insert(var); - if (graph->Nodes().count(var->Node())) - graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release()); - } - } - } - - for (size_t dev_id = 0; dev_id < place_num; ++dev_id) { - auto &dev_vars = graphs[dev_id]->Get(kGraphVars)[0]; - auto &origin_vars = graph->Get(kGraphVars)[dev_id]; - for (auto &name_pair : origin_vars) { - dev_vars.emplace(name_pair.first, name_pair.second); - for (auto &version_pair : name_pair.second) { - if (graph->Nodes().count(version_pair->Node())) { - graphs[dev_id]->AddNode( - graph->RemoveNode(version_pair->Node()).release()); - } - } - } - } - - return graphs; -} - -ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector 
&places, - ir::Graph *graph) - // TODO(Yancey1989): Copying graphs is not safely since it deleted the - // attrs. - : ParallelSSAGraphExecutor( - strategy, - local_scopes, - local_exec_scopes, - places, - SeparateMultiDevicesGraph(graph, places.size())) {} - -ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - std::vector> graphs) - : strategy_(strategy), - local_scopes_(local_scopes), - pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), - places_(places), - graphs_(std::move(graphs)), - feed_status_(places.size(), FeedStatus::kNone) { - PADDLE_ENFORCE_EQ(places_.size(), - local_scopes_.size(), - platform::errors::InvalidArgument( - "The number of places and the number of local scopes " - "should be equal, but got number of places is %d and " - "number of local scopes is %d.", - places_.size(), - local_scopes_.size())); - - PADDLE_ENFORCE_EQ(places_.size(), - graphs_.size(), - platform::errors::InvalidArgument( - "Graph number does not match place number")); - - PADDLE_ENFORCE_GT( - places_.size(), - 0, - platform::errors::InvalidArgument("place number must be larger than 0")); - - auto seq_allreduce_pass = - ir::PassRegistry::Instance().Get("all_reduce_deps_pass"); - seq_allreduce_pass->Set(kUseHierarchicalAllReduce, new bool(false)); - for (auto &graph : graphs_) { - graph.reset(seq_allreduce_pass->Apply(graph.release())); - } - - // set the correct size of thread pool to each device. - strategy_.num_threads_ = strategy_.num_threads_ < places_.size() - ? 1UL - : strategy_.num_threads_ / places_.size(); - VLOG(1) << "set num_threads: " << strategy_.num_threads_ - << " to run the operators of the graph on each device."; - for (size_t i = 0; i < places.size(); ++i) { - executors_.emplace_back( - new details::FastThreadedSSAGraphExecutor(strategy_, - local_scopes_, - local_exec_scopes, - {places_[i]}, - graphs_.at(i).get())); - } -} - -std::vector ParallelSSAGraphExecutor::Graphs() { - std::vector result; - result.reserve(graphs_.size()); - for (auto &g : graphs_) { - result.emplace_back(g.get()); - } - return result; -} - -enum ExceptionStatus { kSuccess = 0, kEOF, kOther }; - -FetchResultType ParallelSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - size_t feed_num = std::count( - feed_status_.begin(), feed_status_.end(), FeedStatus::kHasFeed); - bool has_feed = (feed_num > 0); - - VLOG(10) << "Feed num " << feed_num; - - size_t place_num = places_.size(); - - std::vector> run_futures; - std::vector exception_status(place_num, - ExceptionStatus::kSuccess); - - std::vector fetch_data; - fetch_data.reserve(place_num); - exception_holder_.Clear(); - - for (size_t i = 0; i < place_num; ++i) { - auto call = [&, i]() -> FetchResultType { - try { - if (!support_partial_feed_ || !has_feed || - feed_status_[i] == FeedStatus::kHasFeed) { - return executors_[i]->Run(fetch_tensors, return_merged); - } - } catch (platform::EOFException &) { - exception_status[i] = ExceptionStatus::kEOF; - exception_holder_.Catch(std::current_exception()); - } catch (...) 
{ - exception_status[i] = ExceptionStatus::kOther; - exception_holder_.Catch(std::current_exception()); - } - - if (return_merged) { - return FetchList(); - } else { - return FetchUnmergedList(); - } - }; - - if (pool_) { - run_futures.emplace_back(pool_->enqueue(std::move(call))); - } else { - fetch_data.emplace_back(call()); - } - } - - if (pool_) { - for (auto &f : run_futures) { - fetch_data.emplace_back(f.get()); - } - } - - bool has_exception = exception_holder_.IsCaught(); - if (!support_partial_feed_ && has_exception) { - VLOG(10) << "Exception rethrow because partial feed is not supported"; - exception_holder_.ReThrow(); - } - - std::vector is_valid(place_num, true); - - if (support_partial_feed_) { - if (has_feed) { - for (size_t i = 0; i < place_num; ++i) { - if (feed_status_[i] == FeedStatus::kNone) { - is_valid[i] = false; - } else if (exception_status[i] != ExceptionStatus::kSuccess) { - PADDLE_ENFORCE_EQ(has_exception, - true, - platform::errors::InvalidArgument( - "Thread pool raises exception but not caught")); - VLOG(10) << "Exception rethrow because non-EOF exception raises when " - "feed is given"; - exception_holder_.ReThrow(); - } - } - } else { - for (size_t i = 0; i < place_num; ++i) { - if (exception_status[i] == ExceptionStatus::kOther) { - PADDLE_ENFORCE_EQ(has_exception, - true, - platform::errors::InvalidArgument( - "Thread pool raises exception but not caught")); - VLOG(10) << "Exception rethrow because non-EOF exception raises when " - "feed is not given"; - exception_holder_.ReThrow(); - } else if (exception_status[i] != ExceptionStatus::kSuccess) { - is_valid[i] = false; - } - } - } - } - - if (std::count(is_valid.begin(), is_valid.end(), true) == 0) { - PADDLE_ENFORCE_EQ(has_exception, - true, - platform::errors::InvalidArgument( - "Thread pool raises exception but not caught")); - VLOG(10) << "Raise exception because there is no success worker"; - exception_holder_.ReThrow(); - } - - if (return_merged) { - FetchList ret; - ret.reserve(fetch_tensors.size()); - for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { - std::vector lodtensor_ptrs; - lodtensor_ptrs.reserve(place_num); - std::vector lodtensorarray_ptrs; - lodtensorarray_ptrs.reserve(place_num); - for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { - if (!is_valid[scope_idx]) { - continue; - } - const auto &fetch_list = - PADDLE_GET_CONST(FetchList, fetch_data[scope_idx]); - if (data_is_lod_tensor(fetch_list[fetch_idx])) { - lodtensor_ptrs.push_back( - &(PADDLE_GET_CONST(phi::DenseTensor, fetch_list[fetch_idx]))); - } else { - lodtensorarray_ptrs.push_back( - &(PADDLE_GET_CONST(LoDTensorArray, fetch_list[fetch_idx]))); - } - } - if (!lodtensor_ptrs.empty()) { - phi::DenseTensor var; - MergeLoDTensor(&var, lodtensor_ptrs, platform::CPUPlace()); - ret.emplace_back(var); - } else { - LoDTensorArray var_array(lodtensorarray_ptrs[0]->size()); - for (size_t i = 0; i < lodtensorarray_ptrs[0]->size(); ++i) { - phi::DenseTensor var; - std::vector ptrs; - ptrs.reserve(lodtensor_ptrs.size()); - for (auto &lodtensorarray_ptr : lodtensorarray_ptrs) { - ptrs.push_back(&(lodtensorarray_ptr->at(i))); - } - MergeLoDTensor(&var, ptrs, platform::CPUPlace()); - var_array[i] = std::move(var); - } - ret.emplace_back(var_array); - } - } - return ret; - } else { - FetchUnmergedList ret; - ret.reserve(fetch_tensors.size()); - for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { - ret.emplace_back(); - for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); - 
++scope_idx) { - if (!is_valid[scope_idx]) { - continue; - } - const auto &fetch_list = - PADDLE_GET_CONST(FetchUnmergedList, fetch_data[scope_idx]); - PADDLE_ENFORCE_EQ( - fetch_list[fetch_idx].size(), - 1, - platform::errors::Fatal("Each place must have only one fetched " - "phi::DenseTensor/LoDTensorArray!")); - ret.back().emplace_back(fetch_list[fetch_idx][0]); - } - } - return ret; - } -} - -} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h deleted file mode 100644 index 3414c7361e040..0000000000000 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "ThreadPool.h" -#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace details { - -class ParallelSSAGraphExecutor final : public SSAGraphExecutor { - public: - enum FeedStatus { - kNone = 0, // No feed - kHasFeed = 1 // Has feed - }; - - public: - ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph); - - ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - std::vector> graphs); - - ~ParallelSSAGraphExecutor() final = default; - - const ir::Graph &Graph() const override { return *graphs_[0]; } - - std::vector Graphs(); - - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged) override; - - void SetHasFeed(size_t dev_idx, bool has_feed) { - feed_status_[dev_idx] = has_feed ? FeedStatus::kHasFeed : FeedStatus::kNone; - } - - void EnablePartialFeedSupport() { support_partial_feed_ = true; } - - bool SupportPartialFeed() const { return support_partial_feed_; } - - private: - ExecutionStrategy strategy_; - std::vector local_scopes_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; - std::vector places_; - std::vector> graphs_; - - std::vector> - executors_; - ExceptionHolder exception_holder_; - - bool support_partial_feed_{false}; - std::vector feed_status_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc deleted file mode 100644 index e3b3446209584..0000000000000 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/scope_buffered_monitor.h" - -#include "paddle/common/flags.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle::framework { -class Variable; -} // namespace paddle::framework - -COMMON_DECLARE_double(local_exe_sub_scope_limit); - -namespace paddle::framework::details { - -static constexpr double kMB = 1.0 / (1024.0 * 1024.0); - -static void GetTensors(Variable *var, - std::unordered_set *tensor_set) { - if (var->IsType() && - var->Get().IsInitialized()) { - tensor_set->insert(var->GetMutable()); - } else if (var->IsType() && - var->Get().value().IsInitialized()) { - tensor_set->insert(var->GetMutable()->mutable_value()); - } else if (var->IsType()) { - auto *tensor_arr = var->GetMutable(); - for (auto &t : *tensor_arr) { - if (t.IsInitialized()) { - tensor_set->insert(&t); - } - } - } -} - -static void GetTensors(Scope *scope, - std::unordered_set *tensor_set) { - for (auto &var_name : scope->LocalVarNames()) { - GetTensors(scope->FindVar(var_name), tensor_set); - } - - for (auto *kid : scope->kids()) { - GetTensors(kid, tensor_set); - } -} - -static size_t GetTensorMemorySize(Scope *scope, bool clear_cpu_tensor) { - std::unordered_set tensor_set; - GetTensors(scope, &tensor_set); - size_t memory_size = 0; - std::unordered_set allocation_set; - for (auto *tensor : tensor_set) { - if (clear_cpu_tensor && platform::is_cpu_place(tensor->place())) { - tensor->clear(); - } else { - auto allocation = tensor->Holder().get(); - if (!allocation_set.count(allocation)) { - memory_size += allocation->size(); - allocation_set.insert(allocation); - } - } - } - return memory_size; -} - -size_t GetScopeVarMemorySize(Scope *scope) { - return GetTensorMemorySize(scope, false /*clear_cpu_tensor*/); -} - -ScopeBufferedMonitor::ScopeBufferedMonitor( - const std::vector &places, - const std::vector &local_exec_scopes) - : places_(places), - local_exec_scopes_(local_exec_scopes), - pre_local_exec_scopes_(local_exec_scopes.size()), - post_local_exec_scopes_(local_exec_scopes.size()), - history_local_exec_scopes_() {} - -void ScopeBufferedMonitor::Apply(const std::function &callback, - bool has_fetch) { - std::unique_ptr pre_local_exec_scopes_event( - new platform::RecordEvent( - "ScopeBufferedMonitor::pre_local_exec_scopes_process", - platform::TracerEventType::UserDefined, - 2)); - for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { - pre_local_exec_scopes_.at(scope_id).clear(); - auto scopes = local_exec_scopes_.at(scope_id)->kids(); - VLOG(10) << "pre_local_exec_scopes[" << scope_id - << "] sub-scope: " << scopes.size(); - pre_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end()); - } - pre_local_exec_scopes_event.reset(); - - callback(); - - std::unique_ptr post_local_exec_scopes_event( - new platform::RecordEvent( - "ScopeBufferedMonitor::post_local_exec_scopes_process", - platform::TracerEventType::UserDefined, - 2)); - for (size_t 
scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { - post_local_exec_scopes_.at(scope_id).clear(); - auto scopes = local_exec_scopes_.at(scope_id)->kids(); - VLOG(10) << "post_local_exec_scopes[" << scope_id - << "] sub-scope: " << scopes.size(); - post_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end()); - } - - history_local_exec_scopes_.emplace_back(); - auto &incr_local_exec_scopes = history_local_exec_scopes_.back(); - incr_local_exec_scopes.resize(local_exec_scopes_.size()); - for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { - for (auto &scope : post_local_exec_scopes_.at(scope_id)) { - if (!pre_local_exec_scopes_.at(scope_id).count(scope)) { - incr_local_exec_scopes.at(scope_id).insert(scope); - } - } - - if (VLOG_IS_ON(10)) { - if (!incr_local_exec_scopes.at(scope_id).empty() && - FLAGS_local_exe_sub_scope_limit > 0) { - VLOG(10) - << "FLAGS_local_exe_sub_scope_limit is " - << FLAGS_local_exe_sub_scope_limit - << " MBytes now. If you don't need to limit the memory of local " - "execution scope, you should set " - "FLAGS_local_exe_sub_scope_limit=-1."; - } - std::stringstream out; - out << scope_id << " kids: "; - for (auto &scope : incr_local_exec_scopes.at(scope_id)) { - out << scope << ", "; - } - VLOG(10) << out.str(); - } - } - - size_t history_step = history_local_exec_scopes_.size(); - if (has_fetch && history_step >= 2) { - ClearHistoryLocalExecScopes(history_step - 1); - } - - // Delete CPU Memory - std::vector gpu_memory_size_per_gpu(places_.size()); - for (auto &scope_vec : history_local_exec_scopes_) { - for (size_t idx = 0; idx < scope_vec.size(); ++idx) { - for (auto &scope : scope_vec.at(idx)) { - gpu_memory_size_per_gpu.at(idx) += - GetTensorMemorySize(scope, true /*clear_cpu_tensor*/); - } - } - } - if (VLOG_IS_ON(8)) { - for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) { - VLOG(8) << "history local exec scopes contains " - << string::HumanReadableSize( - gpu_memory_size_per_gpu.at(idx)) // NOLINT - << " in " << places_.at(idx); - } - } - - if (FLAGS_local_exe_sub_scope_limit > 0) { - for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) { - if (gpu_memory_size_per_gpu.at(idx) / kMB >= // NOLINT - FLAGS_local_exe_sub_scope_limit) { - platform::DeviceContextPool::Instance().Get(places_.at(idx))->Wait(); - local_exec_scopes_.at(idx)->DropKids(); - } - for (auto &scope_vec : history_local_exec_scopes_) { - scope_vec.at(idx).clear(); - } - } - } -} - -void ScopeBufferedMonitor::ClearHistoryLocalExecScopes(size_t history_step) { - VLOG(10) << "delete pre_incr_local_exec_scopes."; - for (size_t i = 0; i < history_step; ++i) { - auto &pre_incr_local_exec_scopes = history_local_exec_scopes_.front(); - for (size_t scope_idx = 0; scope_idx < pre_incr_local_exec_scopes.size(); - ++scope_idx) { - for (auto scope : pre_incr_local_exec_scopes[scope_idx]) { - local_exec_scopes_.at(scope_idx)->DeleteScope(scope); - } - } - history_local_exec_scopes_.pop_front(); - } -} - -void ScopeBufferedMonitor::ClearHistoryLocalExecScopes() { - history_local_exec_scopes_.clear(); -} - -} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.h b/paddle/fluid/framework/details/scope_buffered_monitor.h deleted file mode 100644 index 3a94534eff458..0000000000000 --- a/paddle/fluid/framework/details/scope_buffered_monitor.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/scope.h" - -namespace paddle { -namespace framework { -namespace details { - -class ScopeBufferedMonitor { - public: - ScopeBufferedMonitor(const std::vector &places, - const std::vector &local_exec_scopes); - - void Apply(const std::function &callback, bool has_fetch); - - void ClearHistoryLocalExecScopes(); - - void ClearHistoryLocalExecScopes(size_t history_step); - - private: - std::vector places_; - std::vector local_exec_scopes_; - std::vector> pre_local_exec_scopes_; - std::vector> post_local_exec_scopes_; - std::deque>> - history_local_exec_scopes_; -}; - -size_t GetScopeVarMemorySize(Scope *scope); - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc deleted file mode 100644 index 1db2fff2b556d..0000000000000 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ /dev/null @@ -1,234 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" - -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle { -namespace framework { -namespace details { - -ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( - ExecutionStrategy strategy, - std::vector local_scopes, - std::vector local_exec_scopes, - std::vector var_infos, - std::vector places, - std::unique_ptr &&underlying_executor) - : strategy_(strategy), - underlying_executor_(std::move(underlying_executor)), - local_scopes_(std::move(local_scopes)), - local_exec_scopes_(std::move(local_exec_scopes)), - preserve_vars_(), - tmp_var_infos_(), - tensor_array_vars_(), - var_infos_(std::move(var_infos)), - places_(std::move(places)), - scope_monitor_(places_, local_exec_scopes_) { - PADDLE_ENFORCE_EQ( - local_scopes_.size(), - local_exec_scopes_.size(), - platform::errors::InvalidArgument( - "The number of local scopes and the number of local execution scopes " - "should be equal, but got number of local scopes is %d and " - "number of local execution scopes is %d.", - local_scopes_.size(), - local_exec_scopes_.size())); - PrepareLocalExeScopes(); -} - -static void RunProgramDescs(const ProgramDescs &programs, - const std::vector &local_exec_scopes, - const std::vector &places) { - for (auto &program : programs) { - for (auto &op_desc : program.Block(0).AllOps()) { - for (size_t i = 0; i < local_exec_scopes.size(); ++i) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_exec_scopes[i], places[i]); - } - } - } -} - -FetchResultType ScopeBufferedSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::IsCUDAGraphCapturing()) { - strategy_.num_iteration_per_drop_scope_ = - std::numeric_limits::max(); - DropLocalExeScopes(/*need_wait=*/false); - } -#endif - - if (drop_scope_counter_ == 0) { - platform::RecordEvent e( - "InitLocalVars", platform::TracerEventType::UserDefined, 2); - InitVariables(); - } - - FetchResultType fetch_data; - std::exception_ptr eptr = nullptr; - - auto exe_run_func = [&]() { - try { - fetch_data = underlying_executor_->Run(fetch_tensors, return_merged); - } catch (...) 
{ - eptr = std::current_exception(); - } - }; - - if (strategy_.num_iteration_per_drop_scope_ == 1) { - exe_run_func(); - } else { - scope_monitor_.Apply(exe_run_func, !fetch_tensors.empty()); - } - - if (VLOG_IS_ON(5)) { - for (auto *scope : local_exec_scopes_) { - VLOG(5) << "Left " - << string::HumanReadableSize( - GetScopeVarMemorySize(scope)) // NOLINT - << " on scope " << scope << " before deleting"; - } - } - - ++drop_scope_counter_; - if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ || - DropScopeOrNot()) { - DropLocalExeScopes(!platform::IsCUDAGraphCapturing()); - } - - if (VLOG_IS_ON(5)) { - for (auto *scope : local_exec_scopes_) { - VLOG(5) << "Left " - << string::HumanReadableSize( - GetScopeVarMemorySize(scope)) // NOLINT - << " on scope " << scope << " after deleting"; - } - } - - if (eptr) { - std::rethrow_exception(eptr); - } else { - return fetch_data; - } -} - -bool ScopeBufferedSSAGraphExecutor::DropScopeOrNot() const { - for (auto &var : tensor_array_vars_) { - auto tensor_array = var->GetMutable(); - for (phi::DenseTensor &tensor : *tensor_array) { - if (tensor.IsInitialized()) { - return true; - } - } - tensor_array->clear(); - } - return false; -} - -void ScopeBufferedSSAGraphExecutor::InitVariables() { - for (auto &info : tmp_var_infos_) { - for (auto &pair : info) { - InitializeVariable(pair.first, pair.second); - } - } - - const ir::Graph &graph = Graph(); - if (!is_initialized_) { - // startup_program_descs only need to be executed once - if (graph.Has(details::kStartupProgramDescs)) { - auto &program_descs = - graph.Get(details::kStartupProgramDescs); - RunProgramDescs(program_descs, local_exec_scopes_, places_); - } - is_initialized_ = true; - } - - if (graph.Has(details::kProgramDescs)) { - auto &program_descs = - graph.Get(details::kProgramDescs); - RunProgramDescs(program_descs, local_exec_scopes_, places_); - } -} - -void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) { - platform::RecordEvent drop_scope_event( - "DropLocalExeScopes", platform::TracerEventType::UserDefined, 2); - drop_scope_counter_ = 0; - if (need_wait) { - for (auto &p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - } - scope_monitor_.ClearHistoryLocalExecScopes(); - for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { - local_exec_scopes_[i]->EraseVarsExcept(preserve_vars_[i]); - local_exec_scopes_[i]->DropKids(); - for (auto &preserve_var : preserve_vars_[i]) { - preserve_var->Clear(); - } - VLOG(3) << "Drop local execution scope: " << local_scopes_[i]; - } -} - -void ScopeBufferedSSAGraphExecutor::PrepareLocalExeScopes() { - // Create local scopes. 
- preserve_vars_.resize(local_scopes_.size()); - tmp_var_infos_.resize(local_scopes_.size()); - - for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { - size_t idx = local_scopes_.size() - 1 - (it - local_scopes_.rbegin()); - auto *scope = local_scopes_[idx]; - auto *local_scope = local_exec_scopes_[idx]; - - for (auto &info : var_infos_) { - if (info.persistable_) { // Persistable - auto var = scope->FindVar(info.name_); - if (var != nullptr) { - VLOG(2) - << info.name_ - << " has been initialized beforehand in global scope, skipped"; - continue; - } - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - Variable *tmp_var = local_scope->Var(info.name_); - preserve_vars_[idx].emplace(tmp_var); - tmp_var_infos_[idx].emplace_back(tmp_var, info.type_); - if (info.type_ == proto::VarType::LOD_TENSOR_ARRAY) { - tensor_array_vars_.emplace_back(tmp_var); - } - } - } - } -} - -bool ScopeBufferedSSAGraphExecutor::NeedCreateLocalExeScope() { - return drop_scope_counter_ == 0; -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h deleted file mode 100644 index 6e64b486d29ef..0000000000000 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/scope_buffered_monitor.h" -#include "paddle/fluid/framework/details/ssa_graph_executor.h" -#include "paddle/fluid/framework/details/var_handle.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/place.h" -namespace paddle { -namespace framework { -namespace details { - -struct VariableInfo { - std::string name_; - proto::VarType::Type type_; - bool persistable_; -}; - -class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { - public: - ScopeBufferedSSAGraphExecutor( - ExecutionStrategy strategy, - std::vector local_scopes, - std::vector local_exec_scopes, - std::vector var_infos, - std::vector places, - std::unique_ptr&& underlying_executor); - - const ir::Graph& Graph() const override { - return underlying_executor_->Graph(); - } - - FetchResultType Run(const std::vector& fetch_tensors, - bool return_merged) override; - - void DropLocalExeScopes(bool need_wait = true); - - bool NeedCreateLocalExeScope(); - - void PrepareLocalExeScopes(); - - private: - void InitVariables(); - - bool DropScopeOrNot() const; - - bool is_initialized_{false}; - size_t drop_scope_counter_{0}; - ExecutionStrategy strategy_; - std::unique_ptr underlying_executor_; - std::vector local_scopes_; - - std::vector local_exec_scopes_; - std::vector> preserve_vars_; - std::vector>> - tmp_var_infos_; - - std::vector tensor_array_vars_; - - std::vector var_infos_; - std::vector places_; - - ScopeBufferedMonitor scope_monitor_; -}; -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc deleted file mode 100644 index 6ade32097bd82..0000000000000 --- a/paddle/fluid/framework/details/ssa_graph_executor.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/ssa_graph_executor.h" - -#include "paddle/fluid/framework/details/fetch_async_op_handle.h" - -namespace paddle { -namespace framework { -namespace details { -SSAGraphExecutor::~SSAGraphExecutor() = default; - -void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops) { - if (fetch_ops->empty()) return; - - for (auto& op : *fetch_ops) { - PADDLE_ENFORCE_EQ(dynamic_cast(op) != nullptr || - dynamic_cast(op) != nullptr, - true, - platform::errors::PreconditionNotMet( - "The input ops of ClearFetchOp function should be " - "FetchOpHandle or FetchAsyncOpHandle.")); - for (auto& out_var : op->Node()->outputs) { - graph->RemoveNode(out_var); - } - for (auto& in_var : op->Inputs()) { - in_var->RemoveOutput(op, op->Node()); - } - graph->RemoveNode(op->Node()); - } - fetch_ops->clear(); -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h deleted file mode 100644 index 0ac46bbc4da25..0000000000000 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/details/fetch_op_handle.h" -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace details { -class SSAGraphExecutor { - DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); - - public: - SSAGraphExecutor() {} - - virtual ~SSAGraphExecutor(); - - virtual const ir::Graph& Graph() const = 0; - - virtual FetchResultType Run(const std::vector& fetch_tensors, - bool return_merged = true) = 0; -}; - -void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops); -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc deleted file mode 100644 index 6697a33e3e1d6..0000000000000 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ /dev/null @@ -1,411 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" - -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -#if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" -#endif - -namespace paddle { -namespace framework { -namespace details { -ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph) - : graph_(graph), - local_scopes_(local_scopes), - local_exec_scopes_(local_exec_scopes), - places_(places), - fetch_ctxs_(), - op_deps_(nullptr), - op_deps_futures_(), - strategy_(strategy), - run_op_futures_(), - prepare_pool_(1), - pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) - : nullptr), - traced_ops_() { - platform::EmplaceDeviceContexts( - &fetch_ctxs_, - places, - /*disable_setting_default_stream_for_allocator=*/true, - /*stream_priority=*/0); - - if (strategy_.num_iteration_per_run_ > 1) { - int read_op_num = 0; - for (auto *node : graph_->Nodes()) { - if (node->IsOp() && node->Name() == "read") { - read_op_num++; - } - } - if (read_op_num == 0) { - LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model " - "should use pyreader to feed data!"; - } - } - PrepareOpDeps(); - CopyOpDeps(); -} - -inline FetchResultType ThreadedSSAGraphExecutor::RunImpl( - const std::vector &fetch_tensors, bool return_merged) { - std::unique_ptr event( - new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", - platform::TracerEventType::UserDefined, - 2)); - std::unique_ptr op_deps = op_deps_futures_.get(); - CopyOpDeps(); - - VLOG(10) << "ThreadedSSAGraphExecutor::Run"; - std::shared_ptr> ready_vars( - new BlockingQueue); - auto &pending_ops = op_deps->pending_ops_; - auto &pending_vars = op_deps->pending_vars_; - auto &ready_ops = op_deps->ready_ops_; - size_t num_ops = op_deps->num_ops_; - - // Step 2. Insert FetchOps - std::vector fetch_ops; - std::unordered_set fetch_dependencies; - FetchResultType fetch_data; - if (return_merged) { - fetch_data = FetchList(fetch_tensors.size()); - } else { - fetch_data = FetchUnmergedList(fetch_tensors.size()); - } - - InsertFetchOps(fetch_tensors, - &fetch_ops, - &fetch_dependencies, - &ready_ops, - &pending_ops, - &pending_vars, - &fetch_data, - return_merged); - - exception_holder_.Clear(); - event.reset(nullptr); - - // Step 3. Execution - if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) { - // If the num_threads is 1, we can record the order of operator's - // execution in the first iteration, and in subsequent iterations, - // run the recorded operators directly. This strategy could make the - // execution faster. - VLOG(3) << "Run the traced ops."; - bool is_exception_free = - RunTracedOps(traced_ops_) && RunTracedOps(fetch_ops); - if (!is_exception_free) { - ExecutionFinal(&fetch_ops); - } - } else { - traced_ops_.clear(); - auto run_all_ops = [&](std::unordered_set &set) { - for (auto *op : set) { - RunOp(ready_vars, op); - } - set.clear(); - }; - // Clean run context - run_op_futures_.clear(); - - while (!pending_vars.empty()) { - // 1. Run All Ready ops - // Keep loop until all vars are ready. - run_all_ops(ready_ops); - - // 2. 
Find ready variable - bool timeout = false; - auto cur_ready_vars = ready_vars->PopAll(1, &timeout); - if (timeout) { - for (auto &run_op_future : run_op_futures_) { - run_op_future.wait(); - } - if (exception_holder_.IsCaught()) { - ExecutionFinal(&fetch_ops); - } else { - continue; - } - } - - // 3. Remove the dependency of ready_var. - // Find the ready_ops after the ready_var. - for (auto ready_var : cur_ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->PendingOps()) { - auto &deps = pending_ops[op]; - --deps; - if (deps == 0) { - ready_ops.insert(op); - } - } - } - } - PADDLE_ENFORCE_EQ( - ready_ops.empty(), - true, - platform::errors::Fatal("After the execution of computation graph, " - "there are unexecuted operators left.")); - } - - // Wait FetchOps. - ClearFetchOp(graph_, &fetch_ops); - - return fetch_data; -} - -FetchResultType ThreadedSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) { - RunImpl({}, return_merged); - } - return RunImpl(fetch_tensors, return_merged); -} - -void ThreadedSSAGraphExecutor::InsertFetchOps( - const std::vector &fetch_tensors, - std::vector *fetch_ops, - std::unordered_set *fetch_dependencies, - std::unordered_set *ready_ops, - std::unordered_map *pending_ops, - std::unordered_set *pending_vars, - FetchResultType *fetch_data, - bool return_merged) { - std::unordered_map> fetched_vars; - std::unordered_set local_ready_vars; - - for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->Get(details::kGraphVars)) { - auto it = var_map.find(fetch_var_name); - if (it != var_map.end()) { - fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin()); - } - } - } - - for (size_t i = 0; i < fetch_tensors.size(); ++i) { - auto &var_name = fetch_tensors[i]; - auto fetched_var_it = fetched_vars.find(var_name); - PADDLE_ENFORCE_NE( - fetched_var_it, - fetched_vars.end(), - platform::errors::PreconditionNotMet( - "Cannot find fetched variable(%s) in current computation graph. " - "Possible reasons are:\n" - " 1. The variable to be fetched is not defined in main program.\n" - " 2. The variable to be fetched is not an input or output of any " - "operator.\n" - " 3. Confirm that you have used the fetch `Variable` format " - "instead of the string literal('%s') in `fetch_list` parameter " - "when using `executor.run` method. 
In other words, the format of " - "`executor.run(fetch_list=[fetch_var])`(fetch_var is a Variable) " - "is recommended.", - var_name, - var_name)); - - auto &vars = fetched_var_it->second; - - ir::Node *fetch_node = - graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); - auto *op = new FetchOpHandle(fetch_node, - fetch_data, - i, - &local_scopes_, - &local_exec_scopes_, - return_merged); - fetch_ops->emplace_back(op); - - for (auto &p : places_) { - op->SetDeviceContext(p, fetch_ctxs_[p].get().get()); - } - - for (auto *var : vars) { - op->AddInput(var); - } - - ir::Node *fetch_var = - graph_->CreateEmptyNode("fetch", ir::Node::Type::kVariable); - auto *fetch_dummy = new DummyVarHandle(fetch_var); - op->AddOutput(fetch_dummy); - fetch_dependencies->emplace(fetch_dummy); - - this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy); - - size_t wait_input_num = 0; - std::unordered_set input_set(vars.begin(), vars.end()); - for (auto *var : input_set) { - if (pending_vars->count(var)) { - ++wait_input_num; - } - } - if (wait_input_num) { - pending_ops->insert({op, wait_input_num}); - } else { - ready_ops->insert(static_cast(op)); - } - } - PADDLE_ENFORCE_EQ( - local_ready_vars.size(), - 0, - platform::errors::Fatal( - "The number of ready variables should be 0, but got %d.", - local_ready_vars.size())); -} - -void ThreadedSSAGraphExecutor::InsertPendingOp( - std::unordered_map *pending_ops, - OpHandleBase *op_instance) const { - pending_ops->insert({op_instance, op_instance->NoDupInputSize()}); -} - -void ThreadedSSAGraphExecutor::InsertPendingVar( - std::unordered_set *pending_vars, - std::unordered_set *ready_vars, - VarHandleBase *var) const { - pending_vars->insert(var); - if (var->GeneratedOp() == nullptr) { - ready_vars->insert(var); - } -} - -void ThreadedSSAGraphExecutor::PrepareOpDeps() { - op_deps_ = std::make_unique(); - std::unordered_map &pending_ops = - op_deps_->pending_ops_; - std::unordered_set &pending_vars = op_deps_->pending_vars_; - std::unordered_set &ready_ops = op_deps_->ready_ops_; - std::unordered_set ready_vars; - - // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->Get(details::kGraphVars)) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - InsertPendingVar(&pending_vars, &ready_vars, version_pair); - } - } - } - for (auto &var : graph_->Get(details::kGraphDepVars)) { - InsertPendingVar(&pending_vars, &ready_vars, var); - } - - for (auto &op : ir::FilterByNodeWrapper(*graph_)) { - if (op->Inputs().empty()) { // Special case, Op has no input. 
- ready_ops.insert(op); - } else { - InsertPendingOp(&pending_ops, op); - } - } - op_deps_->num_ops_ = ready_ops.size() + pending_ops.size(); - PADDLE_ENFORCE_GT( - op_deps_->num_ops_, - 0, - platform::errors::InvalidArgument("The graph doesn't have operators.")); - - for (auto ready_var : ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->PendingOps()) { - auto &deps = pending_ops[op]; - --deps; - if (deps == 0) { - ready_ops.insert(op); - } - } - } -} - -void ThreadedSSAGraphExecutor::CopyOpDeps() { - op_deps_futures_ = prepare_pool_.enqueue([&] { - auto *op_deps = new OpDependentData(); - op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(), - op_deps_->pending_ops_.end()); - op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(), - op_deps_->pending_vars_.end()); - op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(), - op_deps_->ready_ops_.end()); - op_deps->num_ops_ = op_deps_->num_ops_; - return std::unique_ptr(op_deps); - }); -} - -void ThreadedSSAGraphExecutor::RunOp( - const std::shared_ptr> &ready_var_q, - details::OpHandleBase *op) { - auto op_run = [ready_var_q, op, this] { - RunOpSync(op); - try { - ready_var_q->Extend(op->Outputs()); - VLOG(10) << op << " " << op->Name() << " Signal posted"; - } catch (...) { - exception_holder_.Catch(std::current_exception()); - } - }; - - if (pool_) { - run_op_futures_.emplace_back(pool_->enqueue(op_run)); - } else { - op_run(); - } - - RecordOps(op); -} - -bool ThreadedSSAGraphExecutor::RunTracedOps( - const std::vector &traced_ops) { - for (auto &op : traced_ops) { - if (!RunOpSync(op)) return false; - } - return true; -} - -bool ThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) { - try { - VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); - if (LIKELY(!strategy_.dry_run_)) { - op->Run(strategy_.use_device_); - } - VLOG(10) << op << " " << op->Name() << " Done "; - return true; - } catch (...) { - exception_holder_.Catch(std::current_exception()); - return false; - } -} - -void ThreadedSSAGraphExecutor::ExecutionFinal( - std::vector *fetch_ops) { -#if defined PADDLE_WITH_PSCORE - if (strategy_.thread_barrier_) { - paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement(); - } -#endif - VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it"; - ClearFetchOp(graph_, fetch_ops); - exception_holder_.ReThrow(); -} - -void ThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { - if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) { - traced_ops_.emplace_back(op); - } -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h deleted file mode 100644 index 0633bffd5bdfb..0000000000000 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include // ThreadPool in thrird party - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/details/exception_holder.h" -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/fetch_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/details/ssa_graph_executor.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -class Scope; - -namespace details { - -struct OpDependentData { - std::unordered_map pending_ops_; - std::unordered_set pending_vars_; - std::unordered_set ready_ops_; - size_t num_ops_{0}; -}; - -class ThreadedSSAGraphExecutor final : public SSAGraphExecutor { - public: - ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph); - - const ir::Graph &Graph() const override { return *graph_; } - // Run a SSAGraph by a thread pool - // Use topological sort algorithm - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged) override; - - ~ThreadedSSAGraphExecutor() final = default; - - private: - inline FetchResultType RunImpl(const std::vector &fetch_tensors, - bool return_merged); - void RunOp(const std::shared_ptr> &ready_var_q, - details::OpHandleBase *op); - - private: - // Note(zcd): the ThreadPool should be placed last so that ThreadPool should - // be destroyed first. - ir::Graph *graph_; - std::vector local_scopes_; - std::vector local_exec_scopes_; - - std::vector places_; - std::map>> - fetch_ctxs_; - - ExceptionHolder exception_holder_; - std::unique_ptr op_deps_; - std::future> op_deps_futures_; - ExecutionStrategy strategy_; - // use std::list because clear(), push_back, and for_each are O(1) - std::list> run_op_futures_; - ::ThreadPool prepare_pool_; - std::unique_ptr<::ThreadPool> pool_; - std::vector traced_ops_; - - void InsertPendingOp(std::unordered_map *pending_ops, - OpHandleBase *op_instance) const; - - void InsertPendingVar(std::unordered_set *pending_vars, - std::unordered_set *ready_vars, - VarHandleBase *var) const; - - void InsertFetchOps(const std::vector &fetch_tensors, - std::vector *fetch_ops, - std::unordered_set *fetch_dependencies, - std::unordered_set *ready_ops, - std::unordered_map *pending_ops, - std::unordered_set *pending_vars, - FetchResultType *fetch_data, - bool return_merged); - - void PrepareOpDeps(); - - void CopyOpDeps(); - - inline void RecordOps(OpHandleBase *op); - - inline void ExecutionFinal(std::vector *fetch_ops); - - inline bool RunOpSync(OpHandleBase *op); - - bool RunTracedOps(const std::vector &traced_ops); -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 97bcf41845039..1f62e57faf119 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -38,116 +38,6 @@ class ProgramDesc; namespace paddle::framework::details { -static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) { - framework::ExecutionStrategy execution_strategy; - - auto device_type = platform::Place2DeviceType(place); - switch 
(device_type) { - case platform::DeviceType::CPU: { - execution_strategy.num_threads_ = 2; - break; - } - case platform::DeviceType::CUDA: { // NOLINT - // NOTE: According experiments, one thread is faster in - // most model training. - execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::XPU: { - execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::IPU: { - execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::CUSTOM_DEVICE: { - execution_strategy.num_threads_ = 1; - break; - } - default: - PADDLE_THROW(platform::errors::Unavailable("Unsupported Device type %d.", - device_type)); - } - execution_strategy.use_device_ = device_type; - - return execution_strategy; -} - -void AppendSkipDeletionVars(const std::vector &append_vars, - std::vector *all_vars) { - for (auto &var : append_vars) { - all_vars->emplace_back(var); - } -} - -/* - * NOTE(Aurelius84): In ParallelExecutor, memory optimized pass will be applied. - * To avoid eagerly deleting last alive variables which are necessary in - * backward program, we firstly parse these variable names as - * skip_eager_vars. While executing pe.run skip_eager_vars are used to - * skip memory optimization. - * - * Variables satisfying the following rules are considered as skip_eager_var: - * - * 1. it is an output var in run_program_op - * 2. it is an input var used in backward_op - */ -void ParseSafeEagerDeletionSkipVars( - const ProgramDesc &program, - int64_t forward_op_nums, - const std::vector &output_var_names, - std::vector *skip_eager_delete_vars) { - auto all_ops = program.Block(0).AllOps(); - auto &op_info_map = OpInfoMap::Instance(); - // NOTE: skip `shape` and `fill_constant` op created by - // fluid.backward.gradients, one forward output will generate one `shape` - // and `fill_constant`. - size_t backward_op_start_index = - forward_op_nums + (output_var_names.size() * 2); - - // step 2: parse the necessary variable of backward op - std::unordered_set op_outputs; - std::unordered_set op_inputs; - std::unordered_set no_need_buffer_ins; - - for (auto i = backward_op_start_index; i < all_ops.size(); ++i) { - framework::OpDesc *op = all_ops[i]; - // NOTE: skip NoNeedBufferVars of grad_op and GC its memory in advance. - auto &op_info = op_info_map.Get(op->Type()); - auto &inferer = op_info.NoNeedBufferVarsInferer(); - no_need_buffer_ins.clear(); - if (inferer != nullptr) { - no_need_buffer_ins = - inferer(op->Inputs(), op->Outputs(), op->GetAttrMap()); - } - for (auto &in_names : op->Inputs()) { - if (no_need_buffer_ins.count(in_names.first) == 0) { - for (auto &in_name : in_names.second) { - op_inputs.emplace(in_name); - } - } else { - VLOG(2) << op->Type() << " has no_need_buffer_in: " << in_names.first - << " , skip it."; - } - } - - for (const std::string &out_arg_name : op->OutputArgumentNames()) { - op_outputs.emplace(out_arg_name); - } - } - // For the grad op input variables, if it is not output of grad_op, it may - // be output of forward op and we should set the variables as skip_var to - // prevent it being deleted when grad op is called multiple times. 
- for (const std::string &var_name : op_inputs) { - if (op_outputs.find(var_name) == op_outputs.end()) { - VLOG(2) << "skip eager var: " << var_name; - skip_eager_delete_vars->emplace_back(var_name); - } - } - VLOG(3) << "Found skip_eager_delete_vars: " << skip_eager_delete_vars->size(); -} - void AppendSkipDeletionVars(const std::vector &append_vars, std::set *all_vars) { for (auto &var : append_vars) { @@ -214,89 +104,6 @@ int64_t hash_with_seed(int64_t value, int64_t seed) { return value + 0x9e3779b9 + (value << 6) + (seed >> 2); } -ExecutorInfoCache &ExecutorInfoCache::Instance() { - static ExecutorInfoCache g_exe_cache_info_map; - return g_exe_cache_info_map; -} - -static PEAndGraphPair CreateExecutorInfo( - const ProgramDesc &program_desc, - const platform::Place &place, - int64_t start_op_index, - int64_t end_op_index, - framework::Scope *scope, - const details::BuildStrategy &build_strategy) { - auto execution_strategy = details::GetExecutionStrategy(place); - auto graph = std::make_shared( - program_desc, start_op_index, end_op_index); - auto parallel_executor = std::make_shared( - place, scope, execution_strategy, build_strategy, graph.get()); - parallel_executor->PrepareVariables(scope); - return std::make_pair(parallel_executor, graph); -} - -PEAndGraphPair CreateFixOrderExecutorInfo(const ProgramDesc &program_desc, - const platform::Place &place, - int64_t start_op_index, - int64_t end_op_index, - framework::Scope *scope) { - details::BuildStrategy build_strategy; - build_strategy.fix_op_run_order_ = true; - auto pe_and_graph = CreateExecutorInfo( - program_desc, place, start_op_index, end_op_index, scope, build_strategy); - return pe_and_graph; -} - -CacheInfo GetExecutorInfoFromCache(const ProgramDesc &program_desc, - const platform::Place &place, - int64_t start_op_index, - int64_t end_op_index, - bool is_grad, - int64_t program_id, - framework::Scope *scope) { - auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); - - if (!cached_exe_info.Has(program_id, is_grad)) { - // TODO(Aurelius84): Consider to use LRU algorithm to replace this. - if (cached_exe_info.Size() > 4u /* max_cached_size*/) { - VLOG(2) << "The cached info size has exceeded max_cached_size: 4, clear " - "all cache!"; - cached_exe_info.Finalize(); - } - - VLOG(1) << "create exe_info for " << program_id << " is_grad: " << is_grad; - auto &build_strategy = cached_exe_info.GetBuildStrategy(program_id); - - // 2. Construct Graph and ParallelExecutor. - auto pe_and_graph = CreateExecutorInfo(program_desc, - place, - start_op_index, - end_op_index, - scope, - build_strategy); - - // 3. Insert value into cached map. 
- auto &cached_value = cached_exe_info.GetMutable(program_id, is_grad); - cached_value.executor_ = pe_and_graph.first; - cached_value.graph_ = pe_and_graph.second; - return std::make_pair(pe_and_graph.first, true); - } else { - VLOG(1) << "get exe_info from cache by: " << program_id - << " is_grad: " << is_grad; - auto &cached_value = cached_exe_info.GetMutable(program_id, is_grad); - - auto ¶llel_executor = cached_value.executor_; - // update op_handle scope_map in pe->executor_->Graph - std::unordered_map scope_map = { - {parallel_executor->GetLocalScopes().front(), scope}}; - parallel_executor->ResetOpHandleScopeMapOfGraphs(scope_map); - // need to recreate tmp variables in new scope - parallel_executor->PrepareVariables(scope); - - return std::make_pair(parallel_executor, false); - } -} - InterpreterCoreInfoCache &InterpreterCoreInfoCache::Instance() { static InterpreterCoreInfoCache g_info_cache; return g_info_cache; diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index 1e5136892d13f..5643fe3b75198 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -24,8 +24,8 @@ #include #include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/macros.h" #include "paddle/utils/string/string_helper.h" @@ -46,15 +46,6 @@ class Graph; class InterpreterCore; namespace details { -void AppendSkipDeletionVars(const std::vector& append_vars, - std::vector* all_vars); - -void ParseSafeEagerDeletionSkipVars( - const ProgramDesc& program, - int64_t forward_op_nums, - const std::vector& output_var_names, - std::vector* skip_eager_delete_vars); - void AppendSkipDeletionVars(const std::vector& append_vars, std::set* all_vars); @@ -65,106 +56,6 @@ std::set ParseSafeEagerDeletionSkipVarsSet( } // namespace details -class ExecutorInfo { - public: - struct CacheValue { - std::shared_ptr executor_{nullptr}; - std::shared_ptr graph_{nullptr}; - - std::vector skip_eager_delete_vars_; - }; - - bool IsAvailable(bool is_grad) { - const auto& executor = - is_grad ? backward_info_.executor_ : forward_info_.executor_; - return executor != nullptr; - } - - CacheValue& GetMutable(bool is_grad) { - return is_grad ? backward_info_ : forward_info_; - } - - private: - CacheValue forward_info_; - CacheValue backward_info_; -}; - -class ExecutorInfoCache { - public: - static ExecutorInfoCache& Instance(); - - const BuildStrategy& GetBuildStrategy(int64_t program_id) { - // If not found, insert build_strategy with default value. 
- return strategy_map_[program_id]; - } - - void SetBuildStrategy(int64_t program_id, - const BuildStrategy& build_strategy) { - PADDLE_ENFORCE_EQ( - strategy_map_.count(program_id), - 0, - platform::errors::PreconditionNotMet( - "program_id: %s already exist in ExecutorInfoCache", program_id)); - strategy_map_[program_id] = build_strategy; - } - - bool Has(int64_t program_id, bool is_grad) { - return info_map_.find(program_id) != info_map_.end() && - info_map_[program_id].IsAvailable(is_grad); - } - - ExecutorInfo::CacheValue& GetMutable(int64_t program_id, bool is_grad) { - return info_map_[program_id].GetMutable(is_grad); - } - - void UpdateSkipEagerDeleteVars(int64_t program_id, - bool is_grad, - const std::vector& skip_vars) { - auto& cached_value = GetMutable(program_id, is_grad); - cached_value.skip_eager_delete_vars_ = std::move(skip_vars); - } - - std::vector& SkipEagerDeleteVars(int64_t program_id, - bool is_grad) { - auto& cached_value = GetMutable(program_id, is_grad); - return cached_value.skip_eager_delete_vars_; - } - - size_t Size() const { return info_map_.size(); } - - void Finalize() { - // NOTE(Aurelius84): DO NOT perform finalize in destructor - // to avoid problems caused by destructor order of static - // object. - info_map_.clear(); - strategy_map_.clear(); - } - - private: - std::unordered_map info_map_; - std::unordered_map strategy_map_; -}; - -using CacheInfo = - std::pair, bool /*is_new_created*/>; - -using PEAndGraphPair = - std::pair, std::shared_ptr>; - -CacheInfo GetExecutorInfoFromCache(const ProgramDesc& program_desc, - const platform::Place& place, - int64_t start_op_index, - int64_t end_op_index, - bool is_grad, - int64_t program_id, - framework::Scope* scope); - -PEAndGraphPair CreateFixOrderExecutorInfo(const ProgramDesc& program_desc, - const platform::Place& place, - int64_t start_op_index, - int64_t end_op_index, - framework::Scope* scope); - int64_t hash_with_seed(int64_t value, int64_t seed); class InterpreterCoreInfo { diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index 4b830e7b05e55..f97a78fd156ac 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -67,7 +67,7 @@ class StatisticsEngine { : evt_idx(idx), start_ns(start), end_ns(end) {} }; - enum class ExecutorType { EXECUTOR, PARALLEL_EXECUTOR, INTERPRETER_CORE }; + enum class ExecutorType { EXECUTOR, INTERPRETER_CORE }; using Filter = std::function; @@ -83,8 +83,6 @@ class StatisticsEngine { int InitFiltersForExecutor(); - int InitFiltersForParallelExecutor(); - int InitFiltersForInterpreterCore(); int RegisterEventFilter(const std::string& std_event, Filter filter) { @@ -154,10 +152,6 @@ int StatisticsEngine::Init(const platform::NodeTrees& trees) { VLOG(10) << "type: Executor"; executor_type_ = ExecutorType::EXECUTOR; return InitFiltersForExecutor(); - } else if (name.find("ParallelExecutor::") == 0) { - VLOG(10) << "type: ParallelExecutor"; - executor_type_ = ExecutorType::PARALLEL_EXECUTOR; - return InitFiltersForParallelExecutor(); } else if (name.find("StandaloneExecutor::") == 0) { VLOG(10) << "type: InterpreterCore"; executor_type_ = ExecutorType::INTERPRETER_CORE; @@ -295,57 +289,6 @@ int StatisticsEngine::InitFiltersForExecutor() { }); } -int StatisticsEngine::InitFiltersForParallelExecutor() { - return RegisterEventFilter("Total", - [](const platform::HostTraceEventNode& evt) { - return 
evt.Name().find("ProfileStep") == 0; - }) || - RegisterEventFilter("CplusplusEnd", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "ParallelExecutor::Run"; - }) || - RegisterEventFilter("RunOp", - [](const platform::HostTraceEventNode& evt) { - return evt.Type() == - platform::TracerEventType::Operator; - }) || - RegisterEventFilter( - "OpCompute", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "compute" && - evt.Type() == platform::TracerEventType::OperatorInner; - }) || - RegisterEventFilter( - "OpInfershape", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "infer_shape" && - evt.Type() == platform::TracerEventType::OperatorInner; - }) || - RegisterEventFilter("GarbageCollect", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "eager_deletion" || - evt.Name() == "CheckGC"; - }) || - RegisterEventFilter("AllocateDeviceMem", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == alloc_device_mem; - }) || - RegisterEventFilter("FreeDeviceMem", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == free_device_mem; - }) || - RegisterEventFilter( - "DataTransform", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "prepare_data" && - evt.Type() == platform::TracerEventType::OperatorInner; - }) || - RegisterEventFilter("ThreadpoolAddTask", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "WorkQueue::AddTask"; - }); -} - int StatisticsEngine::InitFiltersForInterpreterCore() { return RegisterEventFilter("Total", [](const platform::HostTraceEventNode& evt) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc deleted file mode 100644 index 3b6206284e9c6..0000000000000 --- a/paddle/fluid/framework/parallel_executor.cc +++ /dev/null @@ -1,1929 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/parallel_executor.h" - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" -#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" -#include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/event.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_device_guard.h" -#endif -#include "paddle/fluid/platform/flags.h" - -COMMON_DECLARE_double(eager_delete_tensor_gb); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -COMMON_DECLARE_bool(sync_nccl_allreduce); -#endif - -#ifdef WITH_GPERFTOOLS -#include "gperftools/profiler.h" -#endif -PADDLE_DEFINE_EXPORTED_string( - pe_profile_fname, - "", - "Profiler filename for PE, which generated by gperftools." - "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable."); - -namespace paddle { -namespace framework { - -static std::once_flag gProfileOnce; -#ifdef WITH_GPERFTOOLS -static bool gProfileStarted = false; -#endif - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -std::once_flag p2p_init_flag_pe; -#endif - -class ParallelExecutorPrivate { - public: - ParallelExecutorPrivate(const std::vector &places, - Scope *global_scope) - : places_(places), - local_scopes_(), - local_exec_scopes_(), - global_scope_(global_scope), - executor_(nullptr), - is_persistable_(), - own_local_scope_(false), - use_device_(DeviceType::CPU), - use_all_reduce_(false), - nranks_(0), - mem_opt_var_infos_(), - gcs_() { - if (!FLAGS_pe_profile_fname.empty()) { - std::call_once(gProfileOnce, [] { -#ifdef WITH_GPERFTOOLS - ProfilerStart(FLAGS_pe_profile_fname.c_str()); - gProfileStarted = true; -#else - LOG(WARNING) << "Paddle is not compiled with gperftools. " - "FLAGS_pe_profile_fname will be ignored"; -#endif - }); - } - } - - ~ParallelExecutorPrivate() { - if (own_local_scope_) { - for (size_t i = 1; i < local_scopes_.size(); ++i) { - // Skip the first scope, since it is the global scope. 
- Scope *local_scope = local_scopes_[i]; - if (global_scope_->HasKid(local_scope)) { - global_scope_->DeleteScope(local_scope); - } - } - } - } - - bool IsUseCUDA(DeviceType use_device); - - void SetHasFeed(size_t dev_idx, bool has_feed = true); - - bool AllowPartialFeed() const; - - ir::Graph *ApplyMemoryOptimizePass(ir::Graph *graph); - - inline bool HasGarbageCollectors() const { return !gcs_.empty(); } - - void ApplyFixOpRunOrderPass(ir::Graph *graph) { - if (build_strategy_.fix_op_run_order_) { - auto pass = ir::PassRegistry::Instance().Get("fix_op_run_order_pass"); - pass->Apply(graph); - } - } - - /** - * NOTE(zengjinle): the fed variables of users should not be reused, - * because users may feed them into another network. Changing the fed - * variables that users can visit may cause calculation wrong, which is - * a very subtle bug when training networks. However, these variables - * can be garbage collected. - * - * ParallelExecutor provides 2 methods to feed variables: - * - * - FeedTensorsIntoLocalScopes: this method would share memory of fed - * variables, so we have to skip these. - * - * - FeedAndSplitTensorIntoLocalScopes: this method would copy data of fed - * variables, so we do not need to skip - * them. - */ - inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) { - if (mem_opt_var_infos_.empty()) { - VLOG(4) << "The mem_opt_var_infos_ is empty, maybe no memory " - "optimization strategy is enabled"; - return; - } - auto iter = mem_opt_var_infos_[scope_idx].find(name); - if (iter != mem_opt_var_infos_[scope_idx].end()) { - iter->second->SetSkipMemoryReuse(true); - } - } - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { - VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_ - << ", num_trainers:" << bst.num_trainers_ - << ", trainer_id:" << bst.trainer_id_; - - if (bst.use_hierarchical_allreduce_) { - VLOG(1) << ", use_hierarchical_allreduce:" - << bst.use_hierarchical_allreduce_ << ", inter_trainers_num:" - << bst.hierarchical_allreduce_inter_nranks_ - << ", exter_trainers_num:" - << bst.hierarchical_allreduce_exter_nranks_; - } - - std::vector flat_nccl_ids; - if (nranks_ == 1) { - // FIXME(gongwb): need not to create ncclid when nranks==1 - nccl_ctxs_->InitFlatCtxs( - places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); - return; - } - - if (bst.enable_parallel_graph_) { - VLOG(1) << "use only one ncclid in pg model"; - - ncclUniqueId *nccl_id = nullptr; - - std::string var_name = platform::GetFlatNCCLVarName(0); - auto nccl_id_var = scope->FindVar(var_name); - if (nccl_id_var) { - nccl_id = nccl_id_var->GetMutable(); - VLOG(10) << "find nccl_id_var:" << var_name << ", nccl_id:" << nccl_id; - } else { - nccl_id = new ncclUniqueId(); - PADDLE_ENFORCE_EQ( - platform::dynload::ncclGetUniqueId(nccl_id), - ncclSuccess, - platform::errors::PreconditionNotMet( - "PaddlePaddle failed to get NCCL unique ID. 
It may due to your " - "system settings or NCCL library error, please debug on NCCL")); - VLOG(10) << "can't find nccl_id_var:" << var_name - << ", nccl_id:" << nccl_id; - } - - flat_nccl_ids.push_back(nccl_id); - - nccl_ctxs_->InitFlatCtxs( - places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); - VLOG(1) << "init bst nccl context complete!"; - return; - } - - // num_trainers ==1 && places > 1 - if (bst.num_trainers_ == 1) { - nccl_ctxs_->InitFlatCtxs( - places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); - return; - } - - for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { - std::string var_name = platform::GetFlatNCCLVarName(i); - auto nccl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - nccl_id_var, - platform::errors::NotFound("Can't find nccl_id_var '%s'.", var_name)); - auto nccl_id = nccl_id_var->GetMutable(); - flat_nccl_ids.push_back(nccl_id); - } - - nccl_ctxs_->InitFlatCtxs( - places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); - - if (bst.use_hierarchical_allreduce_) { - std::vector inter_nccl_ids; - for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { - std::string var_name = platform::GetHierarchicalInterNCCLVarName(i); - auto nccl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(nccl_id_var, - platform::errors::NotFound( - "Can't find nccl_id_var '%s'.", var_name)); - auto inter_nccl_id = nccl_id_var->GetMutable(); - inter_nccl_ids.push_back(inter_nccl_id); - } - - std::vector exter_nccl_ids; - for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { - std::string var_name = platform::GetHierarchicalExterNCCLVarName(i); - auto nccl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(nccl_id_var, - platform::errors::NotFound( - "Can't find nccl_id_var '%s'.", var_name)); - auto nccl_id = nccl_id_var->GetMutable(); - exter_nccl_ids.push_back(nccl_id); - } - - nccl_ctxs_->InitHierarchicalCtxs( - places_, - inter_nccl_ids, - exter_nccl_ids, - bst.num_trainers_, - bst.trainer_id_, - bst.hierarchical_allreduce_inter_nranks_, - bst.hierarchical_allreduce_exter_nranks_); - } - } - - void InitOrGetNCCLCommunicator(framework::Scope *scope, BuildStrategy *bst) { - const std::string var_name = "NCCLCommunicator"; - auto var = scope->FindVar(var_name); - if (var != nullptr) { - PADDLE_ENFORCE_EQ(var->IsInitialized(), - true, - platform::errors::PreconditionNotMet( - "if %s exists, it must be initialized", var_name)); - VLOG(1) << "find " << var_name - << " in scope, so use it and does not recreate!"; - nccl_ctxs_ = var->GetMutable(); - return; - } - - if (bst->use_hierarchical_allreduce_) { - PADDLE_ENFORCE_GT( - bst->num_trainers_, - 1, - platform::errors::PreconditionNotMet( - "The num_trainers should be greater than 1, but received %llu.", - bst->num_trainers_)); - PADDLE_ENFORCE_GT( - bst->hierarchical_allreduce_inter_nranks_, - 1, - platform::errors::PreconditionNotMet( - "The inter_nranks should be greater than 1, but received %d.", - bst->hierarchical_allreduce_inter_nranks_)); - PADDLE_ENFORCE_EQ( - bst->num_trainers_ % bst->hierarchical_allreduce_inter_nranks_, - 0, - platform::errors::PreconditionNotMet( - "num_trainers:%llu mod inter_nranks:%d != 0", - bst->num_trainers_, - bst->hierarchical_allreduce_inter_nranks_)); - - bst->hierarchical_allreduce_exter_nranks_ = - bst->num_trainers_ / bst->hierarchical_allreduce_inter_nranks_; - } - - VLOG(1) << "not find " << var_name << " in scope, so recreate it!"; - nccl_ctxs_ = scope->Var(var_name)->GetMutable(); - InitNCCLCtxs(scope, *bst); - } -#endif - 
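The hierarchical allreduce setup deleted above derives the number of outer (inter-node) groups from the trainer count and the inner group size: num_trainers must be greater than 1, inter_nranks must be greater than 1, num_trainers must be divisible by inter_nranks, and exter_nranks = num_trainers / inter_nranks. A minimal, self-contained sketch of that arithmetic and its validation follows; the helper name and the use of exceptions are illustrative assumptions, not Paddle APIs.

#include <cstdint>
#include <stdexcept>

// Sketch of the nranks arithmetic enforced by the removed
// InitOrGetNCCLCommunicator: the outer (exterior) group size is the
// trainer count divided by the inner (interior) group size.
// (Hypothetical helper for illustration only.)
inline std::uint64_t ComputeExterNranks(std::uint64_t num_trainers,
                                        int inter_nranks) {
  if (num_trainers <= 1) {
    throw std::invalid_argument("num_trainers must be greater than 1");
  }
  if (inter_nranks <= 1) {
    throw std::invalid_argument("inter_nranks must be greater than 1");
  }
  if (num_trainers % static_cast<std::uint64_t>(inter_nranks) != 0) {
    throw std::invalid_argument(
        "num_trainers must be divisible by inter_nranks");
  }
  return num_trainers / static_cast<std::uint64_t>(inter_nranks);
}

// Example: 16 trainers with 4 ranks per inner group yields 4 outer groups,
// i.e. ComputeExterNranks(16, 4) == 4.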
-#if defined(PADDLE_WITH_XPU_BKCL) - void InitBKCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { - VLOG(1) << "bkcl comm num:" << bst.bkcl_comm_num_ << ", nranks:" << nranks_ - << ", num_trainers:" << bst.num_trainers_ - << ", trainer_id:" << bst.trainer_id_; - - PADDLE_ENFORCE_EQ(bst.use_hierarchical_allreduce_, - false, - platform::errors::Unimplemented( - "xpu doesn't support use_hierarchical_allreduce")); - - std::vector flat_bkcl_ids; - if (nranks_ == 1) { - // FIXME(gongwb): need not to create bkclid when nranks==1 - bkcl_ctxs_->InitFlatCtxs( - places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); - return; - } - - if (bst.enable_parallel_graph_) { - VLOG(1) << "use only one bkclid in pg model"; - - BKCLUniqueId *bkcl_id = nullptr; - - std::string var_name = platform::GetFlatBKCLVarName(0); - auto bkcl_id_var = scope->FindVar(var_name); - std::unique_ptr id(new BKCLUniqueId()); - if (bkcl_id_var) { - bkcl_id = bkcl_id_var->GetMutable(); - } else { - PADDLE_ENFORCE_EQ( - bkcl_get_unique_id(id.get()), - BKCL_SUCCESS, - platform::errors::Unavailable("bkcl get unique id failed")); - bkcl_id = id.get(); - } - - flat_bkcl_ids.push_back(bkcl_id); - - bkcl_ctxs_->InitFlatCtxs( - places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); - VLOG(1) << "init bst bkcl context complete!"; - return; - } - - // num_trainers ==1 && places > 1 - if (bst.num_trainers_ == 1) { - bkcl_ctxs_->InitFlatCtxs( - places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); - return; - } - - for (int i = 0; i < static_cast(bst.bkcl_comm_num_); i++) { - std::string var_name = platform::GetFlatBKCLVarName(i); - auto bkcl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - bkcl_id_var, - platform::errors::NotFound("can't find %s bkcl_id_var", var_name)); - auto bkcl_id = bkcl_id_var->GetMutable(); - flat_bkcl_ids.push_back(bkcl_id); - } - - bkcl_ctxs_->InitFlatCtxs( - places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); - } - - void InitOrGetBKCLCommunicator(framework::Scope *scope, - const BuildStrategy &bst) { - const std::string var_name = "BKCLCommunicator"; - auto var = scope->FindVar(var_name); - if (var != nullptr) { - PADDLE_ENFORCE_EQ(var->IsInitialized(), - true, - platform::errors::PreconditionNotMet( - "if %s exists, it must be initialized", var_name)); - VLOG(1) << "find " << var_name - << " in scope, so use it and does not recreate!"; - bkcl_ctxs_ = var->GetMutable(); - return; - } - - VLOG(1) << "not find " << var_name << " in scope, so recreate it!"; - bkcl_ctxs_ = scope->Var(var_name)->GetMutable(); - InitBKCLCtxs(scope, bst); - } -#endif - - inline bool IsPersistable(const std::string &name) const { - auto iter = is_persistable_.find(name); - return iter != is_persistable_.end() && iter->second; - } - - BuildStrategy build_strategy_; - std::vector places_; - std::vector local_scopes_; - std::vector local_exec_scopes_; - Scope *global_scope_; // not owned - std::unique_ptr executor_; - - std::unordered_map is_persistable_; - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - platform::NCCLCommunicator *nccl_ctxs_{nullptr}; -#elif defined(PADDLE_WITH_XPU_BKCL) - platform::BKCLCommunicator *bkcl_ctxs_{nullptr}; -#endif - bool own_local_scope_; - DeviceType use_device_; - bool use_all_reduce_; - size_t nranks_; - - ir::MemOptVarInfoMapList mem_opt_var_infos_; - ir::GarbageCollectorMap gcs_; - - details::ParallelSSAGraphExecutor *inference_executor_{nullptr}; -}; - -bool ParallelExecutorPrivate::IsUseCUDA(DeviceType use_device) { - return use_device 
== p::kCUDA; -} - -void ParallelExecutorPrivate::SetHasFeed(size_t dev_idx, bool has_feed) { - if (inference_executor_) { - inference_executor_->SetHasFeed(dev_idx, has_feed); - } -} - -bool ParallelExecutorPrivate::AllowPartialFeed() const { - return inference_executor_ && inference_executor_->SupportPartialFeed(); -} - -ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { - /** - * NOTE(zengjinle): If BuildStrategy.memory_optimize = None in Python, - * set BuildStrategy.memory_optimize according to whether gc is enabled. - * If gc is enabled, BuildStrategy.memory_optimize = False. - * If gc is disabled, BuildStrategy.memory_optimize = True. - * This is because gc+memory_optimize is worse than gc only. - * - * As an option, users can enable BuildStrategy.memory_optimize forcely - * by setting True, and disable it forcely by setting False. - */ - bool is_gc_enabled = (GetEagerDeletionThreshold() >= 0); - if (!build_strategy_.memory_optimize_) { - build_strategy_.memory_optimize_ = !is_gc_enabled; - } - - bool need_mem_opt = build_strategy_.enable_inplace_ || - build_strategy_.enable_addto_ || - build_strategy_.memory_optimize_.get() || is_gc_enabled; - - if (!need_mem_opt) return graph; - - std::vector last_live_ops_of_vars; - - auto ref_cnt_pass = ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); - ref_cnt_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); - graph = ref_cnt_pass->Apply(graph); - VLOG(10) << "ReferenceCountPass Applied"; - - if (build_strategy_.enable_addto_) { - auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass"); - addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); - addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); - addto_pass->Set(ir::kUseCuda, new bool(use_device_ == p::kCUDA)); - VLOG(10) << "Start to apply inplace_addto_op_pass"; - graph = addto_pass->Apply(graph); - VLOG(10) << "inplace_addto_op_pass Applied"; - } - - if (build_strategy_.enable_inplace_) { - auto inplace_pass = - ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass"); - inplace_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); - inplace_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); - inplace_pass->Set(ir::kUseCuda, new bool(use_device_ == p::kCUDA)); - VLOG(10) << "Start to apply buffer_shared_inplace_pass"; - graph = inplace_pass->Apply(graph); - VLOG(10) << "buffer_shared_inplace_pass Applied"; - VLOG(1) << "Inplace strategy is enabled, when " - "build_strategy.enable_inplace = True"; - } - - if (build_strategy_.memory_optimize_.get()) { - auto cross_op_memory_reuse_pass = ir::PassRegistry::Instance().Get( - "buffer_shared_cross_op_memory_reuse_pass"); - cross_op_memory_reuse_pass->SetNotOwned(ir::kMemOptVarInfoMapList, - &mem_opt_var_infos_); - cross_op_memory_reuse_pass->SetNotOwned(ir::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - cross_op_memory_reuse_pass->Set(ir::kUseCuda, - new bool(use_device_ == p::kCUDA)); - VLOG(10) << "Start to apply buffer_shared_cross_op_memory_reuse_pass"; - graph = cross_op_memory_reuse_pass->Apply(graph); - VLOG(10) << "buffer_shared_cross_op_memory_reuse_pass Applied"; - LOG(INFO) << "Cross op memory reuse strategy is enabled, when " - "build_strategy.memory_optimize = True or garbage collection " - "strategy is disabled, which is not recommended"; - } - - if (!is_gc_enabled) { - return graph; - } - size_t 
max_memory_size = static_cast(GetEagerDeletionThreshold()); - - for (size_t i = 0; i < places_.size(); ++i) { - auto &place = places_[i]; - if (gcs_.count(place) > 0) { - continue; - } - std::unique_ptr gc; - if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (IsFastEagerDeletionModeEnabled()) { - gc = std::make_unique(place, - max_memory_size); - } else { - gc = std::make_unique(place, max_memory_size); - } - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use CUDA device since it's not compiled with CUDA," - "Please recompile or reinstall Paddle with GPU support.")); -#endif - } else if (platform::is_xpu_place(place)) { -#if defined(PADDLE_WITH_XPU) - gc = std::make_unique(place, max_memory_size); - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else if (platform::is_ipu_place(place)) { -#if defined(PADDLE_WITH_IPU) - gc = std::make_unique(place, max_memory_size); - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use IPU device since it's not compiled with IPU," - "Please recompile or reinstall Paddle with IPU support.")); -#endif - } else if (platform::is_custom_place(place)) { -#if defined(PADDLE_WITH_CUSTOM_DEVICE) - if (IsFastEagerDeletionModeEnabled()) { - gc = std::make_unique( - place, max_memory_size); - } else { - gc = std::make_unique(place, - max_memory_size); - } - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use custom device since it's not compiled with " - "CustomDevice," - "Please recompile or reinstall Paddle with CustomDevice support.")); -#endif - } else if (platform::is_cpu_place(place)) { - gc = std::make_unique(place, max_memory_size); - VLOG(10) << "Created GarbageCollector at " << place; - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Unsupported place for garbage collection")); - } - gcs_.emplace(place, std::move(gc)); - } - - if (!gcs_.empty()) { - auto eager_deletion_pass = - ir::PassRegistry::Instance().Get("eager_deletion_pass"); - eager_deletion_pass->SetNotOwned(ir::kMemOptVarInfoMapList, - &mem_opt_var_infos_); - eager_deletion_pass->SetNotOwned(ir::kGarbageCollector, &gcs_); - eager_deletion_pass->SetNotOwned(ir::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - eager_deletion_pass->SetNotOwned(ir::kAllPlaces, &places_); - graph = eager_deletion_pass->Apply(graph); - VLOG(10) << "EagerDeletionPass Applied"; - VLOG(1) << "Garbage collection strategy is enabled, when " - << "FLAGS_eager_delete_tensor_gb = " - << FLAGS_eager_delete_tensor_gb; - } - return graph; -} - -class ResetHasFeedGuard { - public: - explicit ResetHasFeedGuard(ParallelExecutorPrivate *pe_member) - : pe_member_(pe_member) {} - - ~ResetHasFeedGuard() { - for (size_t i = 0; i < pe_member_->places_.size(); ++i) { - pe_member_->SetHasFeed(i, false); - } - } - - private: - ParallelExecutorPrivate *pe_member_; -}; - -size_t ParallelExecutor::DeviceCount() const { return member_->places_.size(); } - -std::vector &ParallelExecutor::GetLocalScopes() { - return member_->local_scopes_; -} - -void ParallelExecutor::DropLocalExeScopes() { 
- auto executor = dynamic_cast( - member_->executor_.get()); - if (executor) { - executor->DropLocalExeScopes(); - } -} - -bool ParallelExecutor::NeedCreateLocalExeScope() { - auto executor = dynamic_cast( - member_->executor_.get()); - return executor && executor->NeedCreateLocalExeScope(); -} - -void InitP2PInPE(const std::vector &places) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::call_once(p2p_init_flag_pe, [&]() { - int count = places.size(); - if (count <= 1) return; - - std::vector devices; - for (int i = 0; i < count; i++) { - if (!platform::is_gpu_place(places[i])) return; - - platform::CUDAPlace device = places[i]; - devices.push_back(device.GetDeviceId()); - } - - for (int i = 0; i < count; ++i) { - for (int j = 0; j < count; ++j) { - if (devices[i] == devices[j]) continue; - int can_access = -1; -#ifdef PADDLE_WITH_HIP - hipError_t ret = - hipDeviceCanAccessPeer(&can_access, devices[i], devices[j]); - if (ret != hipSuccess || can_access != 1) { -#else - cudaError_t ret = - cudaDeviceCanAccessPeer(&can_access, devices[i], devices[j]); - if (ret != cudaSuccess || can_access != 1) { -#endif - LOG(WARNING) << "Cannot enable P2P access from " << devices[i] - << " to " << devices[j]; - } else { - platform::CUDADeviceGuard guard(devices[i]); -#ifdef PADDLE_WITH_HIP - hipDeviceEnablePeerAccess(devices[j], 0); -#else - cudaDeviceEnablePeerAccess(devices[j], 0); -#endif - } - } - } - VLOG(1) << "init p2p"; - }); -#endif -} - -ParallelExecutor::ParallelExecutor(const std::vector &places, - const std::vector &bcast_vars, - const std::string &loss_var_name, - Scope *scope, - const std::vector &local_scopes, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - ir::Graph *graph) - : member_(new ParallelExecutorPrivate(places, scope)), - async_graphs_(), - var_infos_() { - PADDLE_ENFORCE_EQ(!places.empty(), - true, - platform::errors::Unavailable( - "NPU is not supported in ParallelExecutor.")); - InitP2PInPE(places); - ir::InitReaderQueueDeviceCount( - graph, *(member_->global_scope_), member_->places_.size()); - // Initialize necessary info of member_ with strategy. - InitExecutorPrivateMemberInfo( - exec_strategy, build_strategy, places.size(), *graph); - - // Step 1. Create local scopes and Clone graph into multi device - CreateLocalScopes(scope, local_scopes, /*create_new*/ true); - std::vector graphs = CloneGraphToMultiDevices(graph); - PrepareNCCLCommunicator(scope); - - // broadcast parameters from the 0th device to others: - auto need_broadcast = [&]() -> bool { - if (member_->build_strategy_.num_trainers_ > 1) { // NOLINT - // 1. num_tariners would be grater than 1 for nccl distributed training. - return true; - } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { - // 2. Only one trainer process, but ParallelExecutor hold multiple - // devices. - return true; - } - return false; - }; - if (need_broadcast()) { - BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_); - } - - // Step 2. Convert main_program to SSA form and dependency graph. Also, insert - // ncclOp - std::vector async_graphs = - CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name); - PrepareForCUDAGraphCapture(graph); - graph = member_->ApplyMemoryOptimizePass(graph); - async_graphs[0] = graph; - - // Step 3. Create vars in each scope. Passes may also create new vars. 
- // skip control vars and empty vars - // std::vector var_infos; - // CreateVariableInfos(&var_infos, graph); - // std::unordered_map scope_map = - // CreateLocalExecScopes(member_->local_scopes_, /*create_new*/ true); - - // Step 4. Create SSAGraph executor - /* std::vector final_graphs = - CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - - VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - if (!member_->build_strategy_.async_mode_) { - member_->executor_ = - std::make_unique( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - std::move(var_infos), - member_->places_, - std::move(member_->executor_)); - } - - ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); - SetReaderOpDeviceInfoOfGraphs(final_graphs); */ -} - -ParallelExecutor::ParallelExecutor(const platform::Place &place, - Scope *scope, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - ir::Graph *graph) - : member_(new ParallelExecutorPrivate({place}, scope)), - async_graphs_(), - var_infos_() { - // Initialize necessary info of member_ with strategy. - InitExecutorPrivateMemberInfo(exec_strategy, - build_strategy, - /*device_count=*/1, - *graph); - - CreateLocalScopes(scope, /*local_scopes=*/{scope}, /*create_new=*/false); - - // Apply BuildStrategy to compile graph. - std::vector graphs = {graph}; - std::vector async_graphs = - CompileGraphWithBuildStrategy(graph, &graphs, /*loss_var_name=*/""); - - graph = member_->ApplyMemoryOptimizePass(graph); - - // Create vars in each scope. Passes may also create new vars. - // skip control vars and empty vars - CreateVariableInfos(&var_infos_, graph); - - // Create local execution scopes - std::unordered_map scope_map = - CreateLocalExecScopes(member_->local_scopes_, /*create_new=*/false); - - std::vector final_graphs = - CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - - // Set scope_map of op from each graph - ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); -} - -void ParallelExecutor::PrepareVariables(Scope *scope) { - for (auto &info : var_infos_) { - auto var = scope->FindVar(info.name_); - if (var != nullptr) { - VLOG(2) << info.name_ - << " has been initialized beforehand in global scope, skipped."; - continue; - } - framework::InitializeVariable(scope->Var(info.name_), info.type_); - } -} - -void ParallelExecutor::BCastParamsToDevices( - const std::vector &vars, int trainer_id) const { - VLOG(3) << "BCastParamsToDevices"; - // the initializing bcast, all vars would be bcast from device(0). 
- for (auto &var : vars) { - framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); - if (main_var == nullptr || !main_var->IsType()) { - continue; - } - - auto &main_tensor = main_var->Get(); - if (!main_tensor.IsInitialized()) { - VLOG(3) << "one in var not inited, return!"; - continue; - } - auto &dims = main_tensor.dims(); - if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - auto dtype = framework::TransToProtoVarType(main_tensor.dtype()); - ncclDataType_t data_type = platform::ToNCCLDataType(dtype); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.dtype()); - } - buffers.push_back(buffer); - } - - PADDLE_ENFORCE_EQ(member_->places_.size(), - buffers.size(), - platform::errors::PreconditionNotMet( - "variables' buffer size to bcast is %d, which is " - "NOT equal to places size %d", - buffers.size(), - member_->places_.size())); - if (member_->nccl_ctxs_ != nullptr) { - auto *nccl_ctxs = member_->nccl_ctxs_->DefaultFlatCtx(); - platform::NCCLGroupGuard guard; - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]); - platform::dynload::ncclBcast(buffers[i], - numel, - data_type, - 0, - nccl_ctx.comm_, - nccl_ctx.stream()); - } - nccl_ctxs->WaitAll(); - } else { - auto src_place = member_->places_[0]; - auto src_dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(src_place)); - auto sizeof_dtype = framework::SizeOfType(dtype) * numel; - for (size_t i = 1; i < member_->places_.size(); ++i) { - auto dst_place = member_->places_[i]; - auto dst_dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(dst_place)); - src_dev_ctx->Wait(); - dst_dev_ctx->Wait(); - memory::Copy(dst_place, - buffers[i], - src_place, - buffers[0], - sizeof_dtype, - src_dev_ctx->stream()); - src_dev_ctx->Wait(); - dst_dev_ctx->Wait(); - } - } -#endif - } else if (paddle::platform::is_xpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_XPU_BKCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - auto dtype = framework::TransToProtoVarType(main_tensor.dtype()); - BKCLDataType data_type = platform::ToBKCLDataType(dtype); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.dtype()); - } - buffers.push_back(buffer); - } - - PADDLE_ENFORCE_EQ(member_->places_.size(), - buffers.size(), - platform::errors::PreconditionNotMet( - "variables' buffer size to bcast is %d, which is " - "NOT equal to places size %d", - buffers.size(), - member_->places_.size())); - { - auto *bkcl_ctxs = member_->bkcl_ctxs_->DefaultFlatCtx(); - platform::BKCLGroupGuard guard; - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[i]); - 
PADDLE_ENFORCE_EQ( - bkcl_broadcast(bkcl_ctx.comm(), - buffers[i], - buffers[i], - numel, - data_type, - 0, - NULL), - BKCL_SUCCESS, - platform::errors::Unavailable("bkcl_broadcast failed")); - } - bkcl_ctxs->WaitAll(); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with BKCL.")); -#endif - } else { - platform::CPUPlace cpu; - for (size_t i = 1; i < member_->places_.size(); ++i) { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - - auto copy_memory = [&] { - t->Resize(dims); - t->mutable_data(cpu, main_tensor.dtype()); - paddle::framework::TensorCopy(main_tensor, cpu, t); - }; - - auto share_memory = [&] { t->ShareDataWith(main_tensor); }; - - // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. - if (member_->use_all_reduce_ || - member_->IsUseCUDA(member_->use_device_) || - var == "@LR_DECAY_COUNTER@") { - copy_memory(); - } else { - share_memory(); - } - } - } - } -} - -FetchUnmergedList ParallelExecutor::Run( - const std::vector &fetch_tensors) { - LOG_FIRST_N(INFO, 1) << "ParallelExecutor is Running (Run)."; - PreludeToRun(fetch_tensors); - platform::RecordBlock b(0); - - ResetHasFeedGuard reset_has_feed_guard(member_); - - ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), - fetch_tensors, - member_->HasGarbageCollectors()); - - VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; - auto fetch_data = - member_->executor_->Run(fetch_tensors, /*return_merged=*/false); - return PADDLE_GET(FetchUnmergedList, fetch_data); -} - -FetchList ParallelExecutor::RunAndMerge( - const std::vector &fetch_tensors) { - LOG_FIRST_N(INFO, 1) << "ParallelExecutor is Running (RunAndMerge)."; - PreludeToRun(fetch_tensors); - platform::RecordBlock b(0); - - ResetHasFeedGuard reset_has_feed_guard(member_); - - ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), - fetch_tensors, - member_->HasGarbageCollectors()); - - VLOG(3) << "ParallelExecutor begin to run member_->executor_->RunAndMerge"; - auto fetch_data = - member_->executor_->Run(fetch_tensors, /*return_merged=*/true); - return PADDLE_GET(FetchList, fetch_data); -} - -void ParallelExecutor::RunWithoutFetch( - const std::vector &skip_eager_vars) { - VLOG(3) << "enter ParallelExecutor RunWithoutFetch"; -#ifdef WITH_GPERFTOOLS - if (gProfileStarted) { - ProfilerFlush(); - } -#endif - platform::RecordBlock b(0); - - ResetHasFeedGuard reset_has_feed_guard(member_); - - ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), - skip_eager_vars, - member_->HasGarbageCollectors()); - - VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; - member_->executor_->Run(/*fetch_tensors*/ {}, /*return_merged*/ false); -} - -void ParallelExecutor::SkipMemoryReuse( - size_t scope_idx, const std::vector &skip_vars) { - for (auto &var_name : skip_vars) { - bool is_persistable = member_->IsPersistable(var_name); - if (!is_persistable) { - VLOG(3) << "SkipMemoryReuse for var: " << var_name; - member_->SetSkipMemoryReuse(scope_idx, var_name); - } - } -} - -void ParallelExecutor::FeedTensorsIntoLocalScopes( - const std::vector> - &tensors) { - if (platform::IsCUDAGraphCapturing()) { - for (auto &tensor : tensors) { - PADDLE_ENFORCE_EQ( - tensor.empty(), - true, - platform::errors::PermissionDenied( - "Feeding data is not permitted when capturing CUDA Graph.")); - } - return; - } - - if (!member_->AllowPartialFeed()) { - PADDLE_ENFORCE_EQ(tensors.size(), - member_->local_scopes_.size(), - 
platform::errors::Unimplemented( - "The feed data number %d does not match the device " - "number %d. If you are using DataLoader to feed " - "data, this may be because you set drop_last=False " - "in training network. Currently, drop_last=False for " - "DataLoader is not supported for training network. " - "Please set drop_last=True when defining DataLoader.", - tensors.size(), - member_->local_scopes_.size())); - } else { - PADDLE_ENFORCE_GE(member_->local_scopes_.size(), - tensors.size(), - platform::errors::InvalidArgument( - "The feed tensor number exceeds the device number")); - } - - size_t feed_num = 0; - for (size_t i = 0; i < tensors.size(); ++i) { - auto &map = tensors[i]; - if (map.empty()) { - continue; - } - - member_->SetHasFeed(i); - ++feed_num; - for (auto &pair : map) { - bool is_persistable = member_->IsPersistable(pair.first); - if (!is_persistable) { - member_->SetSkipMemoryReuse(i, pair.first); - } - auto *feed_scope = is_persistable ? member_->local_scopes_[i] - : member_->local_exec_scopes_[i]; - auto *feed_var = feed_scope->Var(pair.first); - - auto *trg = feed_var->GetMutable(); - trg->ShareDataWith(pair.second); - trg->set_lod(pair.second.lod()); - } - } - - if (!member_->AllowPartialFeed()) { - PADDLE_ENFORCE_EQ(feed_num, - member_->local_scopes_.size(), - platform::errors::Unimplemented( - "The feed data number %d does not match the device " - "number %d. If you are using DataLoader to feed " - "data, this may be because you set drop_last=False " - "in training network. Currently, drop_last=False for " - "DataLoader is not supported for training network. " - "Please set drop_last=True when defining DataLoader.", - feed_num, - member_->local_scopes_.size())); - } -} - -void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( - const std::unordered_map &tensors) { - if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ( - tensors.empty(), - true, - platform::errors::PermissionDenied( - "Feeding data is not permitted when capturing CUDA Graph.")); - return; - } - - size_t num_places = member_->places_.size(); - bool allow_partial_feed = member_->AllowPartialFeed(); - - size_t persistable_feed_len = -1UL; - size_t non_persistable_feed_len = -1UL; - - for (auto &pair : tensors) { - bool is_persistable = member_->IsPersistable(pair.first); - VLOG(3) << "Split " << (is_persistable ? "persistable" : "no persistable") - << " data (" << pair.first << "), dim:" << pair.second.dims() - << ", place: " << pair.second.place(); - auto lod_tensors = SplitLoDTensor(pair.second, member_->places_); - bool is_cpu_place = platform::is_cpu_place(member_->places_.front()); - if (!is_persistable && num_places != lod_tensors.size() && - !allow_partial_feed) { - auto error_info = string::Sprintf( - "The number(%d) of samples[%s] of current batch is less than the " - "count(%d) of devices(%s), currently, it is not allowed. ", - lod_tensors.size(), - pair.first, - num_places, - (is_cpu_place ? 
"CPU" : "GPU")); - if (is_cpu_place) { - error_info += - "You should set the environment variable CPU_NUM in the system " - "to determine the number of devices you need."; - } - PADDLE_THROW(platform::errors::PreconditionNotMet(error_info)); - } else if (is_persistable) { - if (lod_tensors.size() == 1) { - lod_tensors.reserve(num_places); - auto &tensor = lod_tensors.front(); - PADDLE_ENFORCE_EQ( - tensor.dims(), - pair.second.dims(), - platform::errors::PreconditionNotMet("The dim doesn't match.")); - PADDLE_ENFORCE_EQ( - tensor.place(), - member_->places_.at(0), - platform::errors::PreconditionNotMet("The place doesn't match.")); - for (size_t i = 1; i < num_places; ++i) { - lod_tensors.emplace_back(); - auto &tmp = lod_tensors.back(); - framework::TensorCopy(pair.second, member_->places_.at(i), &tmp); - } - } - if (lod_tensors.size() != num_places && !allow_partial_feed) { - auto error_info = string::Sprintf( - "The number(%d) of samples[%s] of the current batch does not match " - "the count(%d) of devices(%s). Because that %s is a persistable " - "variable, you can feed just one sample, in that case, the input " - "sample will be copied in %d copies and be sent to different " - "places separately. If you need that different place has different " - "value, you should feed %d samples.", - lod_tensors.size(), - pair.first, - num_places, - (is_cpu_place ? "CPU" : "GPU"), - pair.first, - num_places, - num_places); - PADDLE_THROW(platform::errors::PreconditionNotMet(error_info)); - } - } - - if (allow_partial_feed) { - if (is_persistable) { - if (persistable_feed_len == -1UL) { - persistable_feed_len = lod_tensors.size(); - } else { - PADDLE_ENFORCE_EQ( - persistable_feed_len, - lod_tensors.size(), - platform::errors::InvalidArgument( - "The feeded number of different persistable variables " - "should be the same")); - } - } else { - if (non_persistable_feed_len == -1UL) { - non_persistable_feed_len = lod_tensors.size(); - } else { - PADDLE_ENFORCE_EQ( - non_persistable_feed_len, - lod_tensors.size(), - platform::errors::InvalidArgument( - "The feeded number of different non-persistable variables " - "should be the same")); - } - } - } - - for (size_t j = 0; j < lod_tensors.size(); ++j) { - auto *feed_scope = is_persistable ? 
member_->local_scopes_[j] - : member_->local_exec_scopes_[j]; - auto *feed_var = feed_scope->Var(pair.first); - - auto t = feed_var->GetMutable(); - t->ShareDataWith(lod_tensors[j]); - t->set_lod(lod_tensors[j].lod()); - } - } - - if (allow_partial_feed && persistable_feed_len != -1UL && - non_persistable_feed_len != -1UL) { - VLOG(10) << "Persistable len " << persistable_feed_len; - VLOG(10) << "Non persistable len " << non_persistable_feed_len; - PADDLE_ENFORCE_GE(persistable_feed_len, - non_persistable_feed_len, - platform::errors::InvalidArgument( - "The feeded number of persistable variables should " - "not be less than non-persistable variables")); - } - - if (non_persistable_feed_len != -1UL) { - for (size_t i = 0; i < non_persistable_feed_len; ++i) { - member_->SetHasFeed(i); - } - } -} - -ParallelExecutor::~ParallelExecutor() { - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - delete member_; -} - -bool ParallelExecutor::EnableParallelGraphExecution( - const ir::Graph &graph, const BuildStrategy &build_strategy) const { - return false; - - bool enable_parallel_graph = true; - - for (ir::Node *node : graph.Nodes()) { - if (node->IsVar() && node->Var()) { - // TODO(Yancey1989): support sparse update in ParallelGraph mode. - if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) { - enable_parallel_graph = false; - break; - } - } else if (node->IsOp() && node->Op()) { - // TODO(Yancey1989): support pserver mode - if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") { - enable_parallel_graph = false; - break; - } - } - } - - if (!member_->use_all_reduce_ || !member_->IsUseCUDA(member_->use_device_)) { - if (build_strategy.enable_sequential_execution_) { - enable_parallel_graph = false; - } - } - -#ifdef WIN32 - VLOG(1) << "Windows has no support to parallel graph, enable_parallel_graph " - "would be forced to false."; - enable_parallel_graph = false; -#endif - - return enable_parallel_graph; -} - -void ParallelExecutor::InitExecutorPrivateMemberInfo( - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - size_t device_count, - const ir::Graph &graph) { - member_->use_device_ = exec_strategy.use_device_; - member_->build_strategy_ = build_strategy; - member_->use_all_reduce_ = member_->build_strategy_.reduce_ == - BuildStrategy::ReduceStrategy::kAllReduce; - member_->nranks_ = build_strategy.num_trainers_ * device_count; - if (!member_->use_all_reduce_ && member_->nranks_ == 1) { - LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," - "the number of places should be greater than 1."; - member_->build_strategy_.reduce_ = - BuildStrategy::ReduceStrategy::kAllReduce; - member_->use_all_reduce_ = true; - } - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - device_count, - 1, - platform::errors::Unavailable("Windows can support Single GPU only.")); - } -#endif - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - device_count, - 1, - platform::errors::PermissionDenied( - "Your machine has multiple cards, " - "but the WITH_NCCL option is not turned on during compilation, " - "and you cannot use multi-card training or prediction. 
" - "Please recompile and turn on the WITH_NCCL option.")); - } -#endif - - std::string device_name; - if (member_->use_device_ == p::kCPU) { - device_name = "CPU"; - } else if (member_->use_device_ == p::kCUDA) { - device_name = "CUDA"; - } else if (member_->use_device_ == p::kXPU) { - device_name = "XPU"; - } else { - PADDLE_THROW( - platform::errors::Unavailable("Only CPU/CUDA/XPU is supported. " - "please use CPU/CUDA/XPU backend.")); - } - - VLOG(1) << string::Sprintf( - "The Program will be executed on %s using ParallelExecutor, %lu " - "cards are used, so %lu programs are executed in parallel.", - device_name, - device_count, - device_count); - // FIXME(Yancey1989): parallel graph mode get better performance - // in GPU allreduce distributed training. Need an elegant way to - // choice the execution strategy. - member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(graph, member_->build_strategy_); - if (member_->build_strategy_.enable_parallel_graph_) { - LOG(INFO) << "The Executor would execute the graph by ParallelGraph " - "Execution which can get better performance," - << "you can force it off by env FLAGS_enable_parallel_graph=0"; - } -} - -void ParallelExecutor::CreateLocalScopes( - Scope *global_scope, - const std::vector &local_scopes, - bool create_new) { - if (local_scopes.empty()) { - member_->own_local_scope_ = true; - member_->local_scopes_.emplace_back(global_scope); - for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&global_scope->NewScope()); - } - } else { - member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), - local_scopes.size(), - platform::errors::PreconditionNotMet( - "member_->places_.size() = %d is not equal to " - "local_scopes.size() = %d", - member_->places_.size(), - local_scopes.size())); - for (size_t i = 0; i < member_->places_.size(); ++i) { - if (create_new) { - member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); - } else { - // Use local scopes directly - member_->local_scopes_.emplace_back(local_scopes[i]); - } - } - } -} - -std::unordered_map ParallelExecutor::CreateLocalExecScopes( - const std::vector &local_scopes, bool create_new) { - std::unordered_map scope_map; - - for (auto *scope : local_scopes) { - Scope *local_exec_scope = scope; - if (create_new) { - local_exec_scope = &scope->NewScope(); - } - member_->local_exec_scopes_.emplace_back(local_exec_scope); - scope_map.emplace(scope, local_exec_scope); - } - - PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), - member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), - member_->local_exec_scopes_.size())); - - return scope_map; -} - -std::vector ParallelExecutor::CloneGraphToMultiDevices( - ir::Graph *graph) { - std::vector graphs; - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), - false, - platform::errors::Unavailable( - "gpu mode does not support async_mode_ now!")); - graphs.push_back(graph); - for (size_t i = 1; i < member_->places_.size(); ++i) { - auto *tmp_graph = new ir::Graph(graph->OriginProgram()); - async_graphs_.emplace_back(tmp_graph); - graphs.push_back(tmp_graph); - } - } - - return graphs; -} - -void ParallelExecutor::PreludeToRun( - const std::vector &fetch_tensors) { - platform::RecordEvent record_run( - "ParallelExecutor::Run", 
platform::TracerEventType::UserDefined, 1); - VLOG(3) << "enter ParallelExecutor Run"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ(fetch_tensors.empty(), - true, - platform::errors::InvalidArgument( - "Cannot fetch data when using CUDA Graph.")); - PADDLE_ENFORCE_EQ( - member_->build_strategy_.allow_cuda_graph_capture_, - true, - platform::errors::InvalidArgument( - "You must turn on build_strategy.allow_cuda_graph_capture = True " - "to enable CUDA Graph capturing.")); - PADDLE_ENFORCE_EQ( - member_->places_[0], - platform::CUDAGraphCapturingPlace(), - platform::errors::InvalidArgument("The place to capture CUDAGraph is " - "not the same as the place to run.")); - } -#endif - -#ifdef WITH_GPERFTOOLS - if (gProfileStarted) { - ProfilerFlush(); - } -#endif -} - -void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { - if (member_->build_strategy_.reduce_ == - BuildStrategy::ReduceStrategy::kNoReduce) { - return; - } - - if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); - - // Initialize device context's nccl comm, will be used by normal - // Operators like sync_batch_norm, and collective ops. - // NOTE: more than one ParallelExecutor with same place, the nccl comm will - // be rewrite and there will be some problem. - // NOTE: NCCL group-calls and non-group-calls can not use the same - // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use - // same communicators. - auto *nccl_ctxs = member_->nccl_ctxs_->GetSyncBatchNormCtx( - global_scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (auto &place : member_->places_) { - auto *dev_ctx = static_cast(pool.Get(place)); - auto &nccl_ctx = nccl_ctxs->at(place); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with CUDA.")); -#endif - } - if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_XPU_BKCL) - member_->InitOrGetBKCLCommunicator(global_scope, member_->build_strategy_); - - auto *bkcl_ctxs = member_->bkcl_ctxs_->GetSyncBatchNormCtx( - global_scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->SetBkclContext(bkcl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with XPU.")); -#endif - } -} - -std::vector ParallelExecutor::CompileGraphWithBuildStrategy( - ir::Graph *graph, - std::vector *device_graphs, - const std::string &loss_var_name) { - auto device_count = member_->places_.size(); - std::vector async_graphs(device_count); - - auto &graphs = *device_graphs; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(graphs.size(), - device_count, - platform::errors::PreconditionNotMet( - "graphs.size() should be %d, but received %d", - device_count, - graphs.size())); - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply(graph, // NOLINT - {member_->places_[0]}, - loss_var_name, - {member_->local_scopes_[0]}, - 1, - member_->use_device_, - 
member_->nccl_ctxs_); - for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply(graphs[i], - {member_->places_[i]}, - loss_var_name, - {member_->local_scopes_[i]}, - 1, - member_->use_device_, - member_->nccl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply(graph, // NOLINT - member_->places_, - loss_var_name, - member_->local_scopes_, - member_->nranks_, - member_->use_device_, - member_->nccl_ctxs_); - } -#elif defined(PADDLE_WITH_XPU_BKCL) - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(graphs.size(), - device_count, - platform::errors::PreconditionNotMet( - "graphs.size() should be %d, but received %d", - device_count, - graphs.size())); - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply(graph, - {member_->places_[0]}, - loss_var_name, - {member_->local_scopes_[0]}, - 1, - member_->use_device_, - member_->bkcl_ctxs_); - for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply(graphs[i], - {member_->places_[i]}, - loss_var_name, - {member_->local_scopes_[i]}, - 1, - member_->use_device_, - member_->bkcl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply(graph, - member_->places_, - loss_var_name, - member_->local_scopes_, - member_->nranks_, - member_->use_device_, - member_->bkcl_ctxs_); - } -#else - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply(graph, - {member_->places_[0]}, - loss_var_name, - {member_->local_scopes_[0]}, - 1, - member_->use_device_); - for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply(graphs[i], - {member_->places_[i]}, - loss_var_name, - {member_->local_scopes_[i]}, - 1, - member_->use_device_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply(graph, - member_->places_, - loss_var_name, - member_->local_scopes_, - member_->nranks_, - member_->use_device_); - } -#endif - - return async_graphs; -} - -void ParallelExecutor::CreateVariableInfos( - std::vector *var_infos, ir::Graph *graph) { - PADDLE_ENFORCE_EQ( - var_infos->size(), - 0, - platform::errors::PreconditionNotMet( - "var_infos->size() should be 0, but received %d", var_infos->size())); - PADDLE_ENFORCE_EQ( - member_->is_persistable_.size(), - 0, - platform::errors::PreconditionNotMet( - "member_->is_persistable_.size() should be 0, but received %d", - member_->is_persistable_.size())); - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos->emplace_back(); - var_infos->back().name_ = node->Var()->Name(); - var_infos->back().type_ = node->Var()->GetType(); - var_infos->back().persistable_ = node->Var()->Persistable(); - - member_->is_persistable_.emplace(node->Var()->Name(), - node->Var()->Persistable()); - } - } - - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - var_infos->emplace_back(); - var_infos->back() = fused_var.second; - - member_->is_persistable_.emplace(fused_var.first, - fused_var.second.persistable_); - } - } -} - -std::vector ParallelExecutor::CreateSSAGraphExecutor( - const ExecutionStrategy &exec_strategy, - std::vector *async_graphs, - ir::Graph *graph) { - std::vector final_graphs; - - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_ = 
std::make_unique( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - *async_graphs); - final_graphs = *async_graphs; - } else if (member_->build_strategy_.enable_parallel_graph_) { - VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Yancey1989): Remove passing in the main_program when - // allreduce_seq_pass doesn't need it as the attr. - bool is_inference = details::IsDataParallelInferenceGraph(*graph); - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - - auto *pg_exe = - new details::ParallelSSAGraphExecutor(exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - graph); - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - - if (is_inference && member_->places_.size() > 1) { - member_->inference_executor_ = pg_exe; - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle should be compiled with CUDA for ParallelGraph Execution.")); -#endif - } else { - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto possible_inference_graphs = - details::TrySeparateToMultipleSingleDeviceGraphs(graph); - if (!possible_inference_graphs.empty()) { - for (auto &g : possible_inference_graphs) { - member_->ApplyFixOpRunOrderPass(g.get()); - } - - VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - std::move(possible_inference_graphs)); - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - member_->inference_executor_ = pg_exe; - } else { - if (member_->places_.size() == 1) { - member_->ApplyFixOpRunOrderPass(graph); - } - LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) - << "drop_last=False for DataLoader is not supported in training " - "network. 
It is automatically turned to drop_last=True."; - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_ = - std::make_unique( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - graph); - } else { - if (member_->use_device_ == p::kXPU) { -#if defined(PADDLE_WITH_XPU) - VLOG(3) << "use BindThreadedSSAGraphExecutor"; - member_->executor_ = - std::make_unique( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - graph); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_ = - std::make_unique( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - graph); - } - } - final_graphs.emplace_back(graph); - } - } - return final_graphs; -} - -void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( - const std::vector &final_graphs, - const std::unordered_map &scope_map) { - PADDLE_ENFORCE_GE( - final_graphs.size(), - 1, - platform::errors::PreconditionNotMet( - "final_graphs shoule contain at least one graph, but received %d", - final_graphs.size())); - - PADDLE_ENFORCE_GT(scope_map.size(), - 0, - platform::errors::PreconditionNotMet( - "scope_map shoule contain at least one " - "element, but received %d", - scope_map.size())); - for (auto *g : final_graphs) { - auto ops = ir::FilterByNodeWrapper(*g); - for (auto *op : ops) { - op->SetLocalExecScopes(scope_map); - op->SetIsVariantScope(true); - } - } -} - -void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( - const std::unordered_map &scope_map) { - auto inner_graph = const_cast(&Graph()); - std::vector graphs = {inner_graph}; - ResetOpHandleScopeMapOfGraphs(graphs, scope_map); -} - -void ParallelExecutor::SetReaderOpDeviceInfoOfGraphs( - const std::vector &final_graphs) { - if (final_graphs.size() == 1) { - ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); - } else { - for (size_t i = 0; i < final_graphs.size(); ++i) { - ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); - } - } -} - -const ir::Graph &ParallelExecutor::Graph() const { - return member_->executor_->Graph(); -} - -void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) { - const auto &build_strategy = member_->build_strategy_; - if (!build_strategy.allow_cuda_graph_capture_) return; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - build_strategy.async_mode_, - false, - platform::errors::InvalidArgument( - "Async Executor does not support CUDA Graph capturing.")); - PADDLE_ENFORCE_EQ( - platform::IsCUDAGraphCapturing(), - false, - platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " - "when running the first batch.")); - PADDLE_ENFORCE_EQ( - member_->places_.size(), - 1, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when one GPU device is running.")); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), - true, - platform::errors::InvalidArgument( - "CUDA Graph is only supported on NVIDIA GPU device.")); - PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, - false, - platform::errors::InvalidArgument( - "FLAGS_sync_nccl_allreduce must be False to support " - "CUDA Graph capturing.")); - - std::unordered_map> all_vars; - for (auto &node : 
graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - auto *var_desc = node->Var(); - all_vars[var_desc->Name()].emplace_back(var_desc); - } - } - - auto mark_var_as_persistable = [&all_vars](const std::string &name) { - auto iter = all_vars.find(name); - if (iter != all_vars.end()) { - for (auto *var_desc : iter->second) { - var_desc->SetPersistable(true); - } - } - }; - - // Step 1: All fused vars must be persistable. - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - fused_var.second.persistable_ = true; - mark_var_as_persistable(fused_var.first); - } - } - - // Step 2: All pinned vars must be persistable. - if (graph->Has(details::kPinnedVars)) { - auto &pinned_vars = graph->Get(details::kPinnedVars); - for (auto &pinned_var : pinned_vars) { - mark_var_as_persistable(pinned_var); - } - } - - // Step 3: Move all main programs to startup programs to make sure that - // the main programs would only be run once. - if (graph->Has(details::kProgramDescs)) { - auto &startup_programs = - graph->GetOrInit(details::kStartupProgramDescs); - auto &main_programs = - graph->Get(details::kProgramDescs); - for (auto &main_program : main_programs) { - startup_programs.emplace_back(main_program); - } - graph->Erase(details::kProgramDescs); - } - - // Step 4: Mark all vars in startup programs to be persistable. - if (graph->Has(details::kStartupProgramDescs)) { - auto &startup_programs = - graph->GetOrInit(details::kStartupProgramDescs); - for (auto &startup_program : startup_programs) { - for (auto &op_desc : startup_program.Block(0).AllOps()) { - for (auto &output : op_desc->OutputArgumentNames()) { - mark_var_as_persistable(output); - } - } - } - } - - // Step 5: ScaleLossGrad must be run beforehand to avoid H2D copy. - auto ops = ir::FilterByNodeWrapper(*graph); - auto *scope = member_->local_scopes_[0]; - for (auto *op : ops) { - auto *loss_grad_op = dynamic_cast(op); - if (loss_grad_op == nullptr) continue; - auto loss_grad_name = loss_grad_op->LossGradName(); - mark_var_as_persistable(loss_grad_name); - loss_grad_op->RunOnVar(scope->Var(loss_grad_name)); - loss_grad_op->SetSkipRunning(true); - } -#else - PADDLE_THROW(platform::errors::Unimplemented( - "CUDA Graph is only supported on NVIDIA GPU device.")); -#endif -} - -} // namespace framework -} // namespace paddle - -USE_PASS(reference_count_pass); -USE_PASS(eager_deletion_pass); -USE_PASS(buffer_shared_inplace_pass); -USE_PASS(buffer_shared_cross_op_memory_reuse_pass); -USE_PASS(inplace_addto_op_pass); -USE_PASS(fix_op_run_order_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h deleted file mode 100644 index 43e27e81cd135..0000000000000 --- a/paddle/fluid/framework/parallel_executor.h +++ /dev/null @@ -1,160 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#endif - -namespace paddle { -namespace framework { - -class ParallelExecutorPrivate; - -using details::BuildStrategy; -using details::ExecutionStrategy; -using details::VariableInfo; -namespace p = paddle::platform; -using DeviceType = paddle::platform::DeviceType; - -class ParallelExecutor { - DISABLE_COPY_AND_ASSIGN(ParallelExecutor); - - public: - TEST_API explicit ParallelExecutor(const std::vector &places, - const std::vector &bcast_vars, - const std::string &loss_var_name, - Scope *scope, - const std::vector &local_scopes, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - ir::Graph *graph); - - // NOTE(Aurelius84): Construct a PE running on single device for @to_static - explicit ParallelExecutor(const platform::Place &place, - Scope *scope, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - ir::Graph *graph); - - TEST_API ~ParallelExecutor(); - - size_t DeviceCount() const; - - std::vector &GetLocalScopes(); - - void DropLocalExeScopes(); - - // This API is used to check whether DropLocalExeScopes work. - bool NeedCreateLocalExeScope(); - - /** - * Feed tensors to local scopes. The size of tensors should be equal to the - * size of local scopes. - */ - void FeedTensorsIntoLocalScopes( - const std::vector> - &tensors); - - void FeedAndSplitTensorIntoLocalScopes( - const std::unordered_map &tensors); - - FetchUnmergedList Run(const std::vector &fetch_tensors); - FetchList RunAndMerge(const std::vector &fetch_tensors); - - void RunWithoutFetch(const std::vector &skip_eager_vars); - - void ResetOpHandleScopeMapOfGraphs( - const std::unordered_map &scope_map); - - TEST_API const ir::Graph &Graph() const; - void PrepareVariables(Scope *scope); - - void SkipMemoryReuse(size_t scope_idx, - const std::vector &skip_vars); - - private: - // broadcast the parameters from the 0th device. - // trainer_id the trainer index in nccl distributed training. 
- void BCastParamsToDevices(const std::vector &vars, - int trainer_id = 0) const; - bool EnableParallelGraphExecution(const ir::Graph &graph, - const BuildStrategy &build_strategy) const; - - void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - size_t device_count, - const ir::Graph &graph); - - void CreateLocalScopes(Scope *global_scope, - const std::vector &local_scopes, - bool create_new); - - std::unordered_map CreateLocalExecScopes( - const std::vector &local_scopes, bool create_new); - - std::vector CloneGraphToMultiDevices(ir::Graph *graph); - - void PreludeToRun(const std::vector &fetch_tensors); - - void PrepareNCCLCommunicator(Scope *global_scope); - - std::vector CompileGraphWithBuildStrategy( - ir::Graph *graph, - std::vector *graphs, - const std::string &loss_var_name); - - void CreateVariableInfos(std::vector *var_infos, - ir::Graph *graph); - - std::vector CreateSSAGraphExecutor( - const ExecutionStrategy &exec_strategy, - std::vector *async_graphs, - ir::Graph *graph); - - void ResetOpHandleScopeMapOfGraphs( - const std::vector &final_graphs, - const std::unordered_map &scope_map); - - void SetReaderOpDeviceInfoOfGraphs( - const std::vector &final_graphs); - - void PrepareForCUDAGraphCapture(ir::Graph *graph); - - ParallelExecutorPrivate *member_; - std::vector> async_graphs_; - std::vector var_infos_; -}; -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt index 643145f78ddbf..2ed94b777c066 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -2,7 +2,7 @@ proto_library(paddle_jit_property_proto SRCS property.proto) file(GLOB_RECURSE fluid_jit_srcs "*.cc") set(fluid_jit_deps paddle_jit_property_proto proto_desc executor - parallel_executor compiled_program) + compiled_program) cc_library( fluid_jit diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 494f533a4d4bc..b609e89806d9b 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -23,7 +23,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/executor_cache.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -218,15 +217,14 @@ template class RunProgramOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_THROW(phi::errors::InvalidArgument("Not supported yet!")); const auto &capture_mode = ctx.Attr("cuda_graph_capture_mode"); - auto is_test = ctx.Attr("is_test"); if (capture_mode.empty()) { - ComputeImpl(ctx, is_test, false); return; } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto mode = details::StringToCUDAGraphCaptureMode(capture_mode); + auto is_test = ctx.Attr("is_test"); PADDLE_ENFORCE_EQ( ctx.GetPlace().GetType() == phi::AllocationType::GPU, true, @@ -242,20 +240,6 @@ class RunProgramOpKernel : public framework::OpKernel { inner_graphs.resize(std::max(3, inner_graphs.size())); size_t graph_idx = is_test ? 
0 : 1; if (inner_graphs[graph_idx].get() == nullptr) { - int64_t pool_id; - if (inner_graphs[1 - graph_idx].get() != nullptr) { - pool_id = inner_graphs[1 - graph_idx]->PoolID(); - } else { - pool_id = ctx.Attr("cuda_graph_pool_id"); - } - - framework::PEAndGraphPair pe_and_graph; - auto callable = [this, is_test, &pe_and_graph]( - const framework::ExecutionContext &exe_ctx) { - pe_and_graph = ComputeImpl(exe_ctx, is_test, true); - }; - inner_graphs[graph_idx] = CaptureCUDAGraph( - callable, ctx, {"X"}, {"Out", "DOut"}, mode, pool_id); VLOG(10) << "Capture Forward CUDA Graph"; } else { VLOG(10) << "Run Forward CUDA Graph directly"; @@ -268,162 +252,19 @@ class RunProgramOpKernel : public framework::OpKernel { "valid when using NVIDIA GPU.")); #endif } - - private: - framework::PEAndGraphPair ComputeImpl(const framework::ExecutionContext &ctx, - bool is_test, - bool use_cuda_graph) const { - VLOG(2) << "RunProgramOpKernel Compute"; - framework::PEAndGraphPair pe_and_graph; - // Step 1. prepare inputs, outputs, attrs - auto &input_vars = ctx.MultiInputVar("X"); - auto ¶m_vars = ctx.MultiInputVar("Params"); - auto output_vars = ctx.MultiOutputVar("Out"); - auto dout_vars = ctx.MultiOutputVar("DOut"); - - auto input_var_names = ctx.InputNames("X"); - auto output_var_names = ctx.OutputNames("Out"); - std::vector dout_var_names; - if (!dout_vars.empty()) { - // DOut is a dispensable out, only get the names when it exists. - // Otherwise, it will throw a NotFound error. - dout_var_names = ctx.OutputNames("DOut"); - } - - // current program may not hold parameters - std::vector param_names; - if (!param_vars.empty()) { - param_names = ctx.InputNames("Params"); - } - - auto start_op_index = ctx.Attr("start_op_index"); - auto end_op_index = ctx.Attr("end_op_index"); - auto program_id = ctx.Attr("program_id"); - - // NOTE(chenweihang): In order not to add new variable type, use vector - // here. Originally, here can use scope directly. - auto *out_scope_vec = ctx.Output("OutScope"); - std::unique_ptr inner_scope{nullptr}; - if (out_scope_vec->size() == 0) { - // For cuda graph under static graph mode usage. - // For static graph mode, we cannot set value of a tensor before any run, - // the OutScope variable passed to the op actually contains nothing. - // Just create a tmp scope to run the program. - PADDLE_ENFORCE_EQ( - use_cuda_graph, - true, - phi::errors::InvalidArgument( - "If not provide OutScope then must run under cuda graph mode.")); - inner_scope = std::make_unique(); - } else { - PADDLE_ENFORCE_EQ( - out_scope_vec->size(), - 1, - phi::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should only hold one scope.")); - } - - // Step 2. prepare executor and init persistable variables - - // NOTE(Aurelius84): While training some models, forward can be called many - // times and then apply backpropagation all at once, such as Reinforcement - // Learning. Tensor data in multi-step training should be saved into single - // scope separately. Otherwise, the gradients can be miscalculated because - // always using the Tensor data of the last step in forward. - framework::Scope *global_inner_scope = - out_scope_vec->size() == 0 ? 
inner_scope.get() : out_scope_vec->front(); - VLOG(2) << "The number of sub scopes before forward: " - << global_inner_scope->kids().size(); - framework::Scope &scope = global_inner_scope->NewScope(); - - // share input_vars & parameters into scope - details::ShareVarsIntoScope(input_vars, input_var_names, &scope); - details::ShareVarsIntoScope(param_vars, param_names, &scope); - - auto *global_block = ctx.Attr("global_block"); - - if (end_op_index > start_op_index) { - auto *program = global_block->Program(); - bool is_new_created; - if (use_cuda_graph) { - pe_and_graph = framework::CreateFixOrderExecutorInfo( - *program, ctx.GetPlace(), start_op_index, end_op_index, &scope); - is_new_created = true; - } else { - auto cache_info = framework::GetExecutorInfoFromCache(*program, - ctx.GetPlace(), - start_op_index, - end_op_index, - /*is_grad=*/false, - program_id, - &scope); - pe_and_graph.first = cache_info.first; - is_new_created = cache_info.second; - } - - auto ¶llel_executor = pe_and_graph.first; - - // all out_vars are skip_eager_var - std::vector tmp_vars; - auto &skip_eager_delete_vars = - use_cuda_graph - ? tmp_vars - : framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - program_id, false); - if (is_new_created) { - parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_var_names); - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - output_var_names.begin(), - output_var_names.end()); - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - dout_var_names.begin(), - dout_var_names.end()); - framework::details::ParseSafeEagerDeletionSkipVars( - *program, end_op_index, output_var_names, &skip_eager_delete_vars); - } - - // Step 3. run ops - parallel_executor->RunWithoutFetch(skip_eager_delete_vars); - } - // Step 4. Get Output - details::ShareVarsFromScope( - output_vars, output_var_names, *global_block, &scope); - details::ShareVarsFromScope( - dout_vars, dout_var_names, *global_block, &scope); - - // Debug info: scope info when run end - framework::Scope *target_scope{nullptr}; - if (out_scope_vec->size() == 0) { - target_scope = inner_scope.get(); - } else { - target_scope = out_scope_vec->front(); - } - VLOG(3) << framework::GenScopeTreeDebugInfo(target_scope); - // Step 5. Drop all children scopes while testing. - if (is_test) { - target_scope->DropKids(); - } - VLOG(2) << "The number of sub scopes after forward: " - << target_scope->kids().size(); -#ifdef PADDLE_WITH_DNNL - if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace()); -#endif - return pe_and_graph; - } }; template class RunProgramGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_THROW(phi::errors::InvalidArgument("Not supported yet!")); const auto &capture_mode = ctx.Attr("cuda_graph_capture_mode"); if (capture_mode.empty()) { - ComputeImpl(ctx, false); return; } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto mode = details::StringToCUDAGraphCaptureMode(capture_mode); PADDLE_ENFORCE_EQ( ctx.GetPlace().GetType() == phi::AllocationType::GPU, true, @@ -440,21 +281,6 @@ class RunProgramGradOpKernel : public framework::OpKernel { ->GetMutable>>()); const size_t graph_idx = 2; if (inner_graphs[graph_idx].get() == nullptr) { - framework::PEAndGraphPair pe_and_graph; - auto callable = - [this, &pe_and_graph](const framework::ExecutionContext &exe_ctx) { - pe_and_graph = ComputeImpl(exe_ctx, true); - }; - int64_t pool_id = inner_graphs[0].get() != nullptr - ? 
inner_graphs[0]->PoolID() - : inner_graphs[1]->PoolID(); - inner_graphs[graph_idx] = - CaptureCUDAGraph(callable, - ctx, - {framework::GradVarName("Out")}, - {framework::GradVarName("X")}, - mode, - pool_id); VLOG(10) << "Capture Backward CUDA Graph"; } else { ExecuteCUDAGraph(ctx, @@ -469,123 +295,6 @@ class RunProgramGradOpKernel : public framework::OpKernel { "valid when using NVIDIA GPU.")); #endif } - - private: - framework::PEAndGraphPair ComputeImpl(const framework::ExecutionContext &ctx, - bool use_cuda_graph) const { - VLOG(2) << "RunProgramGradOpKernel Compute"; - framework::PEAndGraphPair pe_and_graph; - // Step 1. prepare inputs and outputs - auto &output_grad_vars = ctx.MultiInputVar(framework::GradVarName("Out")); - auto input_grad_vars = ctx.MultiOutputVar(framework::GradVarName("X")); - auto param_grad_vars = ctx.MultiOutputVar(framework::GradVarName("Params")); - - // if all output vars are set to stop_gradient, grad op no need to executed - if (input_grad_vars.empty() && param_grad_vars.empty()) { - return pe_and_graph; - } - - auto output_grad_var_names = ctx.InputNames(framework::GradVarName("Out")); - // NOTE: after PR22939 [Add double grad] merged, the grad op maker's - // SetOutput will set to None if the input var stop_gradient=True, - // it will cause an NotFound error when ctx.OutputNames() is called - std::vector input_grad_var_names; - std::vector param_grad_names; - if (!input_grad_vars.empty()) { - input_grad_var_names = ctx.OutputNames(framework::GradVarName("X")); - } - if (!param_grad_vars.empty()) { - param_grad_names = ctx.OutputNames(framework::GradVarName("Params")); - } - - auto *block = ctx.Attr("global_block"); - auto orig_end_op_index = ctx.Attr("end_op_index"); - auto program_id = ctx.Attr("program_id"); - // NOTE: skip `shape` and `fill_constant` op created by - // fluid.backward.gradients, one forward output will generate one `shape` - // and `fill_constant` - int64_t start_op_index = orig_end_op_index + (output_grad_vars.size() * 2); - int64_t end_op_index = block->OpSize(); - - auto *out_scope_vec = ctx.Input("OutScope"); - PADDLE_ENFORCE_EQ( - out_scope_vec->size(), - 1, - phi::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should only hold one scope.")); - - framework::Scope *global_inner_scope = out_scope_vec->front(); - auto sub_scope_num = global_inner_scope->kids().size(); - VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; - PADDLE_ENFORCE_GT(sub_scope_num, - 0, - phi::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should hold at " - "least one sub scope.")); - - auto &scope = *(global_inner_scope->kids().front()); - auto *global_block = ctx.Attr("global_block"); - - if (end_op_index > start_op_index) { - // Step 2. prepare executor and scope - auto *program = global_block->Program(); - bool is_new_created; - if (use_cuda_graph) { - pe_and_graph = framework::CreateFixOrderExecutorInfo( - *program, ctx.GetPlace(), start_op_index, end_op_index, &scope); - is_new_created = true; - } else { - auto cache_info = framework::GetExecutorInfoFromCache(*program, - ctx.GetPlace(), - start_op_index, - end_op_index, - /*is_grad*/ true, - program_id, - &scope); - pe_and_graph.first = cache_info.first; - is_new_created = cache_info.second; - } - - auto ¶llel_executor = pe_and_graph.first; - std::vector tmp_vars; - auto &skip_eager_delete_vars = - use_cuda_graph - ? 
tmp_vars - : framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - program_id, true); - if (is_new_created) { - parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, - output_grad_var_names); - - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - input_grad_var_names.begin(), - input_grad_var_names.end()); - framework::details::AppendSkipDeletionVars(param_grad_names, - &skip_eager_delete_vars); - } - - details::ShareVarsIntoScope( - output_grad_vars, output_grad_var_names, &scope); - // Debug info: scope info when run end - VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); - - // Step 3. run ops - parallel_executor->RunWithoutFetch( - /*skip_eager_delete_vars=*/skip_eager_delete_vars); - } - - // Step 4. get outputs - details::ShareVarsFromScope( - input_grad_vars, input_grad_var_names, *global_block, &scope); - details::ShareVarsFromScope( - param_grad_vars, param_grad_names, *global_block, &scope); - - // Step5. drop current scope - global_inner_scope->DeleteScope(&scope); - VLOG(2) << "The number of sub scopes after backward: " - << global_inner_scope->kids().size(); - return pe_and_graph; - } }; } // namespace operators diff --git a/paddle/fluid/pybind/compiled_program.cc b/paddle/fluid/pybind/compiled_program.cc index 7b3ac4a2467f0..fdc9b1c468a00 100644 --- a/paddle/fluid/pybind/compiled_program.cc +++ b/paddle/fluid/pybind/compiled_program.cc @@ -996,12 +996,6 @@ void BindCompiledProgram(pybind11::module &m) { // NOLINT optimization passes should be defined in this way. BuildStrategy cannot be updated after being finalized.)DOC"); - m.def("_set_cached_executor_build_strategy", - [](int64_t program_id, const BuildStrategy &build_strategy) { - auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); - cached_exe_info.SetBuildStrategy(program_id, build_strategy); - }); - cp.def(py::init &, const std::vector &, const std::string &, diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index adf5852aabb64..71c0699c7ca44 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -55,7 +55,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ae49f2594ce0a..64a7e212d8b1e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2955,7 +2955,6 @@ All parameter, weight, gradient are variables in Paddle. #endif // PADDLE_WITH_CUDA m.def("clear_executor_cache", []() { pybind11::gil_scoped_release release; - framework::ExecutorInfoCache::Instance().Finalize(); framework::InterpreterCoreInfoCache::Instance().Finalize(); }); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 098d7182f5a02..39cb9c645e537 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -55,7 +55,6 @@ limitations under the License. 
*/
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py
index f412a954c0bb0..fdf5e97e3452d 100644
--- a/python/paddle/base/core.py
+++ b/python/paddle/base/core.py
@@ -308,7 +308,6 @@ def to_list(s):
     _RecordEvent,
     _Scope,
     _set_amp_op_list,
-    _set_cached_executor_build_strategy,
     _set_current_stream,
     _set_eager_deletion_mode,
     _set_fuse_parameter_group_size,
diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py
index f4fc6ea387f97..1525071934805 100644
--- a/python/paddle/jit/dy2static/partial_program.py
+++ b/python/paddle/jit/dy2static/partial_program.py
@@ -442,9 +442,6 @@ def _train_pure_fp16_forward_backward_program(self):
     @LazyInitialized
     def _train_program_id(self):
         program_id = paddle.utils._hash_with_id(self._train_program, self)
-        core._set_cached_executor_build_strategy(
-            program_id, self._build_strategy
-        )
         return program_id

     @LazyInitialized
@@ -454,9 +451,6 @@ def _infer_program_id(self):
     @LazyInitialized
     def _train_amp_program_id(self):
         program_id = paddle.utils._hash_with_id(self._train_amp_program, self)
-        core._set_cached_executor_build_strategy(
-            program_id, self._build_strategy
-        )
         return program_id

     @LazyInitialized
@@ -468,9 +462,6 @@ def _train_pure_fp16_program_id(self):
         program_id = paddle.utils._hash_with_id(
             self._train_pure_fp16_program, self
         )
-        core._set_cached_executor_build_strategy(
-            program_id, self._build_strategy
-        )
         return program_id

     @LazyInitialized
diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py
index ff6ee46c8a1f9..cb05d818d3a86 100644
--- a/python/paddle/jit/dy2static/pir_partial_program.py
+++ b/python/paddle/jit/dy2static/pir_partial_program.py
@@ -789,9 +789,6 @@ def pass_fn(forward_program, backward_program):
     @cached_property
     def _train_program_id(self):
         program_id = paddle.utils._hash_with_id(self.train_program, self)
-        core._set_cached_executor_build_strategy(
-            program_id, self._build_strategy
-        )
         return program_id

     @cached_property
diff --git a/test/deprecated/legacy_test/CMakeLists.txt b/test/deprecated/legacy_test/CMakeLists.txt
index 61046057f7c7a..8eab1efdf26a3 100644
--- a/test/deprecated/legacy_test/CMakeLists.txt
+++ b/test/deprecated/legacy_test/CMakeLists.txt
@@ -405,7 +405,6 @@ list(REMOVE_ITEM TEST_OPS test_imperative_mnist_sorted_gradient)
 list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
 list(REMOVE_ITEM TEST_OPS test_layers_deprecated)
 list(REMOVE_ITEM TEST_OPS test_imperative_ocr_attention_model)
-list(REMOVE_ITEM TEST_OPS test_async_ssa_graph_executor_mnist)
 list(REMOVE_ITEM TEST_OPS test_install_check)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt
index 98b741742bad8..9e1b92ab17bcf 100644
--- a/test/legacy_test/CMakeLists.txt
+++ b/test/legacy_test/CMakeLists.txt
@@ -427,7 +427,6 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet_sorted_gradient)
 list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext)
-list(REMOVE_ITEM TEST_OPS test_async_ssa_graph_executor_mnist)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh
index 5d259e101b56d..7e66016dfec1b 100644
--- a/tools/windows/run_unittests.sh
+++ b/tools/windows/run_unittests.sh
@@ -291,8 +291,8 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\
 ^test_trt_convert_unsqueeze2$|\
 ^test_simplify_with_basic_ops_pass_autoscan$|\
 ^test_trt_convert_nearest_interp$|\
-^test_trt_pool_op$|\
-^test_trt_convert_clip$|\
+^test_trt_pool_op$|\
+^test_trt_convert_clip$|\
 ^test_trt_convert_grid_sampler$|\
 ^test_trt_convert_p_norm$|\
 ^disable_wingpu_cuda12_test$"
@@ -427,6 +427,7 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\
 ^disable_win_inference_test$|\
 ^test_imperative_double_grad$|\
 ^test_comp_eager_matmul_double_grad$|\
+^test_cuda_graph_partial_graph_static_run$|\
 ^test_imperative_triple_grad$"
@@ -544,9 +545,9 @@ if [ ${WITH_GPU:-OFF} == "ON" ];then
     if [ ${PRECISION_TEST:-OFF} == "ON" ]; then
         python ${PADDLE_ROOT}/tools/get_pr_ut.py || echo "Failed to obtain ut_list !"
     fi
-
+
     python ${PADDLE_ROOT}/tools/group_case_for_parallel.py ${PADDLE_ROOT}
-
+
 fi
 failed_test_lists=''
@@ -715,7 +716,7 @@ if [ "${WITH_GPU:-OFF}" == "ON" ];then
     done < $PADDLE_ROOT/tools/single_card_tests_new
     single_ut_endTime_s=`date +%s`
     single_ut_Time_s=`expr $single_ut_endTime_s - $single_ut_startTime_s`
-    echo "ipipe_log_param_1_TestCases_Total_Time: $single_ut_Time_s s"
+    echo "ipipe_log_param_1_TestCases_Total_Time: $single_ut_Time_s s"
     multiple_ut_mem_0_startTime_s=`date +%s`
     while read line
@@ -724,8 +725,8 @@ if [ "${WITH_GPU:-OFF}" == "ON" ];then
     done < $PADDLE_ROOT/tools/multiple_card_tests_mem0_new
     multiple_ut_mem_0_endTime_s=`date +%s`
     multiple_ut_mem_0_Time_s=`expr $multiple_ut_mem_0_endTime_s - $multiple_ut_mem_0_startTime_s`
-    echo "ipipe_log_param_2_mem0_TestCases_Total_Time: $multiple_ut_mem_0_Time_s s"
-
+    echo "ipipe_log_param_2_mem0_TestCases_Total_Time: $multiple_ut_mem_0_Time_s s"
+
     multiple_ut_startTime_s=`date +%s`
     while read line
     do
@@ -749,7 +750,7 @@ if [ "${WITH_GPU:-OFF}" == "ON" ];then
     done < $PADDLE_ROOT/tools/exclusive_card_tests_mem0_new
     exclusive_ut_mem_0_endTime_s=`date +%s`
     exclusive_ut_mem_0_Time_s=`expr $exclusive_ut_mem_0_endTime_s - $exclusive_ut_mem_0_startTime_s`
-    echo "ipipe_log_param_-1_mem0_TestCases_Total_Time: $exclusive_ut_mem_0_Time_s s"
+    echo "ipipe_log_param_-1_mem0_TestCases_Total_Time: $exclusive_ut_mem_0_Time_s s"
     exclusive_ut_startTime_s=`date +%s`
     while read line