From a4a378dcf1b54e0c2301b21b78e29e876c3d09c1 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Mon, 18 Dec 2023 11:48:36 +0000 Subject: [PATCH 1/9] fix bug of RunWithExternalStream API in new executor --- .../new_executor/program_interpreter.cc | 49 +++++++++++++++++++ .../new_executor/program_interpreter.h | 1 + .../fluid/inference/api/analysis_predictor.cc | 4 ++ paddle/phi/core/flags.cc | 14 ++++++ .../api/analysis_predictor_tester.cc | 1 + 5 files changed, 69 insertions(+) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 3e849670e4699..688a919bcb0fb 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -41,6 +41,9 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PHI_DECLARE_bool(inference_switch_stream); +#endif PD_DECLARE_bool(enable_host_event_recorder_hook); PD_DECLARE_bool(log_memory_stats); PHI_DECLARE_string(static_runtime_data_save_path); @@ -163,6 +166,12 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, is_build_ = true; is_shared_results_build_ = true; } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_inference_switch_stream) { + UpdateDevCtx(&op_func_nodes); + FLAGS_inference_switch_stream = false; + } +#endif RunImpl(); } @@ -879,6 +888,46 @@ void ProgramInterpreter::Convert( AnalyseExecuteOrderForTrace(); } +void ProgramInterpreter::UpdateDevCtx( + std::vector* op_func_nodes) { + auto nodes = *op_func_nodes; + auto op_nums = nodes.size(); + vec_instruction_.clear(); + vec_instruction_.reserve(op_nums); + for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { + auto& op_func_node = nodes[op_idx]; + stream_analyzer_.SetForceEventsToWaitInfo(force_evnets_to_wait_); + auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); +#ifdef PADDLE_WITH_CUDA + if (FLAGS_new_executor_use_cuda_graph) { + auto& op = op_func_node.operator_base_; + auto& op_type = op->Type(); + if (op_type == interpreter::kMemcpyD2H || + op_type == interpreter::kMemcpyH2D) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Cuda memory copy d2h/h2d is not allowed while using cuda graph.")); + } + PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext), + true, + platform::errors::InvalidArgument( + "Device context of op %s must be [%s] while using " + "cuda graph, but got [%s].", + op_type, + typeid(phi::GPUContext).name(), + typeid(*dev_ctx_).name())); + // cuda graph needs to record all stream + phi::backends::gpu::CUDAGraphContextManager::Instance() + .RecordCapturingDeviceContext(dev_ctx_); + } +#endif + vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + vec_instruction_.back().UpdataRecordStreamForGcInfo(); +#endif + } +} + void ProgramInterpreter::BuildSkipShareLoDInfo() { for (size_t i = 0; i < vec_instruction_.size(); ++i) { bool can_skip_lod = true; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index b19e3a06a4258..2a97c7d04e8a8 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -125,6 +125,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { void BuildSkipShareLoDInfo(); void UpdateSyncOpNum(); void AnalyseExecuteOrderForTrace(); + 
void UpdateDevCtx(std::vector* op_func_nodes); // inplace void BuildInplace(); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 86fe675e61aa5..23ca4586d4ae3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -119,6 +119,9 @@ PHI_DECLARE_bool(enable_pir_in_executor); PHI_DECLARE_bool(pir_apply_inplace_pass); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PHI_DECLARE_bool(inference_switch_stream); +#endif namespace paddle { namespace { @@ -2362,6 +2365,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { })); auto &pool = paddle::experimental::DeviceContextPool::Instance(); pool.SyncDeviceContext(place_); + FLAGS_inference_switch_stream = true; } return ZeroCopyRun(); diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index ea1af5eee4d0b..eec49fffda1d3 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1108,6 +1108,20 @@ PHI_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph, false, "Use CUDA Graph in new executor"); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +/* + * Inference switch stream related FLAG + * Name: FLAGS_inference_switch_stream + * Since Version: 2.6 + * Value Range: bool, default=false + * Example: FLAGS_inference_switch_stream=true would switch + * It is possible for this flag to be set to true in RunWithExternalStream API. + */ +PHI_DEFINE_EXPORTED_bool(inference_switch_stream, + false, + "Swich stream when inference"); +#endif + /* * Executor related FLAG * Name: FLAGS_executor_log_deps_every_microseconds diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc index 3d841954a89d6..ee86957b5a100 100644 --- a/test/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/cpp/inference/api/analysis_predictor_tester.cc @@ -705,6 +705,7 @@ TEST(Tensor, RunWithExternalStream) { cudaStreamCreate(&external_stream); Config tmp_config(config); tmp_config.SetExecStream(external_stream); + tmp_config.EnableNewExecutor(); predictor->Run(); paddle_infer::experimental::InternalUtils::RunWithExternalStream( predictor.get(), external_stream); From a944a6586bf4951fe30009774315a9f0e78e026e Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Mon, 25 Dec 2023 05:36:08 +0000 Subject: [PATCH 2/9] add test --- test/cpp/inference/api/analysis_predictor_tester.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc index ee86957b5a100..3d87140d9c05a 100644 --- a/test/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/cpp/inference/api/analysis_predictor_tester.cc @@ -668,6 +668,7 @@ TEST(Tensor, RunWithExternalStream) { cudaStream_t stream; cudaStreamCreate(&stream); config.SetExecStream(stream); + config.EnableNewExecutor(); auto predictor = CreatePredictor(config); auto w0 = predictor->GetInputHandle("firstw"); @@ -703,9 +704,7 @@ TEST(Tensor, RunWithExternalStream) { cudaStream_t external_stream; cudaStreamCreate(&external_stream); - Config tmp_config(config); - tmp_config.SetExecStream(external_stream); - tmp_config.EnableNewExecutor(); + predictor->Run(); paddle_infer::experimental::InternalUtils::RunWithExternalStream( predictor.get(), external_stream); From ce2f9e8825fbb3d7ec2745e173aa291711d2dbf8 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Fri, 29 Dec 2023 07:07:57 +0000 
Subject: [PATCH 3/9] fix bug of RunWithExternalStream API in new executor --- .../framework/new_executor/pir_interpreter.cc | 15 +++++++ .../new_executor/program_interpreter.cc | 42 ++----------------- .../new_executor/program_interpreter.h | 3 +- paddle/phi/core/flags.cc | 2 +- 4 files changed, 22 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 82bf2973345ad..ba4abc2bffacc 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -77,6 +77,9 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PHI_DECLARE_bool(inference_switch_stream); +#endif PHI_DECLARE_bool(enable_pir_in_executor); PHI_DECLARE_bool(enable_pir_in_executor_trace_run); @@ -1305,6 +1308,12 @@ paddle::framework::FetchList PirInterpreter::Run( LOG_FIRST_N(INFO, 1) << "pir interpreter is running by trace mode ..."; TraceRunImpl(); } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_inference_switch_stream) { + BuildInstruction(); + FLAGS_inference_switch_stream = false; + } +#endif LOG_FIRST_N(INFO, 1) << "pir interpreter is running by multi-thread mode ..."; MultiThreadRunImpl(); @@ -1396,6 +1405,12 @@ FetchList PirInterpreter::Run(const std::vector& feed_names, is_build_ = true; is_shared_results_build_ = true; } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_inference_switch_stream) { + BuildInstruction(); + FLAGS_inference_switch_stream = false; + } +#endif if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 || execution_config_.used_for_inference || ((execution_config_.used_for_jit || execution_config_.used_for_cinn) && diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 6fb1fbf710f9e..9fa7c0791fc58 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -168,7 +168,7 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_inference_switch_stream) { - UpdateDevCtx(&op_func_nodes); + BuildOpFuncNode(&op_func_nodes); FLAGS_inference_switch_stream = false; } #endif @@ -680,42 +680,7 @@ std::tuple ProgramInterpreter::InterpreterRunTime() { void ProgramInterpreter::Convert( std::vector* op_func_nodes) { auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); - auto nodes = *op_func_nodes; - auto op_nums = nodes.size(); - vec_instruction_.clear(); - vec_instruction_.reserve(op_nums); - for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { - auto& op_func_node = nodes[op_idx]; - stream_analyzer_.SetForceEventsToWaitInfo(force_evnets_to_wait_); - auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); -#ifdef PADDLE_WITH_CUDA - if (FLAGS_new_executor_use_cuda_graph) { - auto& op = op_func_node.operator_base_; - auto& op_type = op->Type(); - if (op_type == interpreter::kMemcpyD2H || - op_type == interpreter::kMemcpyH2D) { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Cuda memory copy d2h/h2d is not allowed while using cuda graph.")); - } - PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext), - true, - platform::errors::InvalidArgument( - "Device context of op %s must be [%s] while using " - "cuda graph, but got [%s].", - op_type, - 
typeid(phi::GPUContext).name(), - typeid(*dev_ctx_).name())); - // cuda graph needs to record all stream - phi::backends::gpu::CUDAGraphContextManager::Instance() - .RecordCapturingDeviceContext(dev_ctx_); - } -#endif - vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - vec_instruction_.back().UpdataRecordStreamForGcInfo(); -#endif - } + BuildOpFuncNode(op_func_nodes); BuildOperatorDependences(); @@ -752,6 +717,7 @@ void ProgramInterpreter::Convert( } // calculate last_live_ops_ + auto op_nums = (*op_func_nodes).size(); for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { Instruction& instr = vec_instruction_[op_idx]; OpInOutInfo info; @@ -888,7 +854,7 @@ void ProgramInterpreter::Convert( AnalyseExecuteOrderForTrace(); } -void ProgramInterpreter::UpdateDevCtx( +void ProgramInterpreter::BuildOpFuncNode( std::vector* op_func_nodes) { auto nodes = *op_func_nodes; auto op_nums = nodes.size(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 2a97c7d04e8a8..580a292f4fcf5 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -125,7 +125,8 @@ class ProgramInterpreter : public InterpreterBaseImpl { void BuildSkipShareLoDInfo(); void UpdateSyncOpNum(); void AnalyseExecuteOrderForTrace(); - void UpdateDevCtx(std::vector* op_func_nodes); + void BuildOpFuncNode( + std::vector* op_func_nodes); // inplace void BuildInplace(); diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index eec49fffda1d3..abb0b6f66e440 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1112,7 +1112,7 @@ PHI_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph, /* * Inference switch stream related FLAG * Name: FLAGS_inference_switch_stream - * Since Version: 2.6 + * Since Version: 2.7 * Value Range: bool, default=false * Example: FLAGS_inference_switch_stream=true would switch * It is possible for this flag to be set to true in RunWithExternalStream API. 
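
For reference, the usage exercised by the updated analysis_predictor_tester.cc above boils down to the following condensed sketch. It is based on the test changes in patches 1-2 and is not a verbatim excerpt: the model path and input feeding are omitted/hypothetical and the include path is approximate. config.EnableNewExecutor() is the key addition, since the fix targets the new executor path.

#include <string>
#include <cuda_runtime.h>
#include "paddle_inference_api.h"  // include path depends on the install layout

void RunOnExternalStream(const std::string& model_dir) {
  paddle_infer::Config config;
  config.SetModel(model_dir);        // hypothetical model location
  config.EnableUseGpu(100, 0);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  config.SetExecStream(stream);      // bind the initial execution stream
  config.EnableNewExecutor();        // required: the fix targets the new executor

  auto predictor = paddle_infer::CreatePredictor(config);
  // ... feed inputs via predictor->GetInputHandle(name) ...

  predictor->Run();                  // first run builds on the original stream

  cudaStream_t external_stream;
  cudaStreamCreate(&external_stream);
  // Passing a different stream triggers the switch-stream path, which rebuilds
  // the instructions/device contexts before this run.
  paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor.get(), external_stream);
}
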
From f61d24a7e1b2fb184274a733fbd4c49da8e2f485 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Tue, 2 Jan 2024 03:32:07 +0000 Subject: [PATCH 4/9] reset flage in RunWithExternalStream --- paddle/fluid/framework/new_executor/pir_interpreter.cc | 1 - paddle/fluid/framework/new_executor/program_interpreter.cc | 1 - paddle/fluid/inference/api/analysis_predictor.cc | 5 +++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index ba4abc2bffacc..13e193684dfe1 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1311,7 +1311,6 @@ paddle::framework::FetchList PirInterpreter::Run( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_inference_switch_stream) { BuildInstruction(); - FLAGS_inference_switch_stream = false; } #endif LOG_FIRST_N(INFO, 1) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 9fa7c0791fc58..7c8249b247654 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -169,7 +169,6 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_inference_switch_stream) { BuildOpFuncNode(&op_func_nodes); - FLAGS_inference_switch_stream = false; } #endif RunImpl(); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index fef1237e2af46..1f10dc6b9703e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2390,8 +2390,9 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { pool.SyncDeviceContext(place_); FLAGS_inference_switch_stream = true; } - - return ZeroCopyRun(); + auto run_ret = ZeroCopyRun(); + FLAGS_inference_switch_stream = false; + return run_ret; } #endif From 08b692b9fbbbb494f9f1c589c097058bb1655720 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Tue, 2 Jan 2024 07:38:38 +0000 Subject: [PATCH 5/9] fix bug --- paddle/fluid/framework/new_executor/pir_interpreter.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 13e193684dfe1..f1a09ac7446e6 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1407,7 +1407,6 @@ FetchList PirInterpreter::Run(const std::vector& feed_names, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_inference_switch_stream) { BuildInstruction(); - FLAGS_inference_switch_stream = false; } #endif if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 || From 59b39f2b24482636e5879f64c7bb5bda76bd254a Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Wed, 3 Jan 2024 07:55:39 +0000 Subject: [PATCH 6/9] add param swith_stream --- paddle/fluid/framework/naive_executor.cc | 6 +++-- paddle/fluid/framework/naive_executor.h | 3 ++- .../new_executor/interpreter_base_impl.h | 6 +++-- .../framework/new_executor/interpretercore.cc | 16 +++++++++---- .../framework/new_executor/interpretercore.h | 6 +++-- .../framework/new_executor/pir_interpreter.cc | 23 ++++++++++--------- .../framework/new_executor/pir_interpreter.h | 6 +++-- 
.../new_executor/program_interpreter.cc | 18 +++++++-------- .../new_executor/program_interpreter.h | 9 +++++--- .../fluid/inference/api/analysis_predictor.cc | 15 ++++-------- .../fluid/inference/api/analysis_predictor.h | 3 ++- paddle/fluid/inference/api/paddle_api.h | 3 ++- paddle/phi/core/flags.cc | 14 ----------- 13 files changed, 65 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 3bfacc950325c..14224417ba795 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -72,12 +72,14 @@ void NaiveExecutor::PrepareInterpreterCore( } void NaiveExecutor::RunInterpreterCore( - const std::vector &feed_names, bool need_fetch) { + const std::vector &feed_names, + bool need_fetch, + bool switch_stream) { platform::ScopedFlushDenormal flush; #ifdef PADDLE_WITH_NVTX platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow); #endif - interpreter_core_->Run(feed_names, need_fetch); + interpreter_core_->Run(feed_names, need_fetch, true, false, switch_stream); #ifdef PADDLE_WITH_NVTX platform::CudaNvtxRangePop(); #endif diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index 5a558f3bd6921..8388bfe3a37fc 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -77,7 +77,8 @@ class NaiveExecutor { void Run(); void RunInterpreterCore(const std::vector& feed_names = {}, - bool need_fetch = false); + bool need_fetch = false, + bool switch_stream = false); // Get an tensor to operating directly, without the need for feed_ops. phi::DenseTensor* FindTensor(const std::string& name); diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index ff5832ba8335e..a7a618ac90284 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -68,13 +68,15 @@ class InterpreterBaseImpl { const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch = true, - bool enable_job_schedule_profiler = false) = 0; + bool enable_job_schedule_profiler = false, + bool switch_stream = false) = 0; virtual paddle::framework::FetchList Run( const std::vector& feed_names, bool need_fetch = true, bool enable_job_schedule_profiler = false, - bool enable_op_profiling = false) = 0; + bool enable_op_profiling = false, + bool switch_stream = false) = 0; virtual void ShareWorkQueueFrom(InterpreterBaseImpl* src) = 0; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index b0bbd11aef0db..8fdddb1548d9d 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -67,19 +67,25 @@ FetchList InterpreterCore::Run( const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch, - bool enable_job_schedule_profiler) { - return impl_->Run( - feed_names, feed_tensors, need_fetch, enable_job_schedule_profiler); + bool enable_job_schedule_profiler, + bool switch_stream) { + return impl_->Run(feed_names, + feed_tensors, + need_fetch, + enable_job_schedule_profiler, + switch_stream); } FetchList InterpreterCore::Run(const std::vector& feed_names, bool need_fetch, bool enable_job_schedule_profiler, - bool enable_op_profiling) { + bool enable_op_profiling, + bool switch_stream) { return 
impl_->Run(feed_names, need_fetch, enable_job_schedule_profiler, - enable_op_profiling); + enable_op_profiling, + switch_stream); } void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr src) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index b8c1913d931dc..7731620565fb8 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -49,12 +49,14 @@ class InterpreterCore { const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch = true, - bool enable_job_schedule_profiler = false); + bool enable_job_schedule_profiler = false, + bool switch_stream = false); paddle::framework::FetchList Run(const std::vector& feed_names, bool need_fetch = true, bool enable_job_schedule_profiler = false, - bool enable_op_profiling = false); + bool enable_op_profiling = false, + bool switch_stream = false); void RunProfile(const std::vector& feed_names); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index f1a09ac7446e6..ffbc156721e54 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -77,9 +77,6 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PHI_DECLARE_bool(inference_switch_stream); -#endif PHI_DECLARE_bool(enable_pir_in_executor); PHI_DECLARE_bool(enable_pir_in_executor_trace_run); @@ -1253,7 +1250,8 @@ paddle::framework::FetchList PirInterpreter::Run( const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch, - bool enable_job_schedule_profiler) { + bool enable_job_schedule_profiler, + bool switch_stream) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; auto FeedInput = [&] { @@ -1308,11 +1306,6 @@ paddle::framework::FetchList PirInterpreter::Run( LOG_FIRST_N(INFO, 1) << "pir interpreter is running by trace mode ..."; TraceRunImpl(); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_inference_switch_stream) { - BuildInstruction(); - } -#endif LOG_FIRST_N(INFO, 1) << "pir interpreter is running by multi-thread mode ..."; MultiThreadRunImpl(); @@ -1321,6 +1314,12 @@ paddle::framework::FetchList PirInterpreter::Run( is_build_ = true; is_shared_results_build_ = true; } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (switch_stream) { + BuildInstruction(); + VLOG(4) << "Done BuildInstruction"; + } +#endif if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 || execution_config_.used_for_inference || ((execution_config_.used_for_jit || execution_config_.used_for_cinn) && @@ -1353,7 +1352,8 @@ paddle::framework::FetchList PirInterpreter::Run( FetchList PirInterpreter::Run(const std::vector& feed_names, bool need_fetch, bool enable_job_schedule_profiler, - bool enable_op_profiling) { + bool enable_op_profiling, + bool switch_stream) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; if (enable_op_profiling) { @@ -1405,8 +1405,9 @@ FetchList PirInterpreter::Run(const std::vector& feed_names, is_shared_results_build_ = true; } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_inference_switch_stream) { + if (switch_stream) { BuildInstruction(); + VLOG(4) << "Done BuildInstruction"; } #endif if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 || diff --git 
a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h index 1684aeffef8cf..3f197f53e12f8 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.h +++ b/paddle/fluid/framework/new_executor/pir_interpreter.h @@ -57,12 +57,14 @@ class PirInterpreter : public InterpreterBaseImpl { const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch = true, - bool enable_job_schedule_profiler = false) override; + bool enable_job_schedule_profiler = false, + bool switch_stream = false) override; paddle::framework::FetchList Run(const std::vector& feed_names, bool need_fetch = true, bool enable_job_schedule_profiler = false, - bool enable_op_profiling = false) override; + bool enable_op_profiling = false, + bool switch_stream = false) override; void ShareWorkQueueFrom(InterpreterBaseImpl* src) override; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 7c8249b247654..c87f9e7f530c6 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -41,9 +41,6 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PHI_DECLARE_bool(inference_switch_stream); -#endif PD_DECLARE_bool(enable_host_event_recorder_hook); PD_DECLARE_bool(log_memory_stats); PHI_DECLARE_string(static_runtime_data_save_path); @@ -147,7 +144,8 @@ void ProgramInterpreter::RunImpl() { FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool need_fetch, bool enable_job_schedule_profiler, - bool enable_op_profiling) { + bool enable_op_profiling, + bool switch_stream) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; is_in_op_profiling_mode_ = enable_op_profiling; @@ -167,7 +165,7 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, is_shared_results_build_ = true; } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_inference_switch_stream) { + if (switch_stream) { BuildOpFuncNode(&op_func_nodes); } #endif @@ -241,7 +239,8 @@ FetchList ProgramInterpreter::Run( const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch, - bool enable_job_schedule_profiler) { + bool enable_job_schedule_profiler, + bool switch_stream) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; SetDeviceId(place_); @@ -252,7 +251,7 @@ FetchList ProgramInterpreter::Run( #endif bool is_build = is_build_; - Prepare(feed_names, feed_tensors, is_build); + Prepare(feed_names, feed_tensors, is_build, switch_stream); if (is_build) { RunImpl(); @@ -1512,7 +1511,8 @@ void ProgramInterpreter::CheckGC(const Instruction& instr) { void ProgramInterpreter::Prepare( const std::vector& feed_names, const std::vector& feed_tensors, - bool prepare_feed) { + bool prepare_feed, + bool switch_stream) { PADDLE_ENFORCE_EQ(feed_names.size(), feed_tensors.size(), platform::errors::PreconditionNotMet( @@ -1535,7 +1535,7 @@ void ProgramInterpreter::Prepare( } }; - if (!is_build_) { + if (!is_build_ || switch_stream) { paddle::framework::interpreter::BuildVariableScope( block_, execution_config_, &var_scope_); FeedInput(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 580a292f4fcf5..5359c41fddcdc 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ 
b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -49,12 +49,14 @@ class ProgramInterpreter : public InterpreterBaseImpl { const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch = true, - bool enable_job_schedule_profiler = false) override; + bool enable_job_schedule_profiler = false, + bool switch_stream = false) override; paddle::framework::FetchList Run(const std::vector& feed_names, bool need_fetch = true, bool enable_job_schedule_profiler = false, - bool enable_op_profiling = false) override; + bool enable_op_profiling = false, + bool switch_stream = false) override; std::shared_ptr GetMutableCopyProgram() override; @@ -152,7 +154,8 @@ class ProgramInterpreter : public InterpreterBaseImpl { // only used when program contains no feed op void Prepare(const std::vector& feed_names, const std::vector& feed_tensors, - bool prepare_feed); + bool prepare_feed, + bool switch_stream = false); void RecordMemcpyD2H(const Instruction& instr_node); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 1f10dc6b9703e..2b33e95e26c8a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -124,9 +124,6 @@ PHI_DECLARE_bool(enable_pir_in_executor); PHI_DECLARE_bool(pir_apply_inplace_pass); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PHI_DECLARE_bool(inference_switch_stream); -#endif namespace paddle { namespace { @@ -2254,7 +2251,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( return res; } -bool AnalysisPredictor::ZeroCopyRun() { +bool AnalysisPredictor::ZeroCopyRun(bool switch_stream) { inference::DisplayMemoryInfo(place_, "before run"); #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { @@ -2317,7 +2314,7 @@ bool AnalysisPredictor::ZeroCopyRun() { #endif if (config_.new_executor_enabled()) { - executor_->RunInterpreterCore(); + executor_->RunInterpreterCore({}, false, switch_stream); } else { executor_->Run(); } @@ -2358,7 +2355,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { "Please use config.SetExecStream to init gpu resources, and then we " "will bind gpu resources to execution stream.")); } - + bool switch_stream = false; if (stream != predictor_stream_) { #ifdef PADDLE_WITH_HIP hipStreamSynchronize(static_cast(predictor_stream_)); @@ -2388,11 +2385,9 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { })); auto &pool = paddle::experimental::DeviceContextPool::Instance(); pool.SyncDeviceContext(place_); - FLAGS_inference_switch_stream = true; + switch_stream = true; } - auto run_ret = ZeroCopyRun(); - FLAGS_inference_switch_stream = false; - return run_ret; + return ZeroCopyRun(switch_stream); } #endif diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 4a5cfb229a459..0f2091478af2a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -204,9 +204,10 @@ class AnalysisPredictor : public PaddlePredictor { /// /// \brief Run the prediction engine /// + /// \param switch_stream Whether the stream is switched /// \return Whether the function executed successfully /// - bool ZeroCopyRun() override; + bool ZeroCopyRun(bool switch_stream = false) override; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Note: Can only be used under thread_local semantics. 
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 3fefba9ef22be..89540a91e3789 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -295,8 +295,9 @@ class PD_INFER_DECL PaddlePredictor { /// To use it, one should call the AnalysisConfig.SwitchUseFeedFetchOp(false) /// and then use the `GetInputTensor` and `GetOutputTensor` /// to directly write or read the input/output tensors. + /// \param switch_stream Whether the stream is switched. /// \return Whether the run is successful - virtual bool ZeroCopyRun() { return false; } + virtual bool ZeroCopyRun(bool switch_stream = false) { return false; } /// /// \brief Clear the intermediate tensors of the predictor diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index abb0b6f66e440..ea1af5eee4d0b 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1108,20 +1108,6 @@ PHI_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph, false, "Use CUDA Graph in new executor"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -/* - * Inference switch stream related FLAG - * Name: FLAGS_inference_switch_stream - * Since Version: 2.7 - * Value Range: bool, default=false - * Example: FLAGS_inference_switch_stream=true would switch - * It is possible for this flag to be set to true in RunWithExternalStream API. - */ -PHI_DEFINE_EXPORTED_bool(inference_switch_stream, - false, - "Swich stream when inference"); -#endif - /* * Executor related FLAG * Name: FLAGS_executor_log_deps_every_microseconds From a84ff382059493e98d6e87ab4f2592ccff691119 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Wed, 3 Jan 2024 08:38:53 +0000 Subject: [PATCH 7/9] fix bug --- paddle/fluid/inference/api/onnxruntime_predictor.cc | 2 +- paddle/fluid/inference/api/onnxruntime_predictor.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index 2597044046916..f2d8f7478d902 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -333,7 +333,7 @@ bool ONNXRuntimePredictor::Run(const std::vector &inputs, return false; } -bool ONNXRuntimePredictor::ZeroCopyRun() { +bool ONNXRuntimePredictor::ZeroCopyRun(bool switch_stream) { try { const char *device_name = platform::is_cpu_place(place_) ? "Cpu" : "Cuda"; std::vector inputs; diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index 971632c4b3c7a..c983f8acdae28 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -175,9 +175,10 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// /// \brief Run the prediction engine /// + /// \param switch_stream Whether the stream is switched /// \return Whether the function executed successfully /// - bool ZeroCopyRun() override; + bool ZeroCopyRun(bool switch_stream = false) override; /// /// \brief Release all tmp tensor to compress the size of the memory pool. 
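
To summarize the interpreter-side handling introduced above: when switch_stream is true, ProgramInterpreter rebuilds its op-func nodes (re-parsing each op's device context) and PirInterpreter rebuilds its instruction list, so the next run uses the device contexts bound to the newly installed stream; ProgramInterpreter::Prepare likewise re-runs variable-scope building when switch_stream is set. The branches below are a condensed, non-compilable sketch taken from the diffs above, with surrounding logic elided:

// ProgramInterpreter::Run (program_interpreter.cc), already-built case:
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (switch_stream) {
    // Re-parses each op's device context via stream_analyzer_ and rebuilds
    // vec_instruction_, so instructions point at the new stream's context.
    BuildOpFuncNode(&op_func_nodes);
  }
#endif
  RunImpl();

// PirInterpreter::Run (pir_interpreter.cc), already-built case:
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (switch_stream) {
    BuildInstruction();
    VLOG(4) << "Done BuildInstruction";
  }
#endif
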
From e5ee2250fe3d41decd65f4fa6c2acd2776d28840 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Thu, 4 Jan 2024 07:08:19 +0000 Subject: [PATCH 8/9] modify python api --- paddle/fluid/pybind/inference_api.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 03a95e870b810..94df6a0ee0d41 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -691,7 +691,9 @@ void BindPaddlePredictor(py::module *m) { .def("get_output_tensor", &PaddlePredictor::GetOutputTensor) .def("get_input_names", &PaddlePredictor::GetInputNames) .def("get_output_names", &PaddlePredictor::GetOutputNames) - .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun) + .def("zero_copy_run", + &PaddlePredictor::ZeroCopyRun, + py::arg("switch_stream") = false) .def("clone", [](PaddlePredictor &self) { return self.Clone(nullptr); }) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) .def("clone", @@ -740,7 +742,9 @@ void BindNativePredictor(py::module *m) { }) .def("get_input_tensor", &NativePaddlePredictor::GetInputTensor) .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor) - .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) + .def("zero_copy_run", + &NativePaddlePredictor::ZeroCopyRun, + py::arg("switch_stream") = false) .def("clone", [](NativePaddlePredictor &self) { return self.Clone(nullptr); }) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -1130,7 +1134,9 @@ void BindAnalysisPredictor(py::module *m) { .def("get_input_names", &AnalysisPredictor::GetInputNames) .def("get_output_names", &AnalysisPredictor::GetOutputNames) .def("get_input_tensor_shape", &AnalysisPredictor::GetInputTensorShape) - .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun) + .def("zero_copy_run", + &AnalysisPredictor::ZeroCopyRun, + py::arg("switch_stream") = false) .def("clear_intermediate_tensor", &AnalysisPredictor::ClearIntermediateTensor) .def("try_shrink_memory", &AnalysisPredictor::TryShrinkMemory) From 1672bb81fd94353671ced8e841599c96cfab0d49 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Thu, 4 Jan 2024 11:19:00 +0000 Subject: [PATCH 9/9] fix bug --- paddle/fluid/framework/naive_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 8c6e5d472a168..90f5b93dcb2ef 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -79,7 +79,7 @@ void NaiveExecutor::RunInterpreterCore( #ifdef PADDLE_WITH_NVTX platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow); #endif - interpreter_core_->Run(feed_names, need_fetch, true, false, switch_stream); + interpreter_core_->Run(feed_names, need_fetch, false, false, switch_stream); #ifdef PADDLE_WITH_NVTX platform::CudaNvtxRangePop(); #endif
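
End-to-end, after the final patch the stream switch is threaded as an explicit argument rather than a global flag. The following is a condensed sketch of the resulting call chain, simplified from the diffs above (unrelated arguments, synchronization, and error handling omitted); it is a summary, not a verbatim excerpt.

// AnalysisPredictor::ExpRunWithExternalStream (analysis_predictor.cc):
  bool switch_stream = false;
  if (stream != predictor_stream_) {
    // ... synchronize the old stream, rebuild GPU resources for the new one,
    // and update the device contexts bound to this predictor ...
    switch_stream = true;
  }
  return ZeroCopyRun(switch_stream);

// AnalysisPredictor::ZeroCopyRun(bool switch_stream):
  if (config_.new_executor_enabled()) {
    executor_->RunInterpreterCore({}, /*need_fetch=*/false, switch_stream);
  } else {
    executor_->Run();
  }

// NaiveExecutor::RunInterpreterCore (naive_executor.cc), with the patch-9 fix:
  interpreter_core_->Run(feed_names,
                         need_fetch,
                         /*enable_job_schedule_profiler=*/false,
                         /*enable_op_profiling=*/false,
                         switch_stream);
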