Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix bug of RunWithExternalStream API in new executor #60122

Merged
merged 11 commits into from
Jan 5, 2024
49 changes: 49 additions & 0 deletions paddle/fluid/framework/new_executor/program_interpreter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
PHI_DECLARE_bool(dynamic_static_unified_comm);
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PHI_DECLARE_bool(inference_switch_stream);
#endif
PD_DECLARE_bool(enable_host_event_recorder_hook);
PD_DECLARE_bool(log_memory_stats);
PHI_DECLARE_string(static_runtime_data_save_path);
Expand Down Expand Up @@ -163,6 +166,12 @@ FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,
is_build_ = true;
is_shared_results_build_ = true;
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_inference_switch_stream) {
UpdateDevCtx(&op_func_nodes);
FLAGS_inference_switch_stream = false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个FLAGS在analysis_predictor中被设置,建议在相同的模块里重置,不要把设置和重置的操作隔离在两个完全不同的地方。另外,更建议直接在run接口新增参数开关传递是否需要重新构造ctx的信息,FLAGS的灵活性太高了。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

已修改,现弃用FLAG,run接口新增switch_stream参数传递是否需要重新构造ctx的信息。

}
#endif
RunImpl();
}

Expand Down Expand Up @@ -879,6 +888,46 @@ void ProgramInterpreter::Convert(
AnalyseExecuteOrderForTrace();
}

void ProgramInterpreter::UpdateDevCtx(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. 看代码无法直接理解重新获取dev_ctx的目的以及在什么场景下需要这么做,建议在代码中添加注释说明。
  2. UpdateDevCtx命名无法体现这个函数所实际执行的操作,需要通过细看代码才能知道这个调用到底在做什么。这个函数里代码做的事情是把op_func_nodes清空,重新构建一遍,建议考虑build_op_func_noderebuild_op_func_node等更贴切的命名。
  3. 这份函数里的代码大部分是直接对Convert函数里的相关代码重复拷贝了一份,建议考虑代码复用,而不是直接大段拷贝。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

已修改

std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
auto nodes = *op_func_nodes;
auto op_nums = nodes.size();
vec_instruction_.clear();
vec_instruction_.reserve(op_nums);
for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
auto& op_func_node = nodes[op_idx];
stream_analyzer_.SetForceEventsToWaitInfo(force_evnets_to_wait_);
auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
#ifdef PADDLE_WITH_CUDA
if (FLAGS_new_executor_use_cuda_graph) {
auto& op = op_func_node.operator_base_;
auto& op_type = op->Type();
if (op_type == interpreter::kMemcpyD2H ||
op_type == interpreter::kMemcpyH2D) {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Cuda memory copy d2h/h2d is not allowed while using cuda graph."));
}
PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext),
true,
platform::errors::InvalidArgument(
"Device context of op %s must be [%s] while using "
"cuda graph, but got [%s].",
op_type,
typeid(phi::GPUContext).name(),
typeid(*dev_ctx_).name()));
// cuda graph needs to record all stream
phi::backends::gpu::CUDAGraphContextManager::Instance()
.RecordCapturingDeviceContext(dev_ctx_);
}
#endif
vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
vec_instruction_.back().UpdataRecordStreamForGcInfo();
#endif
}
}

void ProgramInterpreter::BuildSkipShareLoDInfo() {
for (size_t i = 0; i < vec_instruction_.size(); ++i) {
bool can_skip_lod = true;
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/framework/new_executor/program_interpreter.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ class ProgramInterpreter : public InterpreterBaseImpl {
void BuildSkipShareLoDInfo();
void UpdateSyncOpNum();
void AnalyseExecuteOrderForTrace();
void UpdateDevCtx(std::vector<paddle::framework::OpFuncNode>* op_func_nodes);

// inplace
void BuildInplace();
Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@

PHI_DECLARE_bool(enable_pir_in_executor);
PHI_DECLARE_bool(pir_apply_inplace_pass);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PHI_DECLARE_bool(inference_switch_stream);
#endif

namespace paddle {
namespace {
Expand Down Expand Up @@ -2362,6 +2365,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
}));
auto &pool = paddle::experimental::DeviceContextPool::Instance();
pool.SyncDeviceContext(place_);
FLAGS_inference_switch_stream = true;
}

return ZeroCopyRun();
Expand Down
14 changes: 14 additions & 0 deletions paddle/phi/core/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1108,6 +1108,20 @@ PHI_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph,
false,
"Use CUDA Graph in new executor");

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/*
* Inference switch stream related FLAG
* Name: FLAGS_inference_switch_stream
* Since Version: 2.6
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

现在新增的代码应该是在2.7版本发布了吧

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

已修改

* Value Range: bool, default=false
* Example: FLAGS_inference_switch_stream=true would switch
* It is possible for this flag to be set to true in RunWithExternalStream API.
*/
// Fix typo in the user-visible flag help string ("Swich" -> "Switch").
PHI_DEFINE_EXPORTED_bool(inference_switch_stream,
                         false,
                         "Switch stream when inference");
#endif

/*
* Executor related FLAG
* Name: FLAGS_executor_log_deps_every_microseconds
Expand Down
4 changes: 2 additions & 2 deletions test/cpp/inference/api/analysis_predictor_tester.cc
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,7 @@ TEST(Tensor, RunWithExternalStream) {
cudaStream_t stream;
cudaStreamCreate(&stream);
config.SetExecStream(stream);
config.EnableNewExecutor();
auto predictor = CreatePredictor(config);

auto w0 = predictor->GetInputHandle("firstw");
Expand Down Expand Up @@ -703,8 +704,7 @@ TEST(Tensor, RunWithExternalStream) {

cudaStream_t external_stream;
cudaStreamCreate(&external_stream);
Config tmp_config(config);
tmp_config.SetExecStream(external_stream);

predictor->Run();
paddle_infer::experimental::InternalUtils::RunWithExternalStream(
predictor.get(), external_stream);
Expand Down