From a4a378dcf1b54e0c2301b21b78e29e876c3d09c1 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Mon, 18 Dec 2023 11:48:36 +0000 Subject: [PATCH 1/9] fix bug of RunWithExternalStream API in new executor --- .../new_executor/program_interpreter.cc | 49 +++++++++++++++++++ .../new_executor/program_interpreter.h | 1 + .../fluid/inference/api/analysis_predictor.cc | 4 ++ paddle/phi/core/flags.cc | 14 ++++++ .../api/analysis_predictor_tester.cc | 1 + 5 files changed, 69 insertions(+) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 3e849670e4699..688a919bcb0fb 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -41,6 +41,9 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PHI_DECLARE_bool(inference_switch_stream); +#endif PD_DECLARE_bool(enable_host_event_recorder_hook); PD_DECLARE_bool(log_memory_stats); PHI_DECLARE_string(static_runtime_data_save_path); @@ -163,6 +166,12 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, is_build_ = true; is_shared_results_build_ = true; } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_inference_switch_stream) { + UpdateDevCtx(&op_func_nodes); + FLAGS_inference_switch_stream = false; + } +#endif RunImpl(); } @@ -879,6 +888,46 @@ void ProgramInterpreter::Convert( AnalyseExecuteOrderForTrace(); } +void ProgramInterpreter::UpdateDevCtx( + std::vector* op_func_nodes) { + auto nodes = *op_func_nodes; + auto op_nums = nodes.size(); + vec_instruction_.clear(); + vec_instruction_.reserve(op_nums); + for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { + auto& op_func_node = nodes[op_idx]; + stream_analyzer_.SetForceEventsToWaitInfo(force_evnets_to_wait_); + auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); +#ifdef PADDLE_WITH_CUDA + if (FLAGS_new_executor_use_cuda_graph) { + auto& op = op_func_node.operator_base_; + auto& op_type = op->Type(); + if (op_type == interpreter::kMemcpyD2H || + op_type == interpreter::kMemcpyH2D) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Cuda memory copy d2h/h2d is not allowed while using cuda graph.")); + } + PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext), + true, + platform::errors::InvalidArgument( + "Device context of op %s must be [%s] while using " + "cuda graph, but got [%s].", + op_type, + typeid(phi::GPUContext).name(), + typeid(*dev_ctx_).name())); + // cuda graph needs to record all stream + phi::backends::gpu::CUDAGraphContextManager::Instance() + .RecordCapturingDeviceContext(dev_ctx_); + } +#endif + vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + vec_instruction_.back().UpdataRecordStreamForGcInfo(); +#endif + } +} + void ProgramInterpreter::BuildSkipShareLoDInfo() { for (size_t i = 0; i < vec_instruction_.size(); ++i) { bool can_skip_lod = true; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index b19e3a06a4258..2a97c7d04e8a8 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -125,6 +125,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { void BuildSkipShareLoDInfo(); void UpdateSyncOpNum(); void AnalyseExecuteOrderForTrace(); + 
void UpdateDevCtx(std::vector* op_func_nodes); // inplace void BuildInplace(); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 86fe675e61aa5..23ca4586d4ae3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -119,6 +119,9 @@ PHI_DECLARE_bool(enable_pir_in_executor); PHI_DECLARE_bool(pir_apply_inplace_pass); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PHI_DECLARE_bool(inference_switch_stream); +#endif namespace paddle { namespace { @@ -2362,6 +2365,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { })); auto &pool = paddle::experimental::DeviceContextPool::Instance(); pool.SyncDeviceContext(place_); + FLAGS_inference_switch_stream = true; } return ZeroCopyRun(); diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index ea1af5eee4d0b..eec49fffda1d3 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1108,6 +1108,20 @@ PHI_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph, false, "Use CUDA Graph in new executor"); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +/* + * Inference switch stream related FLAG + * Name: FLAGS_inference_switch_stream + * Since Version: 2.6 + * Value Range: bool, default=false + * Example: FLAGS_inference_switch_stream=true would switch + * It is possible for this flag to be set to true in RunWithExternalStream API. + */ +PHI_DEFINE_EXPORTED_bool(inference_switch_stream, + false, + "Swich stream when inference"); +#endif + /* * Executor related FLAG * Name: FLAGS_executor_log_deps_every_microseconds diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc index 3d841954a89d6..ee86957b5a100 100644 --- a/test/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/cpp/inference/api/analysis_predictor_tester.cc @@ -705,6 +705,7 @@ TEST(Tensor, RunWithExternalStream) { cudaStreamCreate(&external_stream); Config tmp_config(config); tmp_config.SetExecStream(external_stream); + tmp_config.EnableNewExecutor(); predictor->Run(); paddle_infer::experimental::InternalUtils::RunWithExternalStream( predictor.get(), external_stream); From a944a6586bf4951fe30009774315a9f0e78e026e Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Mon, 25 Dec 2023 05:36:08 +0000 Subject: [PATCH 2/9] add test --- test/cpp/inference/api/analysis_predictor_tester.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc index ee86957b5a100..3d87140d9c05a 100644 --- a/test/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/cpp/inference/api/analysis_predictor_tester.cc @@ -668,6 +668,7 @@ TEST(Tensor, RunWithExternalStream) { cudaStream_t stream; cudaStreamCreate(&stream); config.SetExecStream(stream); + config.EnableNewExecutor(); auto predictor = CreatePredictor(config); auto w0 = predictor->GetInputHandle("firstw"); @@ -703,9 +704,7 @@ TEST(Tensor, RunWithExternalStream) { cudaStream_t external_stream; cudaStreamCreate(&external_stream); - Config tmp_config(config); - tmp_config.SetExecStream(external_stream); - tmp_config.EnableNewExecutor(); + predictor->Run(); paddle_infer::experimental::InternalUtils::RunWithExternalStream( predictor.get(), external_stream); From ce2f9e8825fbb3d7ec2745e173aa291711d2dbf8 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Fri, 29 Dec 2023 07:07:57 +0000 
Subject: [PATCH 3/9] fix bug of RunWithExternalStream API in new executor --- .../framework/new_executor/pir_interpreter.cc | 15 +++++++ .../new_executor/program_interpreter.cc | 42 ++----------------- .../new_executor/program_interpreter.h | 3 +- paddle/phi/core/flags.cc | 2 +- 4 files changed, 22 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 82bf2973345ad..ba4abc2bffacc 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -77,6 +77,9 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PHI_DECLARE_bool(inference_switch_stream); +#endif PHI_DECLARE_bool(enable_pir_in_executor); PHI_DECLARE_bool(enable_pir_in_executor_trace_run); @@ -1305,6 +1308,12 @@ paddle::framework::FetchList PirInterpreter::Run( LOG_FIRST_N(INFO, 1) << "pir interpreter is running by trace mode ..."; TraceRunImpl(); } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_inference_switch_stream) { + BuildInstruction(); + FLAGS_inference_switch_stream = false; + } +#endif LOG_FIRST_N(INFO, 1) << "pir interpreter is running by multi-thread mode ..."; MultiThreadRunImpl(); @@ -1396,6 +1405,12 @@ FetchList PirInterpreter::Run(const std::vector& feed_names, is_build_ = true; is_shared_results_build_ = true; } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_inference_switch_stream) { + BuildInstruction(); + FLAGS_inference_switch_stream = false; + } +#endif if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 || execution_config_.used_for_inference || ((execution_config_.used_for_jit || execution_config_.used_for_cinn) && diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 6fb1fbf710f9e..9fa7c0791fc58 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -168,7 +168,7 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_inference_switch_stream) { - UpdateDevCtx(&op_func_nodes); + BuildOpFuncNode(&op_func_nodes); FLAGS_inference_switch_stream = false; } #endif @@ -680,42 +680,7 @@ std::tuple ProgramInterpreter::InterpreterRunTime() { void ProgramInterpreter::Convert( std::vector* op_func_nodes) { auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); - auto nodes = *op_func_nodes; - auto op_nums = nodes.size(); - vec_instruction_.clear(); - vec_instruction_.reserve(op_nums); - for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { - auto& op_func_node = nodes[op_idx]; - stream_analyzer_.SetForceEventsToWaitInfo(force_evnets_to_wait_); - auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); -#ifdef PADDLE_WITH_CUDA - if (FLAGS_new_executor_use_cuda_graph) { - auto& op = op_func_node.operator_base_; - auto& op_type = op->Type(); - if (op_type == interpreter::kMemcpyD2H || - op_type == interpreter::kMemcpyH2D) { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Cuda memory copy d2h/h2d is not allowed while using cuda graph.")); - } - PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext), - true, - platform::errors::InvalidArgument( - "Device context of op %s must be [%s] while using " - "cuda graph, but got [%s].", - op_type, - 
typeid(phi::GPUContext).name(), - typeid(*dev_ctx_).name())); - // cuda graph needs to record all stream - phi::backends::gpu::CUDAGraphContextManager::Instance() - .RecordCapturingDeviceContext(dev_ctx_); - } -#endif - vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - vec_instruction_.back().UpdataRecordStreamForGcInfo(); -#endif - } + BuildOpFuncNode(op_func_nodes); BuildOperatorDependences(); @@ -752,6 +717,7 @@ void ProgramInterpreter::Convert( } // calculate last_live_ops_ + auto op_nums = (*op_func_nodes).size(); for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { Instruction& instr = vec_instruction_[op_idx]; OpInOutInfo info; @@ -888,7 +854,7 @@ void ProgramInterpreter::Convert( AnalyseExecuteOrderForTrace(); } -void ProgramInterpreter::UpdateDevCtx( +void ProgramInterpreter::BuildOpFuncNode( std::vector* op_func_nodes) { auto nodes = *op_func_nodes; auto op_nums = nodes.size(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 2a97c7d04e8a8..580a292f4fcf5 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -125,7 +125,8 @@ class ProgramInterpreter : public InterpreterBaseImpl { void BuildSkipShareLoDInfo(); void UpdateSyncOpNum(); void AnalyseExecuteOrderForTrace(); - void UpdateDevCtx(std::vector* op_func_nodes); + void BuildOpFuncNode( + std::vector* op_func_nodes); // inplace void BuildInplace(); diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index eec49fffda1d3..abb0b6f66e440 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1112,7 +1112,7 @@ PHI_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph, /* * Inference switch stream related FLAG * Name: FLAGS_inference_switch_stream - * Since Version: 2.6 + * Since Version: 2.7 * Value Range: bool, default=false * Example: FLAGS_inference_switch_stream=true would switch * It is possible for this flag to be set to true in RunWithExternalStream API. 
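
For reference, the usage exercised by the updated analysis_predictor_tester.cc above boils down to the following condensed sketch. It is based on the test changes in patches 1-2 and is not a verbatim excerpt: the model path and input feeding are omitted/hypothetical and the include path is approximate. config.EnableNewExecutor() is the key addition, since the fix targets the new executor path.

#include <string>
#include <cuda_runtime.h>
#include "paddle_inference_api.h"  // include path depends on the install layout

void RunOnExternalStream(const std::string& model_dir) {
  paddle_infer::Config config;
  config.SetModel(model_dir);        // hypothetical model location
  config.EnableUseGpu(100, 0);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  config.SetExecStream(stream);      // bind the initial execution stream
  config.EnableNewExecutor();        // required: the fix targets the new executor

  auto predictor = paddle_infer::CreatePredictor(config);
  // ... feed inputs via predictor->GetInputHandle(name) ...

  predictor->Run();                  // first run builds on the original stream

  cudaStream_t external_stream;
  cudaStreamCreate(&external_stream);
  // Passing a different stream triggers the switch-stream path, which rebuilds
  // the instructions/device contexts before this run.
  paddle_infer::experimental::InternalUtils::RunWithExternalStream(
      predictor.get(), external_stream);
}
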
From f61d24a7e1b2fb184274a733fbd4c49da8e2f485 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Tue, 2 Jan 2024 03:32:07 +0000 Subject: [PATCH 4/9] reset flage in RunWithExternalStream --- paddle/fluid/framework/new_executor/pir_interpreter.cc | 1 - paddle/fluid/framework/new_executor/program_interpreter.cc | 1 - paddle/fluid/inference/api/analysis_predictor.cc | 5 +++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index ba4abc2bffacc..13e193684dfe1 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1311,7 +1311,6 @@ paddle::framework::FetchList PirInterpreter::Run( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_inference_switch_stream) { BuildInstruction(); - FLAGS_inference_switch_stream = false; } #endif LOG_FIRST_N(INFO, 1) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 9fa7c0791fc58..7c8249b247654 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -169,7 +169,6 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_inference_switch_stream) { BuildOpFuncNode(&op_func_nodes); - FLAGS_inference_switch_stream = false; } #endif RunImpl(); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index fef1237e2af46..1f10dc6b9703e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2390,8 +2390,9 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { pool.SyncDeviceContext(place_); FLAGS_inference_switch_stream = true; } - - return ZeroCopyRun(); + auto run_ret = ZeroCopyRun(); + FLAGS_inference_switch_stream = false; + return run_ret; } #endif From 08b692b9fbbbb494f9f1c589c097058bb1655720 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Tue, 2 Jan 2024 07:38:38 +0000 Subject: [PATCH 5/9] fix bug --- paddle/fluid/framework/new_executor/pir_interpreter.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 13e193684dfe1..f1a09ac7446e6 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1407,7 +1407,6 @@ FetchList PirInterpreter::Run(const std::vector& feed_names, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_inference_switch_stream) { BuildInstruction(); - FLAGS_inference_switch_stream = false; } #endif if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 || From 59b39f2b24482636e5879f64c7bb5bda76bd254a Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Wed, 3 Jan 2024 07:55:39 +0000 Subject: [PATCH 6/9] add param swith_stream --- paddle/fluid/framework/naive_executor.cc | 6 +++-- paddle/fluid/framework/naive_executor.h | 3 ++- .../new_executor/interpreter_base_impl.h | 6 +++-- .../framework/new_executor/interpretercore.cc | 16 +++++++++---- .../framework/new_executor/interpretercore.h | 6 +++-- .../framework/new_executor/pir_interpreter.cc | 23 ++++++++++--------- .../framework/new_executor/pir_interpreter.h | 6 +++-- 
.../new_executor/program_interpreter.cc | 18 +++++++-------- .../new_executor/program_interpreter.h | 9 +++++--- .../fluid/inference/api/analysis_predictor.cc | 15 ++++-------- .../fluid/inference/api/analysis_predictor.h | 3 ++- paddle/fluid/inference/api/paddle_api.h | 3 ++- paddle/phi/core/flags.cc | 14 ----------- 13 files changed, 65 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 3bfacc950325c..14224417ba795 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -72,12 +72,14 @@ void NaiveExecutor::PrepareInterpreterCore( } void NaiveExecutor::RunInterpreterCore( - const std::vector &feed_names, bool need_fetch) { + const std::vector &feed_names, + bool need_fetch, + bool switch_stream) { platform::ScopedFlushDenormal flush; #ifdef PADDLE_WITH_NVTX platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow); #endif - interpreter_core_->Run(feed_names, need_fetch); + interpreter_core_->Run(feed_names, need_fetch, true, false, switch_stream); #ifdef PADDLE_WITH_NVTX platform::CudaNvtxRangePop(); #endif diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index 5a558f3bd6921..8388bfe3a37fc 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -77,7 +77,8 @@ class NaiveExecutor { void Run(); void RunInterpreterCore(const std::vector& feed_names = {}, - bool need_fetch = false); + bool need_fetch = false, + bool switch_stream = false); // Get an tensor to operating directly, without the need for feed_ops. phi::DenseTensor* FindTensor(const std::string& name); diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index ff5832ba8335e..a7a618ac90284 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -68,13 +68,15 @@ class InterpreterBaseImpl { const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch = true, - bool enable_job_schedule_profiler = false) = 0; + bool enable_job_schedule_profiler = false, + bool switch_stream = false) = 0; virtual paddle::framework::FetchList Run( const std::vector& feed_names, bool need_fetch = true, bool enable_job_schedule_profiler = false, - bool enable_op_profiling = false) = 0; + bool enable_op_profiling = false, + bool switch_stream = false) = 0; virtual void ShareWorkQueueFrom(InterpreterBaseImpl* src) = 0; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index b0bbd11aef0db..8fdddb1548d9d 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -67,19 +67,25 @@ FetchList InterpreterCore::Run( const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch, - bool enable_job_schedule_profiler) { - return impl_->Run( - feed_names, feed_tensors, need_fetch, enable_job_schedule_profiler); + bool enable_job_schedule_profiler, + bool switch_stream) { + return impl_->Run(feed_names, + feed_tensors, + need_fetch, + enable_job_schedule_profiler, + switch_stream); } FetchList InterpreterCore::Run(const std::vector& feed_names, bool need_fetch, bool enable_job_schedule_profiler, - bool enable_op_profiling) { + bool enable_op_profiling, + bool switch_stream) { return 
impl_->Run(feed_names, need_fetch, enable_job_schedule_profiler, - enable_op_profiling); + enable_op_profiling, + switch_stream); } void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr src) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index b8c1913d931dc..7731620565fb8 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -49,12 +49,14 @@ class InterpreterCore { const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch = true, - bool enable_job_schedule_profiler = false); + bool enable_job_schedule_profiler = false, + bool switch_stream = false); paddle::framework::FetchList Run(const std::vector& feed_names, bool need_fetch = true, bool enable_job_schedule_profiler = false, - bool enable_op_profiling = false); + bool enable_op_profiling = false, + bool switch_stream = false); void RunProfile(const std::vector& feed_names); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index f1a09ac7446e6..ffbc156721e54 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -77,9 +77,6 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PHI_DECLARE_bool(inference_switch_stream); -#endif PHI_DECLARE_bool(enable_pir_in_executor); PHI_DECLARE_bool(enable_pir_in_executor_trace_run); @@ -1253,7 +1250,8 @@ paddle::framework::FetchList PirInterpreter::Run( const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch, - bool enable_job_schedule_profiler) { + bool enable_job_schedule_profiler, + bool switch_stream) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; auto FeedInput = [&] { @@ -1308,11 +1306,6 @@ paddle::framework::FetchList PirInterpreter::Run( LOG_FIRST_N(INFO, 1) << "pir interpreter is running by trace mode ..."; TraceRunImpl(); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_inference_switch_stream) { - BuildInstruction(); - } -#endif LOG_FIRST_N(INFO, 1) << "pir interpreter is running by multi-thread mode ..."; MultiThreadRunImpl(); @@ -1321,6 +1314,12 @@ paddle::framework::FetchList PirInterpreter::Run( is_build_ = true; is_shared_results_build_ = true; } else { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (switch_stream) { + BuildInstruction(); + VLOG(4) << "Done BuildInstruction"; + } +#endif if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 || execution_config_.used_for_inference || ((execution_config_.used_for_jit || execution_config_.used_for_cinn) && @@ -1353,7 +1352,8 @@ paddle::framework::FetchList PirInterpreter::Run( FetchList PirInterpreter::Run(const std::vector& feed_names, bool need_fetch, bool enable_job_schedule_profiler, - bool enable_op_profiling) { + bool enable_op_profiling, + bool switch_stream) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; if (enable_op_profiling) { @@ -1405,8 +1405,9 @@ FetchList PirInterpreter::Run(const std::vector& feed_names, is_shared_results_build_ = true; } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_inference_switch_stream) { + if (switch_stream) { BuildInstruction(); + VLOG(4) << "Done BuildInstruction"; } #endif if (FLAGS_enable_pir_in_executor_trace_run || nccl_op_num_ > 1 || diff --git 
a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h index 1684aeffef8cf..3f197f53e12f8 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.h +++ b/paddle/fluid/framework/new_executor/pir_interpreter.h @@ -57,12 +57,14 @@ class PirInterpreter : public InterpreterBaseImpl { const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch = true, - bool enable_job_schedule_profiler = false) override; + bool enable_job_schedule_profiler = false, + bool switch_stream = false) override; paddle::framework::FetchList Run(const std::vector& feed_names, bool need_fetch = true, bool enable_job_schedule_profiler = false, - bool enable_op_profiling = false) override; + bool enable_op_profiling = false, + bool switch_stream = false) override; void ShareWorkQueueFrom(InterpreterBaseImpl* src) override; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 7c8249b247654..c87f9e7f530c6 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -41,9 +41,6 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PHI_DECLARE_bool(inference_switch_stream); -#endif PD_DECLARE_bool(enable_host_event_recorder_hook); PD_DECLARE_bool(log_memory_stats); PHI_DECLARE_string(static_runtime_data_save_path); @@ -147,7 +144,8 @@ void ProgramInterpreter::RunImpl() { FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool need_fetch, bool enable_job_schedule_profiler, - bool enable_op_profiling) { + bool enable_op_profiling, + bool switch_stream) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; is_in_op_profiling_mode_ = enable_op_profiling; @@ -167,7 +165,7 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, is_shared_results_build_ = true; } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (FLAGS_inference_switch_stream) { + if (switch_stream) { BuildOpFuncNode(&op_func_nodes); } #endif @@ -241,7 +239,8 @@ FetchList ProgramInterpreter::Run( const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch, - bool enable_job_schedule_profiler) { + bool enable_job_schedule_profiler, + bool switch_stream) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; SetDeviceId(place_); @@ -252,7 +251,7 @@ FetchList ProgramInterpreter::Run( #endif bool is_build = is_build_; - Prepare(feed_names, feed_tensors, is_build); + Prepare(feed_names, feed_tensors, is_build, switch_stream); if (is_build) { RunImpl(); @@ -1512,7 +1511,8 @@ void ProgramInterpreter::CheckGC(const Instruction& instr) { void ProgramInterpreter::Prepare( const std::vector& feed_names, const std::vector& feed_tensors, - bool prepare_feed) { + bool prepare_feed, + bool switch_stream) { PADDLE_ENFORCE_EQ(feed_names.size(), feed_tensors.size(), platform::errors::PreconditionNotMet( @@ -1535,7 +1535,7 @@ void ProgramInterpreter::Prepare( } }; - if (!is_build_) { + if (!is_build_ || switch_stream) { paddle::framework::interpreter::BuildVariableScope( block_, execution_config_, &var_scope_); FeedInput(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 580a292f4fcf5..5359c41fddcdc 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ 
b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -49,12 +49,14 @@ class ProgramInterpreter : public InterpreterBaseImpl { const std::vector& feed_names, const std::vector& feed_tensors, bool need_fetch = true, - bool enable_job_schedule_profiler = false) override; + bool enable_job_schedule_profiler = false, + bool switch_stream = false) override; paddle::framework::FetchList Run(const std::vector& feed_names, bool need_fetch = true, bool enable_job_schedule_profiler = false, - bool enable_op_profiling = false) override; + bool enable_op_profiling = false, + bool switch_stream = false) override; std::shared_ptr GetMutableCopyProgram() override; @@ -152,7 +154,8 @@ class ProgramInterpreter : public InterpreterBaseImpl { // only used when program contains no feed op void Prepare(const std::vector& feed_names, const std::vector& feed_tensors, - bool prepare_feed); + bool prepare_feed, + bool switch_stream = false); void RecordMemcpyD2H(const Instruction& instr_node); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 1f10dc6b9703e..2b33e95e26c8a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -124,9 +124,6 @@ PHI_DECLARE_bool(enable_pir_in_executor); PHI_DECLARE_bool(pir_apply_inplace_pass); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PHI_DECLARE_bool(inference_switch_stream); -#endif namespace paddle { namespace { @@ -2254,7 +2251,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( return res; } -bool AnalysisPredictor::ZeroCopyRun() { +bool AnalysisPredictor::ZeroCopyRun(bool switch_stream) { inference::DisplayMemoryInfo(place_, "before run"); #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { @@ -2317,7 +2314,7 @@ bool AnalysisPredictor::ZeroCopyRun() { #endif if (config_.new_executor_enabled()) { - executor_->RunInterpreterCore(); + executor_->RunInterpreterCore({}, false, switch_stream); } else { executor_->Run(); } @@ -2358,7 +2355,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { "Please use config.SetExecStream to init gpu resources, and then we " "will bind gpu resources to execution stream.")); } - + bool switch_stream = false; if (stream != predictor_stream_) { #ifdef PADDLE_WITH_HIP hipStreamSynchronize(static_cast(predictor_stream_)); @@ -2388,11 +2385,9 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { })); auto &pool = paddle::experimental::DeviceContextPool::Instance(); pool.SyncDeviceContext(place_); - FLAGS_inference_switch_stream = true; + switch_stream = true; } - auto run_ret = ZeroCopyRun(); - FLAGS_inference_switch_stream = false; - return run_ret; + return ZeroCopyRun(switch_stream); } #endif diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 4a5cfb229a459..0f2091478af2a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -204,9 +204,10 @@ class AnalysisPredictor : public PaddlePredictor { /// /// \brief Run the prediction engine /// + /// \param switch_stream Whether the stream is switched /// \return Whether the function executed successfully /// - bool ZeroCopyRun() override; + bool ZeroCopyRun(bool switch_stream = false) override; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Note: Can only be used under thread_local semantics. 
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 3fefba9ef22be..89540a91e3789 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -295,8 +295,9 @@ class PD_INFER_DECL PaddlePredictor { /// To use it, one should call the AnalysisConfig.SwitchUseFeedFetchOp(false) /// and then use the `GetInputTensor` and `GetOutputTensor` /// to directly write or read the input/output tensors. + /// \param switch_stream Whether the stream is switched. /// \return Whether the run is successful - virtual bool ZeroCopyRun() { return false; } + virtual bool ZeroCopyRun(bool switch_stream = false) { return false; } /// /// \brief Clear the intermediate tensors of the predictor diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index abb0b6f66e440..ea1af5eee4d0b 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1108,20 +1108,6 @@ PHI_DEFINE_EXPORTED_bool(new_executor_use_cuda_graph, false, "Use CUDA Graph in new executor"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -/* - * Inference switch stream related FLAG - * Name: FLAGS_inference_switch_stream - * Since Version: 2.7 - * Value Range: bool, default=false - * Example: FLAGS_inference_switch_stream=true would switch - * It is possible for this flag to be set to true in RunWithExternalStream API. - */ -PHI_DEFINE_EXPORTED_bool(inference_switch_stream, - false, - "Swich stream when inference"); -#endif - /* * Executor related FLAG * Name: FLAGS_executor_log_deps_every_microseconds From a84ff382059493e98d6e87ab4f2592ccff691119 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Wed, 3 Jan 2024 08:38:53 +0000 Subject: [PATCH 7/9] fix bug --- paddle/fluid/inference/api/onnxruntime_predictor.cc | 2 +- paddle/fluid/inference/api/onnxruntime_predictor.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index 2597044046916..f2d8f7478d902 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -333,7 +333,7 @@ bool ONNXRuntimePredictor::Run(const std::vector &inputs, return false; } -bool ONNXRuntimePredictor::ZeroCopyRun() { +bool ONNXRuntimePredictor::ZeroCopyRun(bool switch_stream) { try { const char *device_name = platform::is_cpu_place(place_) ? "Cpu" : "Cuda"; std::vector inputs; diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index 971632c4b3c7a..c983f8acdae28 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -175,9 +175,10 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// /// \brief Run the prediction engine /// + /// \param switch_stream Whether the stream is switched /// \return Whether the function executed successfully /// - bool ZeroCopyRun() override; + bool ZeroCopyRun(bool switch_stream = false) override; /// /// \brief Release all tmp tensor to compress the size of the memory pool. 
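
To summarize the interpreter-side handling introduced above: when switch_stream is true, ProgramInterpreter rebuilds its op-func nodes (re-parsing each op's device context) and PirInterpreter rebuilds its instruction list, so the next run uses the device contexts bound to the newly installed stream; ProgramInterpreter::Prepare likewise re-runs variable-scope building when switch_stream is set. The branches below are a condensed, non-compilable sketch taken from the diffs above, with surrounding logic elided:

// ProgramInterpreter::Run (program_interpreter.cc), already-built case:
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (switch_stream) {
    // Re-parses each op's device context via stream_analyzer_ and rebuilds
    // vec_instruction_, so instructions point at the new stream's context.
    BuildOpFuncNode(&op_func_nodes);
  }
#endif
  RunImpl();

// PirInterpreter::Run (pir_interpreter.cc), already-built case:
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (switch_stream) {
    BuildInstruction();
    VLOG(4) << "Done BuildInstruction";
  }
#endif
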
From e5ee2250fe3d41decd65f4fa6c2acd2776d28840 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Thu, 4 Jan 2024 07:08:19 +0000 Subject: [PATCH 8/9] modify python api --- paddle/fluid/pybind/inference_api.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 03a95e870b810..94df6a0ee0d41 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -691,7 +691,9 @@ void BindPaddlePredictor(py::module *m) { .def("get_output_tensor", &PaddlePredictor::GetOutputTensor) .def("get_input_names", &PaddlePredictor::GetInputNames) .def("get_output_names", &PaddlePredictor::GetOutputNames) - .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun) + .def("zero_copy_run", + &PaddlePredictor::ZeroCopyRun, + py::arg("switch_stream") = false) .def("clone", [](PaddlePredictor &self) { return self.Clone(nullptr); }) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) .def("clone", @@ -740,7 +742,9 @@ void BindNativePredictor(py::module *m) { }) .def("get_input_tensor", &NativePaddlePredictor::GetInputTensor) .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor) - .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) + .def("zero_copy_run", + &NativePaddlePredictor::ZeroCopyRun, + py::arg("switch_stream") = false) .def("clone", [](NativePaddlePredictor &self) { return self.Clone(nullptr); }) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -1130,7 +1134,9 @@ void BindAnalysisPredictor(py::module *m) { .def("get_input_names", &AnalysisPredictor::GetInputNames) .def("get_output_names", &AnalysisPredictor::GetOutputNames) .def("get_input_tensor_shape", &AnalysisPredictor::GetInputTensorShape) - .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun) + .def("zero_copy_run", + &AnalysisPredictor::ZeroCopyRun, + py::arg("switch_stream") = false) .def("clear_intermediate_tensor", &AnalysisPredictor::ClearIntermediateTensor) .def("try_shrink_memory", &AnalysisPredictor::TryShrinkMemory) From 1672bb81fd94353671ced8e841599c96cfab0d49 Mon Sep 17 00:00:00 2001 From: minghaipeng Date: Thu, 4 Jan 2024 11:19:00 +0000 Subject: [PATCH 9/9] fix bug --- paddle/fluid/framework/naive_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 8c6e5d472a168..90f5b93dcb2ef 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -79,7 +79,7 @@ void NaiveExecutor::RunInterpreterCore( #ifdef PADDLE_WITH_NVTX platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow); #endif - interpreter_core_->Run(feed_names, need_fetch, true, false, switch_stream); + interpreter_core_->Run(feed_names, need_fetch, false, false, switch_stream); #ifdef PADDLE_WITH_NVTX platform::CudaNvtxRangePop(); #endif
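
End-to-end, after the final patch the stream switch is threaded as an explicit argument rather than a global flag. The following is a condensed sketch of the resulting call chain, simplified from the diffs above (unrelated arguments, synchronization, and error handling omitted); it is a summary, not a verbatim excerpt.

// AnalysisPredictor::ExpRunWithExternalStream (analysis_predictor.cc):
  bool switch_stream = false;
  if (stream != predictor_stream_) {
    // ... synchronize the old stream, rebuild GPU resources for the new one,
    // and update the device contexts bound to this predictor ...
    switch_stream = true;
  }
  return ZeroCopyRun(switch_stream);

// AnalysisPredictor::ZeroCopyRun(bool switch_stream):
  if (config_.new_executor_enabled()) {
    executor_->RunInterpreterCore({}, /*need_fetch=*/false, switch_stream);
  } else {
    executor_->Run();
  }

// NaiveExecutor::RunInterpreterCore (naive_executor.cc), with the patch-9 fix:
  interpreter_core_->Run(feed_names,
                         need_fetch,
                         /*enable_job_schedule_profiler=*/false,
                         /*enable_op_profiling=*/false,
                         switch_stream);
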