From 72553a2c89c2be1276be248a92712b10227466a5 Mon Sep 17 00:00:00 2001
From: DannyIsFunny <912790387@qq.com>
Date: Mon, 6 Jun 2022 07:25:20 +0000
Subject: [PATCH 1/8] Set need_prepare_phi_data_ in BuildPhiKernelContext

---
 paddle/fluid/framework/operator.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 7dc885f54ab6c..ec0acf2a63340 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1535,7 +1535,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
             new CacheImpl(new phi::KernelContext(),
                           new RuntimeInferShapeContext(*this, *runtime_ctx));
         BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext());
-
         (*pt_kernel_)(impl_->getKernelContext());
       } else {
         phi::KernelContext pt_kernel_context;
@@ -2398,7 +2397,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
     // calcute the start and end index of the input tensors
     size_t start_idx =
         (i == 0 ? 0 : pt_kernel_context->InputRangeAt(i - 1).second);
-
     // deal with optional here
     if ((it == ctx.inputs.end() || it->second.size() == 0) &&
         (input_defs[i].type_index ==
@@ -2412,6 +2410,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
       auto end_idx = start_idx + 1;
       pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx),
                                           i);
+
       continue;
     }
     auto ins_vector = it->second;
@@ -2426,6 +2425,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
         tensor_in = &(var->Get<phi::SelectedRows>());
         pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
       } else if (var->IsType<framework::LoDTensorArray>()) {
+        need_prepare_phi_data_ = true;
         paddle::small_vector<const phi::TensorBase*> tensor_vector;
         auto& tensor_array = var->Get<framework::LoDTensorArray>();
         for (auto& t : tensor_array) {
@@ -2526,6 +2526,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
                   attr_names[i]));
           }
         } else {  // scalar is in the input
+          need_prepare_phi_data_ = true;
           auto& ins_vector = ctx.inputs.at(attr_names[i]);
           pt_kernel_context->EmplaceBackAttr(std::move(
               experimental::MakePhiScalarFromVar(*ins_vector.front())));
@@ -2557,6 +2558,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
                   attr_names[i]));
           }
         } else {  // shape is in the input
+          need_prepare_phi_data_ = true;
           auto& ins_vector = ctx.inputs.at(attr_names[i]);
           if (ins_vector.size() == 1) {  // ShapeTensor
             pt_kernel_context->EmplaceBackAttr(std::move(

From c49e2672f4954382128cf99448b4aed71e87ec24 Mon Sep 17 00:00:00 2001
From: DannyIsFunny <912790387@qq.com>
Date: Wed, 22 Jun 2022 09:22:44 +0000
Subject: [PATCH 2/8] Capture and replay conv2d ops with CUDA graph in NaiveExecutor::Run

---
 paddle/fluid/framework/naive_executor.cc | 30 ++++++++++++++++++++----
 paddle/fluid/framework/naive_executor.h  |  2 ++
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 1c2740c2b2ee7..ef50a078825a0 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -46,11 +46,33 @@ void NaiveExecutor::Run() {
   platform::RegisterModelLayout(ops_, place_);
 #endif
   platform::ScopedFlushDenormal flush;
+  platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
+  auto stream = ctx->stream();
+
   for (auto &op : ops_) {
-    VLOG(4) << std::this_thread::get_id() << " run "
-            << op->DebugStringEx(scope_) << " on scope " << scope_;
-    op->SetIsCalledByExecutor(false);
-    op->Run(*scope_, place_);
+    if(std::count(graphed_ops.begin(), graphed_ops.end(), op->Type())) {
+      if(graph_instances_.count(op.get())){
+         cudaGraphLaunch(graph_instances_[op.get()], stream);
+         cudaStreamSynchronize(stream);
+      } else {   
+        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+        VLOG(4) << std::this_thread::get_id() << " run "
+                << op->DebugStringEx(scope_) << " on scope " << scope_;
+        op->SetIsCalledByExecutor(false);
+        op->Run(*scope_, place_);
+        cudaGraph_t graph_;
+        cudaGraphExec_t instance_;
+        cudaStreamEndCapture(stream, &graph_);
+        cudaGraphInstantiate(&instance_, graph_, NULL, NULL, 0);
+        graph_instances_[op.get()] = instance_;
+      }
+    } else {   
+      VLOG(4) << std::this_thread::get_id() << " run "
+              << op->DebugStringEx(scope_) << " on scope " << scope_;
+      op->SetIsCalledByExecutor(false);
+      op->Run(*scope_, place_);
+    }
   }
 }
 
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 498ad1d058827..2756208c530f0 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -76,6 +76,8 @@ class NaiveExecutor {
   const platform::Place place_;
   // Catch the required resource to avoid recreate.
   std::vector<std::unique_ptr<OperatorBase>> ops_;
+  std::map<OperatorBase*, cudaGraphExec_t> graph_instances_;
+  std::vector<std::string> graphed_ops {"conv2d"};
   Scope* scope_;
 };
 

From 14a73886b1d9cca4e63289807743b15858bc4eb0 Mon Sep 17 00:00:00 2001
From: DannyIsFunny <912790387@qq.com>
Date: Thu, 23 Jun 2022 03:16:19 +0000
Subject: [PATCH 3/8] Revert need_prepare_phi_data_ assignments in BuildPhiKernelContext

---
 paddle/fluid/framework/operator.cc | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index dbf6bec676c90..8a73557d78483 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -2398,6 +2398,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
     // calcute the start and end index of the input tensors
     size_t start_idx =
         (i == 0 ? 0 : pt_kernel_context->InputRangeAt(i - 1).second);
+
     // deal with optional here
     if ((it == ctx.inputs.end() || it->second.size() == 0) &&
         (input_defs[i].type_index ==
@@ -2411,7 +2412,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
       auto end_idx = start_idx + 1;
       pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx),
                                           i);
-
       continue;
     }
     auto ins_vector = it->second;
@@ -2426,7 +2426,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
         tensor_in = &(var->Get<phi::SelectedRows>());
         pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in);
       } else if (var->IsType<framework::LoDTensorArray>()) {
-        need_prepare_phi_data_ = true;
         paddle::small_vector<const phi::TensorBase*> tensor_vector;
         auto& tensor_array = var->Get<framework::LoDTensorArray>();
         for (auto& t : tensor_array) {
@@ -2527,7 +2526,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
                   attr_names[i]));
           }
         } else {  // scalar is in the input
-          need_prepare_phi_data_ = true;
           auto& ins_vector = ctx.inputs.at(attr_names[i]);
           pt_kernel_context->EmplaceBackAttr(std::move(
               experimental::MakePhiScalarFromVar(*ins_vector.front())));
@@ -2559,7 +2557,6 @@ void OperatorWithKernel::BuildPhiKernelContext(
                   attr_names[i]));
           }
         } else {  // shape is in the input
-          need_prepare_phi_data_ = true;
           auto& ins_vector = ctx.inputs.at(attr_names[i]);
           if (ins_vector.size() == 1) {  // ShapeTensor
             pt_kernel_context->EmplaceBackAttr(std::move(

From 53cf34c89aa31d8027a16c334dff2afa07404053 Mon Sep 17 00:00:00 2001
From: DannyIsFunny <912790387@qq.com>
Date: Thu, 23 Jun 2022 03:38:34 +0000
Subject: [PATCH 4/8] Remove cached KernelContext branch from OperatorWithKernel::RunImpl

---
 paddle/fluid/framework/operator.cc | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 8a73557d78483..7395a8e0da8e8 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1529,20 +1529,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       // Do data transform before building KernelContext
       // TODO(zhiqiu): support TransferInplaceVarsBack
       PreparePhiData(exec_scope, *pt_kernel_, *kernel_signature_, runtime_ctx);
-      if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
-          !need_prepare_data_) {
-        impl_ =
-            new CacheImpl(new phi::KernelContext(),
-                          new RuntimeInferShapeContext(*this, *runtime_ctx));
-        BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext());
-        (*pt_kernel_)(impl_->getKernelContext());
-      } else {
-        phi::KernelContext pt_kernel_context;
-        // Do data transform before building KernelContext
-        // TODO(zhiqiu): support TransferInplaceVarsBack
-        BuildPhiKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context);
-        (*pt_kernel_)(&pt_kernel_context);
-      }
+      BuildPhiKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context);
+      (*pt_kernel_)(&pt_kernel_context);
     } else {
       (*kernel_func_)(
           ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));

From 59a4de80eae7b95b8ab3d97f3c0bd947636effe4 Mon Sep 17 00:00:00 2001
From: DannyIsFunny <912790387@qq.com>
Date: Thu, 23 Jun 2022 03:45:52 +0000
Subject: [PATCH 5/8] Guard CUDA graph path with PADDLE_WITH_CUDA/HIP and add fallback loop

---
 paddle/fluid/framework/naive_executor.cc | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index ef50a078825a0..e451cc172e84c 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -45,17 +45,19 @@ void NaiveExecutor::Run() {
   platform::AttachPointerHashToMKLDNNKey(this, place_);
   platform::RegisterModelLayout(ops_, place_);
 #endif
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::ScopedFlushDenormal flush;
   platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
       platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
   auto stream = ctx->stream();
 
   for (auto &op : ops_) {
-    if(std::count(graphed_ops.begin(), graphed_ops.end(), op->Type())) {
-      if(graph_instances_.count(op.get())){
-         cudaGraphLaunch(graph_instances_[op.get()], stream);
-         cudaStreamSynchronize(stream);
-      } else {   
+    if (std::count(graphed_ops.begin(), graphed_ops.end(), op->Type())) {
+      if (graph_instances_.count(op.get())) {
+        cudaGraphLaunch(graph_instances_[op.get()], stream);
+        cudaStreamSynchronize(stream);
+      } else {
         cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
         VLOG(4) << std::this_thread::get_id() << " run "
                 << op->DebugStringEx(scope_) << " on scope " << scope_;
@@ -67,13 +69,21 @@ void NaiveExecutor::Run() {
         cudaGraphInstantiate(&instance_, graph_, NULL, NULL, 0);
         graph_instances_[op.get()] = instance_;
       }
-    } else {   
+    } else {
       VLOG(4) << std::this_thread::get_id() << " run "
               << op->DebugStringEx(scope_) << " on scope " << scope_;
       op->SetIsCalledByExecutor(false);
       op->Run(*scope_, place_);
     }
   }
+#else
+  for (auto &op : ops_) {
+    VLOG(4) << std::this_thread::get_id() << " run "
+            << op->DebugStringEx(scope_) << " on scope " << scope_;
+    op->SetIsCalledByExecutor(false);
+    op->Run(*scope_, place_);
+  }
+#endif
 }
 
 void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,

From 4a07287cbe412f73d7be4b9efe507f2856b7f32b Mon Sep 17 00:00:00 2001
From: DannyIsFunny <912790387@qq.com>
Date: Thu, 23 Jun 2022 03:48:35 +0000
Subject: [PATCH 6/8] Keep ScopedFlushDenormal outside the CUDA/HIP guard

---
 paddle/fluid/framework/naive_executor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index e451cc172e84c..b4e64a0935f97 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -46,8 +46,8 @@ void NaiveExecutor::Run() {
   platform::RegisterModelLayout(ops_, place_);
 #endif
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::ScopedFlushDenormal flush;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
       platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
   auto stream = ctx->stream();

From 3599cf573b6734ecc486feb475dc89975cfd9caf Mon Sep 17 00:00:00 2001
From: DannyIsFunny <912790387@qq.com>
Date: Mon, 27 Jun 2022 06:41:50 +0000
Subject: [PATCH 7/8] Add RunImpl fast path that reuses the cached impl_ kernel context

---
 paddle/fluid/framework/operator.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 7395a8e0da8e8..002adb83796e8 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1261,6 +1261,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);
     pre_scope_ = cur_scope;
+  } else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
+             !need_prepare_phi_data_) {
+    if (!all_kernels_must_compute_runtime_shape_)
+      this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
+    (*pt_kernel_)(impl_->getKernelContext());
   } else {
     if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
       std::lock_guard<std::mutex> lock(cache_update_mutex_);

From 11cce8ce57fbab536a5dab2e320a8f5ee1e4d16d Mon Sep 17 00:00:00 2001
From: DannyIsFunny <912790387@qq.com>
Date: Mon, 27 Jun 2022 06:46:19 +0000
Subject: [PATCH 8/8] Remove CUDA graph capture code from NaiveExecutor

---
 paddle/fluid/framework/naive_executor.cc | 32 ------------------------
 paddle/fluid/framework/naive_executor.h  |  2 --
 2 files changed, 34 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index b4e64a0935f97..1c2740c2b2ee7 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -45,45 +45,13 @@ void NaiveExecutor::Run() {
   platform::AttachPointerHashToMKLDNNKey(this, place_);
   platform::RegisterModelLayout(ops_, place_);
 #endif
-
   platform::ScopedFlushDenormal flush;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
-      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
-  auto stream = ctx->stream();
-
-  for (auto &op : ops_) {
-    if (std::count(graphed_ops.begin(), graphed_ops.end(), op->Type())) {
-      if (graph_instances_.count(op.get())) {
-        cudaGraphLaunch(graph_instances_[op.get()], stream);
-        cudaStreamSynchronize(stream);
-      } else {
-        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
-        VLOG(4) << std::this_thread::get_id() << " run "
-                << op->DebugStringEx(scope_) << " on scope " << scope_;
-        op->SetIsCalledByExecutor(false);
-        op->Run(*scope_, place_);
-        cudaGraph_t graph_;
-        cudaGraphExec_t instance_;
-        cudaStreamEndCapture(stream, &graph_);
-        cudaGraphInstantiate(&instance_, graph_, NULL, NULL, 0);
-        graph_instances_[op.get()] = instance_;
-      }
-    } else {
-      VLOG(4) << std::this_thread::get_id() << " run "
-              << op->DebugStringEx(scope_) << " on scope " << scope_;
-      op->SetIsCalledByExecutor(false);
-      op->Run(*scope_, place_);
-    }
-  }
-#else
   for (auto &op : ops_) {
     VLOG(4) << std::this_thread::get_id() << " run "
             << op->DebugStringEx(scope_) << " on scope " << scope_;
     op->SetIsCalledByExecutor(false);
     op->Run(*scope_, place_);
   }
-#endif
 }
 
 void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index 2756208c530f0..498ad1d058827 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -76,8 +76,6 @@ class NaiveExecutor {
   const platform::Place place_;
   // Catch the required resource to avoid recreate.
   std::vector<std::unique_ptr<OperatorBase>> ops_;
-  std::map<OperatorBase*, cudaGraphExec_t> graph_instances_;
-  std::vector<std::string> graphed_ops {"conv2d"};
   Scope* scope_;
 };