From c514fbd803fff4ca53b11738f818d5ca9cc1eed6 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Wed, 18 Oct 2023 04:43:03 +0000 Subject: [PATCH 01/52] merge from openvino master --- .../new_executor/interpreter_base_impl.h | 1 + .../framework/new_executor/interpretercore.cc | 4 ++ .../new_executor/program_interpreter.cc | 43 +++++++++++++++++++ .../new_executor/program_interpreter.h | 4 ++ paddle/phi/kernels/autotune/gpu_timer.h | 18 ++++++++ 5 files changed, 70 insertions(+) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 369216e0078c4..fd10265312c33 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -37,6 +37,7 @@ #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" #include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/kernels/autotune/gpu_timer.h" PD_DECLARE_bool(new_executor_serial_run); PD_DECLARE_bool(new_executor_static_build); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 8e052d3b2685e..c6a9209db19dc 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -34,6 +34,10 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); +PADDLE_DEFINE_EXPORTED_bool(auto_parallel_profiler, + true, + "Enable auto parallel profiler, collecting the " + "runtime of jobs in different devices"); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index f1646f50471a4..10170fedb3815 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -39,6 +39,7 @@ #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +PHI_DECLARE_bool(auto_parallel_profiler); namespace paddle { namespace framework { @@ -103,6 +104,12 @@ ProgramInterpreter::~ProgramInterpreter() { } void ProgramInterpreter::RunImpl() { + if (FLAGS_auto_parallel_profiler) { + for (auto& timer : gpu_stream_timer_) { + timer.Start(); + } + } + // lazy initialization of gc, do not create gc is the program only run once if (!gc_) { gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); @@ -127,6 +134,11 @@ void ProgramInterpreter::RunImpl() { platform::DeviceContextPool::Instance().Get(place_)->Wait(); } #endif + if (FLAGS_auto_parallel_profiler) { + for (auto& timer : gpu_stream_timer_) { + timer.Stop(); + } + } } FetchList ProgramInterpreter::Run(const std::vector& feed_names, @@ -153,6 +165,12 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, ClearLoDTensorArrayInLocalScope(); } + if (FLAGS_auto_parallel_profiler) { + for (auto& timer : gpu_stream_timer_) { + VLOG(3) << "GPU stream event time: " << timer.ElapsedTime(); + } + } + // return Fetch Tensors Scope* inner_scope = HasLocalScope() ? 
local_scope_ : var_scope_.GetMutableScope(); @@ -622,6 +640,27 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { } } +void ProgramInterpreter::AddGpuStreamEvents() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + for (auto& instr : vec_instruction_) { + if (instr.KernelType() != OpFuncType::kGpuAsync) { + return; + } + + if (instr.DeviceContext().GetPlace().GetType() == + phi::AllocationType::CUSTOM) { + return; + } + + gpuStream_t stream = + reinterpret_cast(instr.DeviceContext()) + .stream(); + + gpu_stream_timer_.emplace_back(stream); + } +#endif +} + void ProgramInterpreter::Convert( std::vector* op_func_nodes) { auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); @@ -658,6 +697,10 @@ void ProgramInterpreter::Convert( vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); } + if (FLAGS_auto_parallel_profiler) { + AddGpuStreamEvents(); + } + BuildOperatorDependences(); // NOTE(Ruibiao): For cross-step stream synchronization, an event may be diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index bef6385c211fb..79f2be4ca5785 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -149,6 +149,8 @@ class ProgramInterpreter : public InterpreterBaseImpl { // For log and debug std::string GetDepsString() const; + void AddGpuStreamEvents(); + bool is_build_{false}; bool static_build_{false}; // Note(sonder): share the op dependency and event analysis procedure. @@ -210,6 +212,8 @@ class ProgramInterpreter : public InterpreterBaseImpl { InstructionSchedulingPriorityLess instruction_scheduling_priority_less; std::vector hookfuncs_; + + std::vector gpu_stream_timer_; }; } // namespace framework diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 87eca2613a7b5..33f8eb8291b7f 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -42,6 +42,20 @@ class GpuTimer { stop_, phi::errors::PreconditionNotMet("Stop Event is not ready.")); } + explicit GpuTimer(gpuStream_t stream) : stream_(stream) { +#ifdef PADDLE_WITH_HIP + hipEventCreate(&start_); + hipEventCreate(&stop_); +#else + cudaEventCreate(&start_); + cudaEventCreate(&stop_); +#endif + PADDLE_ENFORCE_NOT_NULL( + start_, phi::errors::PreconditionNotMet("Start Event is not ready.")); + PADDLE_ENFORCE_NOT_NULL( + stop_, phi::errors::PreconditionNotMet("Stop Event is not ready.")); + } + ~GpuTimer() { #ifdef PADDLE_WITH_HIP hipEventDestroy(start_); @@ -68,6 +82,9 @@ class GpuTimer { #endif } + void Start() { Start(stream_); } + void Stop() { Stop(stream_); } + float ElapsedTime() { float milliseconds = 0; #ifdef PADDLE_WITH_HIP @@ -83,6 +100,7 @@ class GpuTimer { private: gpuEvent_t start_; gpuEvent_t stop_; + gpuStream_t stream_; }; } // namespace phi From 0147f70b06e9eeb68d823f4b4e83c6e0940abaf2 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 20 Oct 2023 05:53:58 +0000 Subject: [PATCH 02/52] add InterpreterRunTime() to record interpreter's run time --- .../new_executor/interpreter_base_impl.h | 2 + .../framework/new_executor/interpretercore.cc | 4 ++ .../framework/new_executor/interpretercore.h | 2 + .../new_executor/new_ir_interpreter.cc | 5 ++ .../new_executor/new_ir_interpreter.h | 2 + .../new_executor/program_interpreter.cc | 65 ++++++++++++++----- .../new_executor/program_interpreter.h | 4 +- .../new_executor/standalone_executor.cc | 16 
+++++ paddle/phi/kernels/autotune/gpu_timer.h | 33 +++++----- 9 files changed, 98 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index fd10265312c33..d78988a244946 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -104,6 +104,8 @@ class InterpreterBaseImpl { std::vector* op_func_nodes) = 0; virtual bool IsStaticBuild() const = 0; + + virtual std::tuple InterpreterRunTime() = 0; }; inline void SetDeviceId(const platform::Place& place) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index c6a9209db19dc..4da225d9068d1 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -133,5 +133,9 @@ void InterpreterCore::Build( bool InterpreterCore::IsStaticBuild() const { return impl_->IsStaticBuild(); } +std::tuple InterpreterCore::InterpreterRunTime() { + return impl_->InterpreterRunTime(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index d21bd9e1fc378..cd03eb46a6707 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -79,6 +79,8 @@ class InterpreterCore { bool IsStaticBuild() const; + std::tuple InterpreterRunTime(); + private: DISABLE_COPY_AND_ASSIGN(InterpreterCore); diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 50af034414d6f..9fde156b9f11b 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -276,6 +276,11 @@ void NewIRInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) { << ") to InterpreterCore(" << this << ")"; } +std::tuple NewIRInterpreter::InterpreterRunTime() { + PADDLE_THROW(platform::errors::Unimplemented( + "NewIRInterpreter::InterpreterRunTime is not implemented.")); +} + const interpreter::NewIrDependencyBuilder& NewIRInterpreter::GetNewIrDependencyBuilder() const { return ir_dependency_builder_; diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index 3a128791cdfce..aae29829c3cf4 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -60,6 +60,8 @@ class NewIRInterpreter : public InterpreterBaseImpl { void ShareBuildResultsFrom(const InterpreterBaseImpl& src) override; + std::tuple InterpreterRunTime() override; + std::shared_ptr> GetDependencyCount() const override; bool IsSharedResultsBuild() const override; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 10170fedb3815..8889a41cac6cc 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -105,8 +105,10 @@ ProgramInterpreter::~ProgramInterpreter() { void ProgramInterpreter::RunImpl() { if (FLAGS_auto_parallel_profiler) { - for (auto& timer : gpu_stream_timer_) { - timer.Start(); + // Note(sonder): Record the start time of the each stream. 
+ for (size_t i = 0; i < stream_timers_.size(); ++i) { + auto& stream_timer = stream_timers_[i]; + stream_timer.Start(); } } @@ -135,8 +137,9 @@ void ProgramInterpreter::RunImpl() { } #endif if (FLAGS_auto_parallel_profiler) { - for (auto& timer : gpu_stream_timer_) { - timer.Stop(); + for (size_t i = 0; i < stream_timers_.size(); ++i) { + auto& stream_timer = stream_timers_[i]; + stream_timer.Stop(); } } } @@ -165,12 +168,6 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, ClearLoDTensorArrayInLocalScope(); } - if (FLAGS_auto_parallel_profiler) { - for (auto& timer : gpu_stream_timer_) { - VLOG(3) << "GPU stream event time: " << timer.ElapsedTime(); - } - } - // return Fetch Tensors Scope* inner_scope = HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope(); @@ -642,25 +639,57 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { void ProgramInterpreter::AddGpuStreamEvents() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (auto& instr : vec_instruction_) { - if (instr.KernelType() != OpFuncType::kGpuAsync) { - return; - } + stream_timers_.clear(); + std::vector streams; + bool has_default_stream = false; - if (instr.DeviceContext().GetPlace().GetType() == - phi::AllocationType::CUSTOM) { - return; + for (size_t i = 0; i < vec_instruction_.size(); ++i) { + auto& instr = vec_instruction_[i]; + if ((instr.KernelType() != OpFuncType::kGpuAsync) || + (instr.DeviceContext().GetPlace().GetType() == + phi::AllocationType::CUSTOM)) { + continue; } gpuStream_t stream = reinterpret_cast(instr.DeviceContext()) .stream(); - gpu_stream_timer_.emplace_back(stream); + if (stream != nullptr) { + has_default_stream = true; + } + } + size_t timers_size = has_default_stream ? streams.size() + 1 : streams.size(); + stream_timers_.resize(timers_size); + for (size_t i = 0; i < streams.size(); ++i) { + stream_timers_[i].SetStream(streams[i]); + } + if (has_default_stream) { + stream_timers_.back().SetStream(nullptr); } + #endif } +std::tuple ProgramInterpreter::InterpreterRunTime() { + double min_start_time = DBL_MAX, max_end_time = DBL_MIN; + for (size_t i = 0; i < stream_timers_.size(); ++i) { + auto& stream_timer = stream_timers_[i]; + double start_time = stream_timer.StartTime(); + double end_time = stream_timer.EndTime(); + + min_start_time = std::min(min_start_time, start_time); + max_end_time = std::max(max_end_time, end_time); + + VLOG(3) << "ProgramInterpreter::InterpreterRunTime:" + << "start_time: " << std::to_string(start_time) + << ", end_time: " << std::to_string(end_time) << ", min_start_time" + << std::to_string(min_start_time) + << ", max_end_time: " << std::to_string(max_end_time); + } + return std::make_tuple(min_start_time, max_end_time); +} + void ProgramInterpreter::Convert( std::vector* op_func_nodes) { auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 79f2be4ca5785..98aac21bac880 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -98,6 +98,8 @@ class ProgramInterpreter : public InterpreterBaseImpl { bool IsStaticBuild() const override { return static_build_; } + std::tuple InterpreterRunTime() override; + private: // build graph void Convert(std::vector* op_func_nodes); @@ -213,7 +215,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; - std::vector gpu_stream_timer_; + 
std::vector<phi::GpuTimer> stream_timers_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index f06bee2c884e3..5b12a70fe6c53 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -30,6 +30,7 @@
 PHI_DECLARE_bool(enable_new_ir_in_executor);
 PHI_DECLARE_bool(enable_pir_api);
 PHI_DECLARE_bool(new_ir_apply_inplace_pass);
+PHI_DECLARE_bool(auto_parallel_profiler);
 
 namespace paddle {
 namespace framework {
@@ -205,6 +206,21 @@ paddle::framework::FetchList StandaloneExecutor::Run(
     }
   }
 
+  // record each job's run time
+  if (FLAGS_auto_parallel_profiler) {
+    for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) {
+      const auto& job = jobs[job_idx];
+      const std::string& job_type = job->Type();
+      double start_time, end_time;
+      std::tie(start_time, end_time) =
+          interpretercores_[job_idx]->InterpreterRunTime();
+      VLOG(3) << "Profiler Info: Job (" << job_idx << "), type = " << job_type
+              << ", micro_batch_id = " << job->MicroBatchId()
+              << ", job_start_time = " << std::to_string(start_time)
+              << ", job_end_time = " << std::to_string(end_time);
+    }
+  }
+
   // return Fetch Tensors
   if (FLAGS_enable_new_ir_in_executor) {
     framework::FetchList fetch_res;
diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h
index 33f8eb8291b7f..e23613f94a9d0 100644
--- a/paddle/phi/kernels/autotune/gpu_timer.h
+++ b/paddle/phi/kernels/autotune/gpu_timer.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <sys/time.h>
+
 #include "paddle/phi/backends/gpu/gpu_decls.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/errors.h"
@@ -42,20 +44,6 @@ class GpuTimer {
         stop_, phi::errors::PreconditionNotMet("Stop Event is not ready."));
   }
 
-  explicit GpuTimer(gpuStream_t stream) : stream_(stream) {
-#ifdef PADDLE_WITH_HIP
-    hipEventCreate(&start_);
-    hipEventCreate(&stop_);
-#else
-    cudaEventCreate(&start_);
-    cudaEventCreate(&stop_);
-#endif
-    PADDLE_ENFORCE_NOT_NULL(
-        start_, phi::errors::PreconditionNotMet("Start Event is not ready."));
-    PADDLE_ENFORCE_NOT_NULL(
-        stop_, phi::errors::PreconditionNotMet("Stop Event is not ready."));
-  }
-
   ~GpuTimer() {
 #ifdef PADDLE_WITH_HIP
     hipEventDestroy(start_);
@@ -82,10 +70,18 @@
 #endif
   }
 
-  void Start() { Start(stream_); }
+  void Start() {
+    struct timeval time_now {};
+    gettimeofday(&time_now, nullptr);
+    start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
+    Start(stream_);
+  }
+
   void Stop() { Stop(stream_); }
 
-  float ElapsedTime() {
+  void SetStream(gpuStream_t stream) { stream_ = stream; }
+
+  double ElapsedTime() {
     float milliseconds = 0;
 #ifdef PADDLE_WITH_HIP
     hipEventSynchronize(stop_);
@@ -97,10 +93,15 @@
     return milliseconds;
   }
 
+  double StartTime() { return start_time_; }
+
+  double EndTime() { return ElapsedTime() + start_time_; }
+
  private:
   gpuEvent_t start_;
   gpuEvent_t stop_;
   gpuStream_t stream_;
+  double start_time_;
 };
 
 }  // namespace phi
From 6d1dc3d07bf6dc4c8cc475cd6380558705968f98 Mon Sep 17 00:00:00 2001
From: AndSonder
Date: Fri, 20 Oct 2023 09:50:00 +0000
Subject: [PATCH 03/52] add profiler helper static to produce json file

---
 .../new_executor/standalone_executor.cc       |  2 +-
 paddle/phi/kernels/autotune/gpu_timer.h       |  2 +-
 .../pp_utils/profiler_helper_static.py        | 85 +++++++++++++++++++
 ...est_distributed_fused_lamb_op_with_clip.py |  4 +-
 4 files changed, 89 insertions(+), 4 deletions(-)
 create mode 100644
python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 5b12a70fe6c53..ea11bd1427758 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -214,7 +214,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( double start_time, end_time; std::tie(start_time, end_time) = interpretercores_[job_idx]->InterpreterRunTime(); - VLOG(3) << "Profiler Info: Job (" << job_idx << "), type = " << job_type + VLOG(0) << "Profiler Info: Job (" << job_idx << "), type = " << job_type << ", micro_batch_id = " << job->MicroBatchId() << ", job_start_time = " << std::to_string(start_time) << ", job_end_time = " << std::to_string(end_time); diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index e23613f94a9d0..9ef228906d1d6 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -73,7 +73,7 @@ class GpuTimer { void Start() { struct timeval time_now {}; gettimeofday(&time_now, nullptr); - start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000); + start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); Start(stream_); } diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py new file mode 100644 index 0000000000000..e6055524870d0 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py @@ -0,0 +1,85 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import re +from argparse import ArgumentParser + +import paddle + + +def parse_args(): + parser = ArgumentParser() + device_count = paddle.device.cuda.device_count() + all_devices = ",".join([str(i) for i in range(device_count)]) + parser.add_argument("--devices", type=str, default=all_devices) + parser.add_argument( + "--log_dir", type=str, default="build/Testing/Temporary/" + ) + args = parser.parse_args() + return args + + +def process_log_data(log_data, device_id): + log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)' + matches = re.findall(log_pattern, log_data) + events = [] + for match in matches: + job_id, job_type, micro_batch_id, job_start_time, job_end_time = match + if job_type in ["lr"]: + continue + + event_start = { + "name": job_type[0].upper() + "_" + str(job_id), + "cat": job_type, + "ph": "B", + "ts": float(job_start_time.strip()) * 1000, + "pid": "Main", + "tid": "GPU: " + str(device_id), + } + event_end = { + "name": job_type[0].upper() + "_" + str(job_id), + "cat": job_type, + "ph": "E", + "pid": "Main", + "ts": float(job_end_time.strip()) * 1000, + "tid": "GPU: " + str(device_id), + } + events.append(event_start) + events.append(event_end) + + return events + + +def main(): + args = parse_args() + all_events = [] + for device_id in args.devices.split(","): + print(f"Process device {device_id}") + device_id = int(device_id) + log_file = os.path.join(args.log_dir, "workerlog." + str(device_id)) + with open(log_file, "r") as f: + log_data = f.read() + events = process_log_data(log_data, device_id) + all_events.extend(events) + + save_path = os.path.join(args.log_dir, "pipeline_profile.json") + with open(save_path, "w") as f: + f.write(json.dumps({"traceEvents": all_events})) + print(f"Save pipeline profile to {save_path}") + + +if __name__ == "__main__": + main() diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py index 62a94832d1ae9..118d4f4a4ffb1 100644 --- a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py +++ b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py @@ -49,14 +49,14 @@ def run_test( if os.name == 'nt': return args = locals() - log_dir = os.path.join(temp_dir.name, f'log_{os.getpid()}') + log_dir = "/home/root/Paddle/build/Testing/Temporary/" cmd = [ sys.executable, '-u', '-m', 'paddle.distributed.launch', '--devices', - '0,1', + '2,3', '--log_dir', log_dir, get_test_file(), From 6f4f67ce65b800e92b0725a4d42f5884c1e87619 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 23 Oct 2023 06:30:12 +0000 Subject: [PATCH 04/52] add color map and support perfetto format --- .../pp_utils/profiler_helper_static.py | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py index e6055524870d0..704fa0c6b9624 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py @@ -36,6 +36,12 @@ def process_log_data(log_data, device_id): log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)' matches = re.findall(log_pattern, log_data) events = [] + color_map = { + 
"forward": "thread_state_running", # RGB: 126, 200, 148 + "backward": "rail_idle", # RGB: 238, 142, 0 + "optimizer": "rail_response", # RGB: 238, 142, 0 + "default": "thread_state_unknown", # RGB: 199, 155, 125 + } for match in matches: job_id, job_type, micro_batch_id, job_start_time, job_end_time = match if job_type in ["lr"]: @@ -47,7 +53,8 @@ def process_log_data(log_data, device_id): "ph": "B", "ts": float(job_start_time.strip()) * 1000, "pid": "Main", - "tid": "GPU: " + str(device_id), + "tid": "GPU" + str(device_id), + "cname": color_map[job_type], } event_end = { "name": job_type[0].upper() + "_" + str(job_id), @@ -55,7 +62,8 @@ def process_log_data(log_data, device_id): "ph": "E", "pid": "Main", "ts": float(job_end_time.strip()) * 1000, - "tid": "GPU: " + str(device_id), + "tid": "GPU" + str(device_id), + "cname": color_map[job_type], } events.append(event_start) events.append(event_end) @@ -80,6 +88,38 @@ def main(): f.write(json.dumps({"traceEvents": all_events})) print(f"Save pipeline profile to {save_path}") + # support Perfetto format + save_path = os.path.join(args.log_dir, "pipeline_profile_perfetto.json") + all_events.extend( + [ + { + "args": {"name": "GPU"}, + "cat": "__metadata", + "name": "thread_name", + "ph": "M", + "pid": "Main", + "tid": 0, + "ts": 0, + }, + { + "args": {"name": "GPU"}, + "cat": "__metadata", + "name": "thread_name", + "ph": "M", + "pid": "Main", + "tid": 1, + "ts": 0, + }, + ] + ) + json_str = json.dumps({"traceEvents": all_events}) + for i in range(len(args.devices.split(","))): + json_str = json_str.replace(f'"GPU{i}"', f'{i}') + + with open(save_path, "w") as f: + f.write(json_str) + print(f"Save pipeline profile to {save_path}") + if __name__ == "__main__": main() From 4d51610a9cec4793d1bb953b13bbccc9dffa020d Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 23 Oct 2023 07:03:24 +0000 Subject: [PATCH 05/52] recover codes --- test/legacy_test/test_distributed_fused_lamb_op_with_clip.py | 4 ++-- third_party/mkldnn | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py index 118d4f4a4ffb1..62a94832d1ae9 100644 --- a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py +++ b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py @@ -49,14 +49,14 @@ def run_test( if os.name == 'nt': return args = locals() - log_dir = "/home/root/Paddle/build/Testing/Temporary/" + log_dir = os.path.join(temp_dir.name, f'log_{os.getpid()}') cmd = [ sys.executable, '-u', '-m', 'paddle.distributed.launch', '--devices', - '2,3', + '0,1', '--log_dir', log_dir, get_test_file(), diff --git a/third_party/mkldnn b/third_party/mkldnn index 01204edbda1c2..403667673f61a 160000 --- a/third_party/mkldnn +++ b/third_party/mkldnn @@ -1 +1 @@ -Subproject commit 01204edbda1c2a4ff0cccd40476ed6bd2fb62d56 +Subproject commit 403667673f61a56289622fd5bc587b1856296fbc From c70d9f9690ee11d517d5545c0b6bb07aa6630bc2 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 23 Oct 2023 07:46:38 +0000 Subject: [PATCH 06/52] control include env for gpu_timer.h --- paddle/fluid/framework/new_executor/interpreter_base_impl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index d78988a244946..e0063c4b7cdf4 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ 
b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -37,7 +37,10 @@ #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" #include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_CUDA) #include "paddle/phi/kernels/autotune/gpu_timer.h" +#endif PD_DECLARE_bool(new_executor_serial_run); PD_DECLARE_bool(new_executor_static_build); From ad0f17a2f2867c649d29a90e824a82b30a0884c1 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 23 Oct 2023 07:50:37 +0000 Subject: [PATCH 07/52] fix logic for profiler_helper_static.py --- .../pp_utils/profiler_helper_static.py | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py index 704fa0c6b9624..6567b813adc92 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py @@ -90,28 +90,20 @@ def main(): # support Perfetto format save_path = os.path.join(args.log_dir, "pipeline_profile_perfetto.json") - all_events.extend( - [ - { - "args": {"name": "GPU"}, - "cat": "__metadata", - "name": "thread_name", - "ph": "M", - "pid": "Main", - "tid": 0, - "ts": 0, - }, - { - "args": {"name": "GPU"}, - "cat": "__metadata", - "name": "thread_name", - "ph": "M", - "pid": "Main", - "tid": 1, - "ts": 0, - }, - ] - ) + for i in range(len(args.devices.split(","))): + all_events.extend( + [ + { + "args": {"name": "GPU"}, + "cat": "__metadata", + "name": "thread_name", + "ph": "M", + "pid": "Main", + "tid": i, + "ts": 0, + } + ] + ) json_str = json.dumps({"traceEvents": all_events}) for i in range(len(args.devices.split(","))): json_str = json_str.replace(f'"GPU{i}"', f'{i}') From e0442c6803d307df4cf9486f5b5dd879e44c4839 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 23 Oct 2023 10:00:57 +0000 Subject: [PATCH 08/52] fix build error --- paddle/fluid/framework/new_executor/program_interpreter.cc | 5 +++++ paddle/fluid/framework/new_executor/program_interpreter.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index a89afc7d09a82..78c0adc98e125 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -104,6 +104,7 @@ ProgramInterpreter::~ProgramInterpreter() { } void ProgramInterpreter::RunImpl() { +#if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { // Note(sonder): Record the start time of the each stream. 
     for (size_t i = 0; i < stream_timers_.size(); ++i) {
@@ -111,6 +112,7 @@ void ProgramInterpreter::RunImpl() {
       stream_timer.Start();
     }
   }
+#endif
 
   // lazy initialization of gc, do not create gc is the program only run once
   if (!gc_) {
@@ -136,12 +138,15 @@ void ProgramInterpreter::RunImpl() {
     platform::DeviceContextPool::Instance().Get(place_)->Wait();
   }
 #endif
+
+#if defined(PADDLE_WITH_CUDA)
   if (FLAGS_auto_parallel_profiler) {
     for (size_t i = 0; i < stream_timers_.size(); ++i) {
       auto& stream_timer = stream_timers_[i];
       stream_timer.Stop();
    }
  }
+#endif
 }
 
 FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h
index 98aac21bac880..79809a4e8edb7 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.h
+++ b/paddle/fluid/framework/new_executor/program_interpreter.h
@@ -215,7 +215,9 @@ class ProgramInterpreter : public InterpreterBaseImpl {
 
   std::vector hookfuncs_;
 
+#if defined(PADDLE_WITH_CUDA)
   std::vector<phi::GpuTimer> stream_timers_;
+#endif
 };
 
 }  // namespace framework
From a8a37bb123867b4f7697b39886368b5ccadda02e Mon Sep 17 00:00:00 2001
From: AndSonder
Date: Mon, 23 Oct 2023 14:18:46 +0000
Subject: [PATCH 09/52] fix build error

---
 paddle/fluid/framework/new_executor/program_interpreter.cc | 5 ++++-
 paddle/fluid/framework/new_executor/standalone_executor.cc | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc
index 78c0adc98e125..ca2e67d5aeb4d 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -677,7 +677,9 @@ void ProgramInterpreter::AddGpuStreamEvents() {
 }
 
 std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
-  double min_start_time = DBL_MAX, max_end_time = DBL_MIN;
+  double min_start_time = std::numeric_limits<double>::max(),
+         max_end_time = std::numeric_limits<double>::min();
+#if defined(PADDLE_WITH_CUDA)
   for (size_t i = 0; i < stream_timers_.size(); ++i) {
     auto& stream_timer = stream_timers_[i];
     double start_time = stream_timer.StartTime();
@@ -692,6 +694,7 @@ std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
             << std::to_string(min_start_time)
             << ", max_end_time: " << std::to_string(max_end_time);
  }
+#endif
  return std::make_tuple(min_start_time, max_end_time);
 }
 
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index ea11bd1427758..60c132110f1d4 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -207,6 +207,7 @@ paddle::framework::FetchList StandaloneExecutor::Run(
   }
 
   // record each job's run time
+#if defined(PADDLE_WITH_CUDA)
   if (FLAGS_auto_parallel_profiler) {
     for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) {
       const auto& job = jobs[job_idx];
@@ -220,6 +221,7 @@ paddle::framework::FetchList StandaloneExecutor::Run(
               << ", job_end_time = " << std::to_string(end_time);
     }
   }
+#endif
 
   // return Fetch Tensors
   if (FLAGS_enable_new_ir_in_executor) {
From a20e6ce1a80e91392362689335c898de05b975f3 Mon Sep 17 00:00:00 2001
From: AndSonder
Date: Mon, 23 Oct 2023 14:50:22 +0000
Subject: [PATCH 10/52] recover thirdparty

---
 third_party/mkldnn | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/mkldnn b/third_party/mkldnn
index
403667673f61a..01204edbda1c2 160000 --- a/third_party/mkldnn +++ b/third_party/mkldnn @@ -1 +1 @@ -Subproject commit 403667673f61a56289622fd5bc587b1856296fbc +Subproject commit 01204edbda1c2a4ff0cccd40476ed6bd2fb62d56 From 3e10a6d2b2e35bd61bfb08d075cff5cc5d1a3a24 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 24 Oct 2023 10:19:36 +0000 Subject: [PATCH 11/52] add flag control: not support new ir now --- paddle/fluid/framework/new_executor/standalone_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 60c132110f1d4..45f6f103627e4 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -208,7 +208,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( // record each job's run time #if defined(PADDLE_WITH_CUDA) - if (FLAGS_auto_parallel_profiler) { + if (FLAGS_auto_parallel_profiler && !FLAGS_enable_new_ir_in_executor) { for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); From 59b425eefad053a84f6d08d3e34c70dfca2e41b0 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Wed, 25 Oct 2023 00:51:04 +0000 Subject: [PATCH 12/52] set auto_parallel_profiler flag to false --- paddle/fluid/framework/new_executor/interpretercore.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 4da225d9068d1..3b28974efb5c6 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -35,7 +35,7 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); PADDLE_DEFINE_EXPORTED_bool(auto_parallel_profiler, - true, + false, "Enable auto parallel profiler, collecting the " "runtime of jobs in different devices"); From ddc5038b0293f9942304ebd148056e4262c802a8 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 26 Oct 2023 06:40:25 +0000 Subject: [PATCH 13/52] fix --- .../new_executor/interpreter_base_impl.h | 4 ---- .../framework/new_executor/program_interpreter.cc | 15 +++------------ .../framework/new_executor/program_interpreter.h | 4 ++++ .../framework/new_executor/standalone_executor.cc | 7 ++++++- paddle/phi/kernels/autotune/gpu_timer.h | 5 +++++ .../static}/profiler_helper_static.py | 0 6 files changed, 18 insertions(+), 17 deletions(-) rename python/paddle/distributed/{fleet/meta_parallel/pp_utils => auto_parallel/static}/profiler_helper_static.py (100%) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index e0063c4b7cdf4..beab995a2c0bf 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -38,10 +38,6 @@ #include "paddle/fluid/platform/device_event.h" #include "paddle/phi/backends/device_manager.h" -#if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/kernels/autotune/gpu_timer.h" -#endif - PD_DECLARE_bool(new_executor_serial_run); PD_DECLARE_bool(new_executor_static_build); PD_DECLARE_bool(new_executor_use_inplace); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc 
b/paddle/fluid/framework/new_executor/program_interpreter.cc index ca2e67d5aeb4d..9a5f8c8b255a9 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -106,7 +106,6 @@ ProgramInterpreter::~ProgramInterpreter() { void ProgramInterpreter::RunImpl() { #if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { - // Note(sonder): Record the start time of the each stream. for (size_t i = 0; i < stream_timers_.size(); ++i) { auto& stream_timer = stream_timers_[i]; stream_timer.Start(); @@ -646,7 +645,6 @@ void ProgramInterpreter::AddGpuStreamEvents() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) stream_timers_.clear(); std::vector streams; - bool has_default_stream = false; for (size_t i = 0; i < vec_instruction_.size(); ++i) { auto& instr = vec_instruction_[i]; @@ -659,20 +657,13 @@ void ProgramInterpreter::AddGpuStreamEvents() { gpuStream_t stream = reinterpret_cast(instr.DeviceContext()) .stream(); - - if (stream != nullptr) { - has_default_stream = true; - } + streams.emplace_back(stream); } - size_t timers_size = has_default_stream ? streams.size() + 1 : streams.size(); - stream_timers_.resize(timers_size); + + stream_timers_.resize(streams.size()); for (size_t i = 0; i < streams.size(); ++i) { stream_timers_[i].SetStream(streams[i]); } - if (has_default_stream) { - stream_timers_.back().SetStream(nullptr); - } - #endif } diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 79809a4e8edb7..5a80f71d423a7 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -16,6 +16,10 @@ #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" +#if defined(PADDLE_WITH_CUDA) +#include "paddle/phi/kernels/autotune/gpu_timer.h" +#endif + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 45f6f103627e4..a6156c40927b7 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -208,13 +208,18 @@ paddle::framework::FetchList StandaloneExecutor::Run( // record each job's run time #if defined(PADDLE_WITH_CUDA) - if (FLAGS_auto_parallel_profiler && !FLAGS_enable_new_ir_in_executor) { + if (FLAGS_auto_parallel_profiler) { for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); double start_time, end_time; std::tie(start_time, end_time) = interpretercores_[job_idx]->InterpreterRunTime(); + // Note(sonder): Used to record the runtime of each job in order to + // generate a parallel pipeline timeline. Job runtime information can be + // extracted from the logs using the scripts "profiler_helper_static.py". + // Do not modify, as it may affect the results of regular expression + // matching. 
VLOG(0) << "Profiler Info: Job (" << job_idx << "), type = " << job_type << ", micro_batch_id = " << job->MicroBatchId() << ", job_start_time = " << std::to_string(start_time) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 9ef228906d1d6..3b99b27447472 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -71,6 +71,11 @@ class GpuTimer { } void Start() { + // Note(sonder): Since it is not possible to directly obtain the start time + // of the event, "gettimeofday" is used here to retrieve it. Subsequently, + // the end time of the event is calculated by adding the start time on the + // CPU with ElapsedTime(). Using the CPU time ensures that the timing is + // synchronized across multiple GPUs. struct timeval time_now {}; gettimeofday(&time_now, nullptr); start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py similarity index 100% rename from python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py rename to python/paddle/distributed/auto_parallel/static/profiler_helper_static.py From 1dfc8162f4363c5067abe6cddb2ee51f74898180 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 26 Oct 2023 09:04:39 +0000 Subject: [PATCH 14/52] add auto_parallel_profiler as command parameter --- paddle/fluid/framework/new_executor/pir_interpreter.cc | 4 ++-- python/paddle/distributed/auto_parallel/constants.py | 1 + python/paddle/distributed/auto_parallel/static/engine.py | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 27522076dd77b..ec2ee2d365adc 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -281,9 +281,9 @@ void PirInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) { << ") to InterpreterCore(" << this << ")"; } -std::tuple NewIRInterpreter::InterpreterRunTime() { +std::tuple PirInterpreter::InterpreterRunTime() { PADDLE_THROW(platform::errors::Unimplemented( - "NewIRInterpreter::InterpreterRunTime is not implemented.")); + "PirInterpreter::InterpreterRunTime is not implemented.")); } const interpreter::NewIrDependencyBuilder& diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 9e41f5f732760..d78e944706905 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -112,6 +112,7 @@ def set_field_default_config(category, field, default_value): set_field_default_config(PIPELINE, "accumulate_steps", 1) set_field_default_config(PIPELINE, "generation_batch_size", 1) set_field_default_config(PIPELINE, "enable_send_recv_overlap", False) +set_field_default_config(PIPELINE, "auto_parallel_profiler", False) ######################################### # quantization configuration diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 999ad0cf94f92..6eeb583bd8294 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -251,6 +251,9 @@ def __init__( 
paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1}) paddle.framework.set_flags({'FLAGS_new_executor_static_build': 1}) + if self._strategy.pipline.auto_parallel_profiler: + paddle.framework.set_flags({'FLAGS_auto_parallel_profiler': 1}) + def _prepare_data_spec(self, data, split, batch_size): inputs_spec = [] labels_spec = [] From 9f271ef183dd2c1724ce7229b4c379666dd11963 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 26 Oct 2023 10:05:46 +0000 Subject: [PATCH 15/52] fix value name --- python/paddle/distributed/auto_parallel/static/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 6eeb583bd8294..bea78ea32c835 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -251,7 +251,7 @@ def __init__( paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1}) paddle.framework.set_flags({'FLAGS_new_executor_static_build': 1}) - if self._strategy.pipline.auto_parallel_profiler: + if self._strategy.pipeline.auto_parallel_profiler: paddle.framework.set_flags({'FLAGS_auto_parallel_profiler': 1}) def _prepare_data_spec(self, data, split, batch_size): From dabf964b4b55ab43bedf4644e501ebe729be6cc0 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 27 Oct 2023 04:49:00 +0000 Subject: [PATCH 16/52] support gettimeofday for win env --- paddle/phi/kernels/autotune/gpu_timer.h | 56 ++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 3b99b27447472..6d21cc529a1e3 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -14,11 +14,10 @@ #pragma once -#include - #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" + #ifdef PADDLE_WITH_CUDA #include #endif @@ -26,6 +25,59 @@ #include #endif +#ifdef WIN32 +#include +#include +#else +#include +#endif + +#ifdef WIN32 + +#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 +#else +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +struct timezone { + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +int gettimeofday(struct timeval *tv, struct timezone *tz) { + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag = 0; + + if (NULL != tv) { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + tmpres /= 10; /*convert into microseconds*/ + /*converting file time to unix epoch*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = static_cast(tmpres / 1000000UL); + tv->tv_usec = static_cast(tmpres % 1000000UL); + } + + if (NULL != tz) { + if (!tzflag) { + _tzset(); + tzflag++; + } + tz->tz_minuteswest = _timezone / 60; + tz->tz_dsttime = _daylight; + } + + return 0; +} + +#endif + namespace phi { class GpuTimer { From 6ad6f36f4d2c52f4fcad5d815b84d30705cc45e8 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 27 Oct 2023 06:54:36 +0000 Subject: [PATCH 17/52] fix win build error --- paddle/phi/kernels/autotune/gpu_timer.h | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 
6d21cc529a1e3..7466848d156a9 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -40,12 +40,7 @@ #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif -struct timezone { - int tz_minuteswest; /* minutes W of Greenwich */ - int tz_dsttime; /* type of dst correction */ -}; - -int gettimeofday(struct timeval *tv, struct timezone *tz) { +int gettimeofday(struct timeval *tv, void *tz) { FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag = 0; @@ -64,15 +59,6 @@ int gettimeofday(struct timeval *tv, struct timezone *tz) { tv->tv_usec = static_cast(tmpres % 1000000UL); } - if (NULL != tz) { - if (!tzflag) { - _tzset(); - tzflag++; - } - tz->tz_minuteswest = _timezone / 60; - tz->tz_dsttime = _daylight; - } - return 0; } From d58cc94d2e188cb2aa1207794d93bfd5147c5f59 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 27 Oct 2023 07:27:30 +0000 Subject: [PATCH 18/52] fix win build error --- paddle/phi/kernels/autotune/gpu_timer.h | 40 +------------------------ 1 file changed, 1 insertion(+), 39 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 7466848d156a9..fec1233ff7185 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" @@ -25,45 +26,6 @@ #include #endif -#ifdef WIN32 -#include -#include -#else -#include -#endif - -#ifdef WIN32 - -#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 -#else -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag = 0; - - if (NULL != tv) { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - tmpres /= 10; /*convert into microseconds*/ - /*converting file time to unix epoch*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = static_cast(tmpres / 1000000UL); - tv->tv_usec = static_cast(tmpres % 1000000UL); - } - - return 0; -} - -#endif - namespace phi { class GpuTimer { From e9886aef6ee178abe0e1b9a8cfa504c467fef611 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 27 Oct 2023 08:19:23 +0000 Subject: [PATCH 19/52] use job_type_to_id --- .../fluid/framework/new_executor/standalone_executor.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 3581510771b13..dc2534f7d3e26 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -205,18 +205,25 @@ paddle::framework::FetchList StandaloneExecutor::Run( // record each job's run time #if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { + std::unordered_map job_type_to_id; for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); double start_time, end_time; std::tie(start_time, end_time) = interpretercores_[job_idx]->InterpreterRunTime(); + if (job_type_to_id.count(job_type) == 0) { + job_type_to_id[job_type] = 0; + } else { + job_type_to_id[job_type] += 1; + } // Note(sonder): Used to record the 
runtime of each job in order to // generate a parallel pipeline timeline. Job runtime information can be // extracted from the logs using the scripts "profiler_helper_static.py". // Do not modify, as it may affect the results of regular expression // matching. - VLOG(0) << "Profiler Info: Job (" << job_idx << "), type = " << job_type + VLOG(0) << "Profiler Info: Job (" << job_type_to_id[job_type] + << "), type = " << job_type << ", micro_batch_id = " << job->MicroBatchId() << ", job_start_time = " << std::to_string(start_time) << ", job_end_time = " << std::to_string(end_time); From 282285bc92c439677e31fe342622b7dbf7cc1102 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 27 Oct 2023 08:43:13 +0000 Subject: [PATCH 20/52] Fixed repeatedly timing the same stream --- .../fluid/framework/new_executor/program_interpreter.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index b0f74bab2d291..f2592e1aaee5e 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -644,7 +644,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { void ProgramInterpreter::AddGpuStreamEvents() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) stream_timers_.clear(); - std::vector streams; + std::unordered_set streams; for (size_t i = 0; i < vec_instruction_.size(); ++i) { auto& instr = vec_instruction_[i]; @@ -657,12 +657,13 @@ void ProgramInterpreter::AddGpuStreamEvents() { gpuStream_t stream = reinterpret_cast(instr.DeviceContext()) .stream(); - streams.emplace_back(stream); + streams.insert(stream); } stream_timers_.resize(streams.size()); - for (size_t i = 0; i < streams.size(); ++i) { - stream_timers_[i].SetStream(streams[i]); + int i = 0; + for (auto& stream : streams) { + stream_timers_[i++].SetStream(stream); } #endif } From fdc3f6ddf1396576c94fc6cc5862404dc1fcdf69 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Wed, 1 Nov 2023 12:02:15 +0000 Subject: [PATCH 21/52] add step line for timeline --- .../new_executor/program_interpreter.cc | 67 ++++--------------- .../new_executor/program_interpreter.h | 4 +- .../new_executor/standalone_executor.cc | 9 +-- .../static/profiler_helper_static.py | 54 +++++++++++++-- 4 files changed, 67 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index eeb79a13f976e..7c43cd324bbdf 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -106,11 +106,9 @@ ProgramInterpreter::~ProgramInterpreter() { void ProgramInterpreter::RunImpl() { #if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { - for (size_t i = 0; i < stream_timers_.size(); ++i) { - auto& stream_timer = stream_timers_[i]; - stream_timer.Start(); - } + calculate_stream_timer_.Start(); } + #endif // lazy initialization of gc, do not create gc is the program only run once @@ -141,10 +139,7 @@ void ProgramInterpreter::RunImpl() { #if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { - for (size_t i = 0; i < stream_timers_.size(); ++i) { - auto& stream_timer = stream_timers_[i]; - stream_timer.Stop(); - } + calculate_stream_timer_.Stop(); } #endif } @@ -642,53 +637,13 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { } } -void 
ProgramInterpreter::AddGpuStreamEvents() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - stream_timers_.clear(); - std::unordered_set streams; - - for (size_t i = 0; i < vec_instruction_.size(); ++i) { - auto& instr = vec_instruction_[i]; - if ((instr.KernelType() != OpFuncType::kGpuAsync) || - (instr.DeviceContext().GetPlace().GetType() == - phi::AllocationType::CUSTOM)) { - continue; - } - - gpuStream_t stream = - reinterpret_cast(instr.DeviceContext()) - .stream(); - streams.insert(stream); - } - - stream_timers_.resize(streams.size()); - int i = 0; - for (auto& stream : streams) { - stream_timers_[i++].SetStream(stream); - } -#endif -} - std::tuple ProgramInterpreter::InterpreterRunTime() { - double min_start_time = std::numeric_limits::max(), - max_end_time = std::numeric_limits::min(); + double start_time, end_time; #if defined(PADDLE_WITH_CUDA) - for (size_t i = 0; i < stream_timers_.size(); ++i) { - auto& stream_timer = stream_timers_[i]; - double start_time = stream_timer.StartTime(); - double end_time = stream_timer.EndTime(); - - min_start_time = std::min(min_start_time, start_time); - max_end_time = std::max(max_end_time, end_time); - - VLOG(3) << "ProgramInterpreter::InterpreterRunTime:" - << "start_time: " << std::to_string(start_time) - << ", end_time: " << std::to_string(end_time) << ", min_start_time" - << std::to_string(min_start_time) - << ", max_end_time: " << std::to_string(max_end_time); - } + start_time = calculate_stream_timer_.StartTime(); + end_time = calculate_stream_timer_.EndTime(); #endif - return std::make_tuple(min_start_time, max_end_time); + return std::make_tuple(start_time, end_time); } void ProgramInterpreter::Convert( @@ -728,7 +683,13 @@ void ProgramInterpreter::Convert( } if (FLAGS_auto_parallel_profiler) { - AddGpuStreamEvents(); +#if defined(PADDLE_WITH_CUDA) + gpuStream_t calculate_stream = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + calculate_stream_timer_.SetStream(calculate_stream); +#endif } BuildOperatorDependences(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 5a80f71d423a7..e770b1e9abcbf 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -155,8 +155,6 @@ class ProgramInterpreter : public InterpreterBaseImpl { // For log and debug std::string GetDepsString() const; - void AddGpuStreamEvents(); - bool is_build_{false}; bool static_build_{false}; // Note(sonder): share the op dependency and event analysis procedure. 
@@ -220,7 +218,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; #if defined(PADDLE_WITH_CUDA) - std::vector stream_timers_; + phi::GpuTimer calculate_stream_timer_; #endif }; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 3807639e53279..42e6a701eae3e 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -219,24 +219,19 @@ paddle::framework::FetchList StandaloneExecutor::Run( // record each job's run time #if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { - std::unordered_map job_type_to_id; for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); double start_time, end_time; std::tie(start_time, end_time) = interpretercores_[job_idx]->InterpreterRunTime(); - if (job_type_to_id.count(job_type) == 0) { - job_type_to_id[job_type] = 0; - } else { - job_type_to_id[job_type] += 1; - } + // Note(sonder): Used to record the runtime of each job in order to // generate a parallel pipeline timeline. Job runtime information can be // extracted from the logs using the scripts "profiler_helper_static.py". // Do not modify, as it may affect the results of regular expression // matching. - VLOG(0) << "Profiler Info: Job (" << job_type_to_id[job_type] + VLOG(0) << "Profiler Info: Job (" << job->MicroBatchId() << "), type = " << job_type << ", micro_batch_id = " << job->MicroBatchId() << ", job_start_time = " << std::to_string(start_time) diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index 6567b813adc92..aecf7c3ae684d 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -42,11 +42,38 @@ def process_log_data(log_data, device_id): "optimizer": "rail_response", # RGB: 238, 142, 0 "default": "thread_state_unknown", # RGB: 199, 155, 125 } + step = 0 for match in matches: job_id, job_type, micro_batch_id, job_start_time, job_end_time = match if job_type in ["lr"]: continue + if job_type == "forward" and int(micro_batch_id) == 0: + if step > 0: + event_step_stop = { + "name": "Step " + str(step - 1), + "cat": "Step", + "ph": "E", + "ts": float(job_start_time.strip()) * 1000, + "pid": "Main", + "tid": "Step" + str(device_id), + "cname": color_map["default"], + } + events.append(event_step_stop) + + event_step_start = { + "name": "Step " + str(step), + "cat": "Step", + "ph": "B", + "ts": float(job_start_time.strip()) * 1000, + "pid": "Main", + "tid": "Step" + str(device_id), + "cname": color_map["default"], + } + events.append(event_step_start) + + step += 1 + event_start = { "name": job_type[0].upper() + "_" + str(job_id), "cat": job_type, @@ -68,6 +95,15 @@ def process_log_data(log_data, device_id): events.append(event_start) events.append(event_end) + event_step_end = { + "name": "Step " + str(step), + "cat": "Step", + "ph": "E", + "ts": events[-1]["ts"], + "pid": "Main", + "tid": "Step" + str(device_id), + "cname": color_map["default"], + } return events @@ -94,19 +130,29 @@ def main(): all_events.extend( [ { - "args": {"name": "GPU"}, + "args": {"name": "Step"}, "cat": "__metadata", "name": "thread_name", "ph": "M", "pid": "Main", - "tid": i, + "tid": i * 2, "ts": 0, - } + }, + { + 
"args": {"name": f"GPU:{i}"}, + "cat": "__metadata", + "name": "thread_name", + "ph": "M", + "pid": "Main", + "tid": i * 2 + 1, + "ts": 0, + }, ] ) json_str = json.dumps({"traceEvents": all_events}) for i in range(len(args.devices.split(","))): - json_str = json_str.replace(f'"GPU{i}"', f'{i}') + json_str = json_str.replace(f'"Step{i}"', f'{i * 2}') + json_str = json_str.replace(f'"GPU{i}"', f'{i * 2 + 1}') with open(save_path, "w") as f: f.write(json_str) From 1ceadc5fa003c674246ff82f8af79f38ef08e354 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 2 Nov 2023 11:37:24 +0000 Subject: [PATCH 22/52] add step timeline and fix logic when job overlap --- .../platform/device/gpu/cuda/cuda_profiler.cc | 19 +- .../static/profiler_helper_static.py | 162 +++++++++++------- 2 files changed, 118 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index a49d9013fb6d0..609f49a38b6be 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +PHI_DECLARE_bool(auto_parallel_profiler); + namespace paddle { namespace platform { @@ -46,10 +48,25 @@ void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) { eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; eventAttrib.message.ascii = name.c_str(); + if (FLAGS_auto_parallel_profiler) { + struct timeval time_now {}; + gettimeofday(&time_now, nullptr); + double push_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); + VLOG(0) << "NVTX range push: " << name + << ", time: " << std::to_string(push_time); + } dynload::nvtxRangePushEx(&eventAttrib); } -void CudaNvtxRangePop() { dynload::nvtxRangePop(); } +void CudaNvtxRangePop() { + if (FLAGS_auto_parallel_profiler) { + struct timeval time_now {}; + gettimeofday(&time_now, nullptr); + double pop_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); + VLOG(0) << "NVTX range pop, time: " << std::to_string(pop_time); + } + dynload::nvtxRangePop(); +} #endif } // namespace platform diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index aecf7c3ae684d..619811368b6b8 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -19,6 +19,13 @@ import paddle +color_map = { + "forward": "thread_state_running", # RGB: 126, 200, 148 + "backward": "rail_idle", # RGB: 238, 142, 0 + "optimizer": "rail_response", # RGB: 238, 142, 0 + "default": "thread_state_unknown", # RGB: 199, 155, 125 +} + def parse_args(): parser = ArgumentParser() @@ -32,93 +39,120 @@ def parse_args(): return args -def process_log_data(log_data, device_id): +def process_job_log(log_data, device_id): log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)' matches = re.findall(log_pattern, log_data) events = [] - color_map = { - "forward": "thread_state_running", # RGB: 126, 200, 148 - "backward": "rail_idle", # RGB: 238, 142, 0 - "optimizer": "rail_response", # RGB: 238, 142, 0 - "default": "thread_state_unknown", # RGB: 199, 155, 125 - } - step = 0 + last_end_time = None + for match in matches: job_id, job_type, micro_batch_id, job_start_time, job_end_time = match if 
job_type in ["lr"]: continue - if job_type == "forward" and int(micro_batch_id) == 0: - if step > 0: - event_step_stop = { - "name": "Step " + str(step - 1), - "cat": "Step", - "ph": "E", - "ts": float(job_start_time.strip()) * 1000, - "pid": "Main", - "tid": "Step" + str(device_id), - "cname": color_map["default"], - } - events.append(event_step_stop) - - event_step_start = { - "name": "Step " + str(step), - "cat": "Step", - "ph": "B", - "ts": float(job_start_time.strip()) * 1000, - "pid": "Main", - "tid": "Step" + str(device_id), - "cname": color_map["default"], - } - events.append(event_step_start) + start_time = float(job_start_time.strip()) * 1000 + end_time = float(job_end_time.strip()) * 1000 - step += 1 + if last_end_time is not None: + if start_time < last_end_time: + end_time = end_time + last_end_time - start_time + start_time = last_end_time event_start = { - "name": job_type[0].upper() + "_" + str(job_id), + "name": job_type + "_" + str(job_id), "cat": job_type, "ph": "B", - "ts": float(job_start_time.strip()) * 1000, - "pid": "Main", + "ts": start_time, + "pid": 0, "tid": "GPU" + str(device_id), "cname": color_map[job_type], } event_end = { - "name": job_type[0].upper() + "_" + str(job_id), + "name": job_type + "_" + str(job_id), "cat": job_type, "ph": "E", - "pid": "Main", - "ts": float(job_end_time.strip()) * 1000, + "pid": 0, + "ts": end_time, "tid": "GPU" + str(device_id), "cname": color_map[job_type], } events.append(event_start) events.append(event_end) - event_step_end = { - "name": "Step " + str(step), - "cat": "Step", - "ph": "E", - "ts": events[-1]["ts"], - "pid": "Main", - "tid": "Step" + str(device_id), - "cname": color_map["default"], - } + last_end_time = end_time + return events +def process_step_log(log_data, device_id): + start_pattern = r'.*?NVTX range push: (\d+), time: (\d+.\d+)' + end_pattern = r'.*?NVTX range pop, time: (\d+.\d+)' + start_matches = re.findall(start_pattern, log_data) + end_matches = re.findall(end_pattern, log_data) + + step_info = [] + for start_match, stop_match in zip(start_matches, end_matches): + step_id, start_time = start_match + stop_time = stop_match + if int(step_id) >= len(step_info): + for _ in range(int(step_id) - len(step_info) + 1): + step_info.append([float('inf'), 0]) + step_info[int(step_id)] = [start_time, stop_time] + return step_info + + def main(): args = parse_args() all_events = [] + step_infos = [] for device_id in args.devices.split(","): print(f"Process device {device_id}") device_id = int(device_id) log_file = os.path.join(args.log_dir, "workerlog." 
+ str(device_id)) with open(log_file, "r") as f: log_data = f.read() - events = process_log_data(log_data, device_id) + events = process_job_log(log_data, device_id) + step_info = process_step_log(log_data, device_id) all_events.extend(events) + if len(step_info) > len(step_infos): + for _ in range(len(step_info) - len(step_infos)): + step_infos.append([float('inf'), 0]) + for i, info in enumerate(step_info): + start_time = float(info[0].strip()) * 1000 + stop_time = float(info[1].strip()) * 1000 + + step_infos[i][0] = min(step_infos[i][0], start_time) + step_infos[i][1] = max(step_infos[i][1], stop_time) + + for i, info in enumerate(step_infos): + if info[0] == float('inf'): + continue + start_time = info[0] + if i > 0: + start_time = max(start_time, step_infos[i - 1][1]) + event_start = { + "name": "step" + str(i), + "cat": "step", + "ph": "B", + "ts": start_time, + "pid": 0, + "tid": "Step", + "cname": color_map["default"], + } + event_end = { + "name": "step" + str(i), + "cat": "step", + "ph": "E", + "ts": info[1], + "pid": 0, + "tid": "Step", + "cname": color_map["default"], + } + + all_events.append(event_start) + all_events.append(event_end) + save_path = os.path.join(args.log_dir, "pipeline_profile.json") with open(save_path, "w") as f: f.write(json.dumps({"traceEvents": all_events})) @@ -126,33 +160,37 @@ def main(): # support Perfetto format save_path = os.path.join(args.log_dir, "pipeline_profile_perfetto.json") + all_events.extend( + [ + { + "args": {"name": "STEP"}, + "cat": "__metadata", + "name": "thread_name", + "ph": "M", + "pid": 0, + "tid": 2333, + "ts": 0, + } + ] + ) for i in range(len(args.devices.split(","))): all_events.extend( [ - { - "args": {"name": "Step"}, - "cat": "__metadata", - "name": "thread_name", - "ph": "M", - "pid": "Main", - "tid": i * 2, - "ts": 0, - }, { "args": {"name": f"GPU:{i}"}, "cat": "__metadata", "name": "thread_name", "ph": "M", - "pid": "Main", - "tid": i * 2 + 1, + "pid": 0, + "tid": i + 2334, "ts": 0, - }, + } ] ) json_str = json.dumps({"traceEvents": all_events}) for i in range(len(args.devices.split(","))): - json_str = json_str.replace(f'"Step{i}"', f'{i * 2}') - json_str = json_str.replace(f'"GPU{i}"', f'{i * 2 + 1}') + json_str = json_str.replace('"Step"', '2333') + json_str = json_str.replace(f'"GPU{i}"', f'{i + 2334}') with open(save_path, "w") as f: f.write(json_str) From 679cc393c8745fee85bff9b072081bc307903a93 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 6 Nov 2023 08:10:13 +0000 Subject: [PATCH 23/52] update time record logic --- .../new_executor/program_interpreter.cc | 80 ++++++++++++++----- .../new_executor/program_interpreter.h | 5 +- paddle/phi/kernels/autotune/gpu_timer.h | 70 +++++++++++++--- .../static/profiler_helper_static.py | 5 -- 4 files changed, 125 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 7c43cd324bbdf..febd68d33d821 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -104,13 +104,6 @@ ProgramInterpreter::~ProgramInterpreter() { } void ProgramInterpreter::RunImpl() { -#if defined(PADDLE_WITH_CUDA) - if (FLAGS_auto_parallel_profiler) { - calculate_stream_timer_.Start(); - } - -#endif - // lazy initialization of gc, do not create gc is the program only run once if (!gc_) { gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); @@ -131,17 +124,12 @@ void 
ProgramInterpreter::RunImpl() { async_work_queue_ = GetWorkQueue(); ExecuteInstructionList(vec_instruction_); } + #ifdef PADDLE_WITH_CUSTOM_DEVICE if (platform::is_custom_place(place_)) { platform::DeviceContextPool::Instance().Get(place_)->Wait(); } #endif - -#if defined(PADDLE_WITH_CUDA) - if (FLAGS_auto_parallel_profiler) { - calculate_stream_timer_.Stop(); - } -#endif } FetchList ProgramInterpreter::Run(const std::vector& feed_names, @@ -639,9 +627,35 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time, end_time; -#if defined(PADDLE_WITH_CUDA) - start_time = calculate_stream_timer_.StartTime(); - end_time = calculate_stream_timer_.EndTime(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // double remaining_time = std::numeric_limits::max(); + // double event_time, wait_time; + start_time = calculated_stream_timer_.StartTime(); + end_time = calculated_stream_timer_.EndTime(); +// gpuEvent_t stop_event = calculated_stream_timer_.StopEvent(); + +// for (auto& inst : vec_instruction_) { +// for (auto event_inter : inst.EventsToWait()) { +// auto* wrapper = static_cast( +// event_inter.event_->GetEvent().get()); + +// gpuEvent_t start_event = wrapper->inner_event_.GetRawCudaEvent(); +// event_time = 0; +// PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime( +// reinterpret_cast(&event_time), start_event, stop_event)); +// remaining_time = std::min(remaining_time, event_time); +// VLOG(0) << "event time: " << std::to_string(event_time) +// << " remaining: " << std::to_string(remaining_time); +// } +// } +// if (remaining_time == std::numeric_limits::max()) { +// remaining_time = 0; +// } +// wait_time = end_time - start_time - remaining_time; +// start_time = start_time + wait_time; +// VLOG(0) << "wait time: " << std::to_string(wait_time) +// << " start time: " << std::to_string(start_time) +// << " end time: " << std::to_string(end_time); #endif return std::make_tuple(start_time, end_time); } @@ -683,12 +697,12 @@ void ProgramInterpreter::Convert( } if (FLAGS_auto_parallel_profiler) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t calculate_stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); - calculate_stream_timer_.SetStream(calculate_stream); + calculated_stream_timer_.SetStream(calculate_stream); #endif } @@ -1050,6 +1064,15 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); + if (FLAGS_auto_parallel_profiler) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!interpreter::IsCommunicationOp(instr_node) && + !calculated_stream_timer_.IsStart()) { + VLOG(3) << "Start calculated stream timer from op: " << op->Type(); + calculated_stream_timer_.Start(); + } +#endif + } if (!instr_node.IsArtificial()) { RunOperator(instr_node); @@ -1102,6 +1125,17 @@ void ProgramInterpreter::ExecuteInstructionList( exception_holder_.Clear(); + if (FLAGS_auto_parallel_profiler) { + for (int i = vec_instr.size() - 1; i >= 0; --i) { + auto& instr_node = vec_instr[i]; + if (!interpreter::IsCommunicationOp(instr_node)) { + VLOG(3) << "Last calculated op type: " << instr_node.OpBase()->Type(); + last_calculated_instr_id = i; + break; + } + } + } + for (size_t i = 0; i < dependecy_count_->size(); ++i) { if ((*dependecy_count_)[i] == 0) { // NOTE(zhiqiu): hot fix for jit input var @@ -1213,6 +1247,16 @@ void ProgramInterpreter::RunInstructionAsync(size_t 
instr_id) { RunInstruction(instr_node); + if (FLAGS_auto_parallel_profiler) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (instr_id == last_calculated_instr_id) { + VLOG(3) << "Stop calculated stream timer from op: " + << instr_node.OpBase()->Type(); + calculated_stream_timer_.Stop(); + } +#endif + } + if (UNLIKELY(exception_holder_.IsCaught())) { VLOG(4) << "Exception caught"; if (exception_notifier_ != nullptr) { diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index e770b1e9abcbf..c92c97529932b 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -217,9 +217,10 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; -#if defined(PADDLE_WITH_CUDA) - phi::GpuTimer calculate_stream_timer_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + phi::GpuTimer calculated_stream_timer_; #endif + size_t last_calculated_instr_id; }; } // namespace framework diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index fec1233ff7185..25403abbe1578 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -28,6 +28,16 @@ namespace phi { +static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream, + cudaError_t status, + void *user_data) { + struct timeval time_now {}; + gettimeofday(&time_now, nullptr); + double *cpu_time = static_cast(user_data); + *cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); + VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time); +} + class GpuTimer { public: GpuTimer() { @@ -42,6 +52,7 @@ class GpuTimer { start_, phi::errors::PreconditionNotMet("Start Event is not ready.")); PADDLE_ENFORCE_NOT_NULL( stop_, phi::errors::PreconditionNotMet("Stop Event is not ready.")); + is_start_ = false; } ~GpuTimer() { @@ -60,6 +71,7 @@ class GpuTimer { #else cudaEventRecord(start_, stream); #endif + is_start_ = true; } void Stop(gpuStream_t stream) { @@ -68,21 +80,41 @@ class GpuTimer { #else cudaEventRecord(stop_, stream); #endif + is_start_ = false; } void Start() { // Note(sonder): Since it is not possible to directly obtain the start time - // of the event, "gettimeofday" is used here to retrieve it. Subsequently, - // the end time of the event is calculated by adding the start time on the - // CPU with ElapsedTime(). Using the CPU time ensures that the timing is - // synchronized across multiple GPUs. - struct timeval time_now {}; - gettimeofday(&time_now, nullptr); - start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); + // of the event, "gettimeofday" is used here to retrieve it. The callback is + // used to record the start time of the event. 
Start(stream_); +#ifdef PADDLE_WITH_HIP + hipStreamAddCallback(stream_, + RecordEventTimerCallback, + reinterpret_cast(&start_time_), + 0); +#else + cudaStreamAddCallback(stream_, + RecordEventTimerCallback, + reinterpret_cast(&start_time_), + 0); +#endif } - void Stop() { Stop(stream_); } + void Stop() { + Stop(stream_); +#ifdef PADDLE_WITH_HIP + hipStreamAddCallback(stream_, + RecordEventTimerCallback, + reinterpret_cast(&end_time_), + 0); +#else + cudaStreamAddCallback(stream_, + RecordEventTimerCallback, + reinterpret_cast(&end_time_), + 0); +#endif + } void SetStream(gpuStream_t stream) { stream_ = stream; } @@ -98,15 +130,33 @@ class GpuTimer { return milliseconds; } - double StartTime() { return start_time_; } + double StartTime() { +#ifdef PADDLE_WITH_HIP + hipStreamSynchronize(stream_); +#else + cudaStreamSynchronize(stream_); +#endif + return start_time_; + } + + double EndTime() { +#ifdef PADDLE_WITH_HIP + hipStreamSynchronize(stream_); +#else + cudaStreamSynchronize(stream_); +#endif + return end_time_; + } - double EndTime() { return ElapsedTime() + start_time_; } + bool IsStart() { return is_start_; } private: gpuEvent_t start_; gpuEvent_t stop_; gpuStream_t stream_; double start_time_; + double end_time_; + bool is_start_; }; } // namespace phi diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index 619811368b6b8..3118e74af8c5c 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -53,11 +53,6 @@ def process_job_log(log_data, device_id): start_time = float(job_start_time.strip()) * 1000 end_time = float(job_end_time.strip()) * 1000 - if last_end_time is not None: - if start_time < last_end_time: - end_time = end_time + last_end_time - start_time - start_time = last_end_time - event_start = { "name": job_type + "_" + str(job_id), "cat": job_type, From 1a04fea1322797a9b50bf3aec17d50a5eaf7384c Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 02:34:33 +0000 Subject: [PATCH 24/52] fix bug when start profile start from none zero step --- .../static/profiler_helper_static.py | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index 3118e74af8c5c..e541977ac7251 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -39,7 +39,7 @@ def parse_args(): return args -def process_job_log(log_data, device_id): +def process_job_log(log_data, device_id, log_start_time): log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)' matches = re.findall(log_pattern, log_data) events = [] @@ -53,6 +53,9 @@ def process_job_log(log_data, device_id): start_time = float(job_start_time.strip()) * 1000 end_time = float(job_end_time.strip()) * 1000 + if log_start_time > start_time: + continue + event_start = { "name": job_type + "_" + str(job_id), "cat": job_type, @@ -84,6 +87,7 @@ def process_step_log(log_data, device_id): end_pattern = r'.*?NVTX range pop, time: (\d+.\d+)' start_matches = re.findall(start_pattern, log_data) end_matches = re.findall(end_pattern, log_data) + end_matches = 
end_matches[len(end_matches) - len(start_matches) :] step_info = [] for start_match, stop_match in zip(start_matches, end_matches): @@ -93,7 +97,12 @@ def process_step_log(log_data, device_id): for _ in range(int(step_id) - len(step_info) + 1): step_info.append([float('inf'), 0]) step_info[int(step_id)] = [start_time, stop_time] - return step_info + + start_step = 0 + for info in step_info: + if info[0] == float('inf'): + start_step += 1 + return step_info, start_step def main(): @@ -106,20 +115,24 @@ def main(): log_file = os.path.join(args.log_dir, "workerlog." + str(device_id)) with open(log_file, "r") as f: log_data = f.read() - events = process_job_log(log_data, device_id) - step_info = process_step_log(log_data, device_id) - all_events.extend(events) + + step_info, start_step = process_step_log(log_data, device_id) if len(step_info) > len(step_infos): for _ in range(len(step_info) - len(step_infos)): step_infos.append([float('inf'), 0]) for i, info in enumerate(step_info): + if info[0] == float('inf'): + continue start_time = float(info[0].strip()) * 1000 stop_time = float(info[1].strip()) * 1000 step_infos[i][0] = min(step_infos[i][0], start_time) step_infos[i][1] = max(step_infos[i][1], stop_time) + events = process_job_log(log_data, device_id, step_infos[start_step][0]) + all_events.extend(events) + for i, info in enumerate(step_infos): if info[0] == float('inf'): continue From e1c619da29411ac35751e8b5f9d8a2557bcf31f9 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 06:00:52 +0000 Subject: [PATCH 25/52] fix note --- paddle/fluid/framework/new_executor/standalone_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 42e6a701eae3e..7c978fe064198 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -195,7 +195,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( VLOG(6) << "Run job (" << job_idx << "), type = " << job_type << ", micro_batch_id =" << job->MicroBatchId(); - // Note(sonder): Share build results don't work for new IR now. + // NOTE(sonder): Share build results don't work for new IR now. 
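
(Editorial sketch, not part of this patch series.) The helper script recovers per-job
windows from each rank's worker log with the "Profiler Info: Job (...)" regex shown
above. A self-contained illustration of that parsing step, on an invented log line in
the same format:

    import re

    LOG_PATTERN = (
        r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), '
        r'job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)'
    )

    sample_log = (
        "Profiler Info: Job (0), type = forward, micro_batch_id = 0, "
        "job_start_time = 100.125, job_end_time = 145.750"
    )

    jobs = []
    for job_id, job_type, micro_batch_id, start_ms, end_ms in re.findall(
        LOG_PATTERN, sample_log
    ):
        # Times are host milliseconds; the script multiplies them by 1000 to get the
        # microsecond "ts" field that chrome://tracing expects.
        jobs.append(
            (int(job_id), job_type, int(micro_batch_id), float(start_ms), float(end_ms))
        )

    print(jobs)  # [(0, 'forward', 0, 100.125, 145.75)]
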
if (type_to_first_id.count(job_type) != 0 && !FLAGS_enable_new_ir_in_executor) { interpretercores_[job_idx]->ShareBuildResultsFrom( From 9c8b7400c1a7169b94ae9b9d1d3b24a7b4a31d39 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 06:55:59 +0000 Subject: [PATCH 26/52] remove FLAGS_auto_parallel_profiler --- .../framework/new_executor/interpretercore.cc | 5 +++-- .../framework/new_executor/interpretercore.h | 3 ++- .../framework/new_executor/pir_interpreter.cc | 3 ++- .../framework/new_executor/pir_interpreter.h | 6 ++++-- .../new_executor/program_interpreter.cc | 13 +++++++------ .../new_executor/program_interpreter.h | 7 +++++-- .../new_executor/standalone_executor.cc | 4 ++-- .../new_executor/standalone_executor.h | 2 ++ .../platform/device/gpu/cuda/cuda_profiler.cc | 19 +------------------ .../distributed/auto_parallel/constants.py | 2 +- .../auto_parallel/static/engine.py | 4 ++-- .../static/profiler_helper_static.py | 17 +++++++++++------ 12 files changed, 42 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index db6cbfbfd8e09..7423d43ac4502 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -75,8 +75,9 @@ FetchList InterpreterCore::Run( } FetchList InterpreterCore::Run(const std::vector& feed_names, - bool need_fetch) { - return impl_->Run(feed_names, need_fetch); + bool need_fetch, + bool enable_auto_parallel_profiler) { + return impl_->Run(feed_names, need_fetch, enable_auto_parallel_profiler); } void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr src) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index a4b2e0a6d67cf..8fbec0e296786 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -51,7 +51,8 @@ class InterpreterCore { bool need_fetch = true); paddle::framework::FetchList Run(const std::vector& feed_names, - bool need_fetch = true); + bool need_fetch = true, + bool enable_auto_parallel_profiler = false); void ShareWorkQueueFrom(std::shared_ptr src); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 43492d0871758..ae38858139e92 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1187,7 +1187,8 @@ paddle::framework::FetchList PirInterpreter::Run( } FetchList PirInterpreter::Run(const std::vector& feed_names, - bool need_fetch) { + bool need_fetch, + bool enable_auto_parallel_profiler) { SetDeviceId(place_); CheckCUDAGraphBeforeRun(feed_names); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h index 48e1e6c4bde63..6ff147002df12 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.h +++ b/paddle/fluid/framework/new_executor/pir_interpreter.h @@ -54,8 +54,10 @@ class PirInterpreter : public InterpreterBaseImpl { const std::vector& feed_tensors, bool need_fetch = true) override; - paddle::framework::FetchList Run(const std::vector& feed_names, - bool need_fetch = true) override; + paddle::framework::FetchList Run( + const std::vector& feed_names, + bool need_fetch = true, + bool enable_auto_parallel_profiler = false) override; void ShareWorkQueueFrom(InterpreterBaseImpl* src) override; diff 
--git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 89a7da9e5c0fe..87c2e9f7fc9b7 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -39,7 +39,6 @@ #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -PHI_DECLARE_bool(auto_parallel_profiler); namespace paddle { namespace framework { @@ -133,7 +132,9 @@ void ProgramInterpreter::RunImpl() { } FetchList ProgramInterpreter::Run(const std::vector& feed_names, - bool need_fetch) { + bool need_fetch, + bool enable_auto_parallel_profiler) { + enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; std::vector op_func_nodes; Build(feed_names, &op_func_nodes); @@ -703,7 +704,7 @@ void ProgramInterpreter::Convert( vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); } - if (FLAGS_auto_parallel_profiler) { + if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t calculate_stream = dynamic_cast( @@ -1075,7 +1076,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); - if (FLAGS_auto_parallel_profiler) { + if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!interpreter::IsCommunicationOp(instr_node) && !calculated_stream_timer_.IsStart()) { @@ -1136,7 +1137,7 @@ void ProgramInterpreter::ExecuteInstructionList( exception_holder_.Clear(); - if (FLAGS_auto_parallel_profiler) { + if (enable_auto_parallel_profiler_) { for (int i = vec_instr.size() - 1; i >= 0; --i) { auto& instr_node = vec_instr[i]; if (!interpreter::IsCommunicationOp(instr_node)) { @@ -1258,7 +1259,7 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { RunInstruction(instr_node); - if (FLAGS_auto_parallel_profiler) { + if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (instr_id == last_calculated_instr_id) { VLOG(3) << "Stop calculated stream timer from op: " diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 9ae1e75e19b5d..379edbfab8f59 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -50,8 +50,10 @@ class ProgramInterpreter : public InterpreterBaseImpl { const std::vector& feed_tensors, bool need_fetch = true) override; - paddle::framework::FetchList Run(const std::vector& feed_names, - bool need_fetch = true) override; + paddle::framework::FetchList Run( + const std::vector& feed_names, + bool need_fetch = true, + bool enable_auto_parallel_profiler = false) override; void Build( const std::vector& feed_names, @@ -222,6 +224,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { phi::GpuTimer calculated_stream_timer_; #endif size_t last_calculated_instr_id; + bool enable_auto_parallel_profiler_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 7311d190ad4d9..47bd4375d4952 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -30,7 +30,6 @@ PHI_DECLARE_bool(enable_pir_in_executor); PHI_DECLARE_bool(enable_pir_api); PHI_DECLARE_bool(pir_apply_inplace_pass); 
-PHI_DECLARE_bool(auto_parallel_profiler); namespace paddle { namespace framework { @@ -50,6 +49,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, ss << scope << ", "; } VLOG(6) << ss.str(); + enable_auto_parallel_profiler_ = false; const auto& jobs = plan_.JobList(); for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { @@ -225,7 +225,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( // record each job's run time #if defined(PADDLE_WITH_CUDA) - if (FLAGS_auto_parallel_profiler) { + if (enable_auto_parallel_profiler_) { for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 8feef6e5b2f91..e73fa65375bab 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -55,6 +55,8 @@ class StandaloneExecutor { std::vector>> vec_force_events_to_wait_; + + bool enable_auto_parallel_profiler_; }; } // namespace framework diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index 609f49a38b6be..a49d9013fb6d0 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -14,8 +14,6 @@ #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" -PHI_DECLARE_bool(auto_parallel_profiler); - namespace paddle { namespace platform { @@ -48,25 +46,10 @@ void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) { eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; eventAttrib.message.ascii = name.c_str(); - if (FLAGS_auto_parallel_profiler) { - struct timeval time_now {}; - gettimeofday(&time_now, nullptr); - double push_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); - VLOG(0) << "NVTX range push: " << name - << ", time: " << std::to_string(push_time); - } dynload::nvtxRangePushEx(&eventAttrib); } -void CudaNvtxRangePop() { - if (FLAGS_auto_parallel_profiler) { - struct timeval time_now {}; - gettimeofday(&time_now, nullptr); - double pop_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); - VLOG(0) << "NVTX range pop, time: " << std::to_string(pop_time); - } - dynload::nvtxRangePop(); -} +void CudaNvtxRangePop() { dynload::nvtxRangePop(); } #endif } // namespace platform diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 5e8040e851b8d..7d8587315c7a8 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -114,7 +114,7 @@ def set_field_default_config(category, field, default_value): set_field_default_config(PIPELINE, "accumulate_steps", 1) set_field_default_config(PIPELINE, "generation_batch_size", 1) set_field_default_config(PIPELINE, "enable_send_recv_overlap", False) -set_field_default_config(PIPELINE, "auto_parallel_profiler", False) +set_field_default_config(PIPELINE, "schedule_profiler", False) ######################################### # quantization configuration diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 17d86130fab8a..8a875eee3024a 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ 
b/python/paddle/distributed/auto_parallel/static/engine.py @@ -251,8 +251,8 @@ def __init__( paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1}) paddle.framework.set_flags({'FLAGS_new_executor_static_build': 1}) - if self._strategy.pipeline.auto_parallel_profiler: - paddle.framework.set_flags({'FLAGS_auto_parallel_profiler': 1}) + if self._strategy.pipeline.schedule_profiler: + paddle.framework.set_flags({'schedule_profiler': 1}) def _prepare_data_spec(self, data, split, batch_size): inputs_spec = [] diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index e541977ac7251..35afd2c818612 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -13,11 +13,18 @@ # limitations under the License. import json +import logging import os import re from argparse import ArgumentParser import paddle +from paddle.base.log_helper import get_logger + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' +) + color_map = { "forward": "thread_state_running", # RGB: 126, 200, 148 @@ -32,9 +39,7 @@ def parse_args(): device_count = paddle.device.cuda.device_count() all_devices = ",".join([str(i) for i in range(device_count)]) parser.add_argument("--devices", type=str, default=all_devices) - parser.add_argument( - "--log_dir", type=str, default="build/Testing/Temporary/" - ) + parser.add_argument("--log_dir", type=str, required=True) args = parser.parse_args() return args @@ -110,7 +115,7 @@ def main(): all_events = [] step_infos = [] for device_id in args.devices.split(","): - print(f"Process device {device_id}") + _logger.info(f"Process device {device_id}") device_id = int(device_id) log_file = os.path.join(args.log_dir, "workerlog." 
+ str(device_id)) with open(log_file, "r") as f: @@ -164,7 +169,7 @@ def main(): save_path = os.path.join(args.log_dir, "pipeline_profile.json") with open(save_path, "w") as f: f.write(json.dumps({"traceEvents": all_events})) - print(f"Save pipeline profile to {save_path}") + _logger.info(f"Save pipeline profile to {save_path}") # support Perfetto format save_path = os.path.join(args.log_dir, "pipeline_profile_perfetto.json") @@ -202,7 +207,7 @@ def main(): with open(save_path, "w") as f: f.write(json_str) - print(f"Save pipeline profile to {save_path}") + _logger.info(f"Save pipeline profile to {save_path}") if __name__ == "__main__": From 63de31b57fe7c7fbcaeccf0260913bfba8cce322 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 14:24:05 +0000 Subject: [PATCH 27/52] use run config instead FLAGS_auto_parallelxx --- .../new_executor/interpreter_base_impl.h | 4 +- .../new_executor/standalone_executor.cc | 14 +- .../new_executor/standalone_executor.h | 2 + paddle/fluid/pybind/pybind.cc | 6 + python/paddle/base/executor.py | 11 ++ .../auto_parallel/static/engine.py | 8 +- .../static/profiler_helper_static.py | 122 ++++++++---------- 7 files changed, 93 insertions(+), 74 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 31cb9e1bcff25..9e441f9c9bdf2 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -71,7 +71,9 @@ class InterpreterBaseImpl { bool need_fetch = true) = 0; virtual paddle::framework::FetchList Run( - const std::vector& feed_names, bool need_fetch = true) = 0; + const std::vector& feed_names, + bool need_fetch = true, + bool enable_auto_parallel_profiler = false) = 0; virtual void ShareWorkQueueFrom(InterpreterBaseImpl* src) = 0; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 47bd4375d4952..aa378bd626cd6 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -216,9 +216,14 @@ paddle::framework::FetchList StandaloneExecutor::Run( if (jobs.size() > 1 && job_type != "forward") { const std::vector tmp_feed_names = {}; interpretercores_[job_idx]->Run(tmp_feed_names, - /*need_fetch = */ false); + /*need_fetch = */ false, + /*enable_auto_parallel_profiler = */ + enable_auto_parallel_profiler_); } else { - interpretercores_[job_idx]->Run(feed_names, /*need_fetch = */ false); + interpretercores_[job_idx]->Run(feed_names, + /*need_fetch = */ false, + /*enable_auto_parallel_profiler = */ + enable_auto_parallel_profiler_); } } } @@ -262,5 +267,10 @@ paddle::framework::FetchList StandaloneExecutor::Run( } } +void StandaloneExecutor::SetEnableAutoParallelProfiler( + bool enable_auto_parallel_profiler) { + enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index e73fa65375bab..01c3a197f298c 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -41,6 +41,8 @@ class StandaloneExecutor { paddle::framework::FetchList Run(const std::vector& feed_names); + void SetEnableAutoParallelProfiler(bool enable_auto_parallel_profiler); + private: bool 
is_interpretercore_build_result_shared_{false}; const platform::Place place_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cabefe569a9d7..58c578b9ae575 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2019,6 +2019,12 @@ All parameter, weight, gradient are variables in Paddle. ret = self.Run(feed_names); } return py::cast(std::move(ret)); + }) + + .def("set_enable_auto_parallel_profiler", + [](StandaloneExecutor &self, bool enable_auto_parallel_profiler) { + pybind11::gil_scoped_release release; + self.SetEnableAutoParallelProfiler(enable_auto_parallel_profiler); }); py::class_ start_time: - continue - - event_start = { - "name": job_type + "_" + str(job_id), - "cat": job_type, - "ph": "B", - "ts": start_time, - "pid": 0, - "tid": "GPU" + str(device_id), - "cname": color_map[job_type], - } - event_end = { - "name": job_type + "_" + str(job_id), - "cat": job_type, - "ph": "E", - "pid": 0, - "ts": end_time, - "tid": "GPU" + str(device_id), - "cname": color_map[job_type], - } - events.append(event_start) - events.append(event_end) + if job_type == "forward" and micro_batch_id == "0": + if step_start_time != 0: + step_times.append([step_start_time, step_end_time]) + step_start_time = start_time + step_end_time = end_time + + if len(step_times) >= start_step: + event_start = { + "name": job_type + "_" + str(job_id), + "cat": job_type, + "ph": "B", + "ts": start_time, + "pid": 0, + "tid": "GPU" + str(device_id), + "cname": color_map[job_type], + } + event_end = { + "name": job_type + "_" + str(job_id), + "cat": job_type, + "ph": "E", + "pid": 0, + "ts": end_time, + "tid": "GPU" + str(device_id), + "cname": color_map[job_type], + } + events.append(event_start) + events.append(event_end) last_end_time = end_time - return events - - -def process_step_log(log_data, device_id): - start_pattern = r'.*?NVTX range push: (\d+), time: (\d+.\d+)' - end_pattern = r'.*?NVTX range pop, time: (\d+.\d+)' - start_matches = re.findall(start_pattern, log_data) - end_matches = re.findall(end_pattern, log_data) - end_matches = end_matches[len(end_matches) - len(start_matches) :] - - step_info = [] - for start_match, stop_match in zip(start_matches, end_matches): - step_id, start_time = start_match - stop_time = stop_match - if int(step_id) >= len(step_info): - for _ in range(int(step_id) - len(step_info) + 1): - step_info.append([float('inf'), 0]) - step_info[int(step_id)] = [start_time, stop_time] - - start_step = 0 - for info in step_info: - if info[0] == float('inf'): - start_step += 1 - return step_info, start_step + step_times.append([step_start_time, step_end_time]) + step_times = step_times[start_step:] + return events, step_times def main(): args = parse_args() all_events = [] step_infos = [] + start_step = 0 + for device_id in args.devices.split(","): _logger.info(f"Process device {device_id}") device_id = int(device_id) @@ -121,31 +108,28 @@ def main(): with open(log_file, "r") as f: log_data = f.read() - step_info, start_step = process_step_log(log_data, device_id) - - if len(step_info) > len(step_infos): - for _ in range(len(step_info) - len(step_infos)): - step_infos.append([float('inf'), 0]) - for i, info in enumerate(step_info): - if info[0] == float('inf'): - continue - start_time = float(info[0].strip()) * 1000 - stop_time = float(info[1].strip()) * 1000 - - step_infos[i][0] = min(step_infos[i][0], start_time) - step_infos[i][1] = max(step_infos[i][1], stop_time) + start_step_pattern = ( + r'.*?Schedule Profiler start at step (\d+) 
and end at step.*' + ) + start_step_match = re.findall(start_step_pattern, log_data) + start_step = ( + int(start_step_match[0]) if len(start_step_match) > 0 else 0 + ) - events = process_job_log(log_data, device_id, step_infos[start_step][0]) + events, step_times = process_job_log(log_data, device_id, start_step) all_events.extend(events) + for i, info in enumerate(step_times): + if len(step_infos) <= i: + step_infos.append([float("inf"), float("-inf")]) + step_infos[i][0] = min(step_infos[i][0], info[0]) + step_infos[i][1] = max(step_infos[i][1], info[1]) for i, info in enumerate(step_infos): - if info[0] == float('inf'): - continue start_time = info[0] if i > 0: start_time = max(start_time, step_infos[i - 1][1]) event_start = { - "name": "step" + str(i), + "name": "step" + str(i + start_step), "cat": "step", "ph": "B", "ts": start_time, @@ -154,7 +138,7 @@ def main(): "cname": color_map["default"], } event_end = { - "name": "step" + str(i), + "name": "step" + str(i + start_step), "cat": "step", "ph": "E", "ts": info[1], From 8218ecb7ff677e17a4701224f08bcc58def99da8 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 14:34:27 +0000 Subject: [PATCH 28/52] fix color map logic --- .../auto_parallel/static/profiler_helper_static.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index 69489b172f07f..25430c9688b6f 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -74,7 +74,6 @@ def process_job_log(log_data, device_id, start_step): "ts": start_time, "pid": 0, "tid": "GPU" + str(device_id), - "cname": color_map[job_type], } event_end = { "name": job_type + "_" + str(job_id), @@ -83,8 +82,11 @@ def process_job_log(log_data, device_id, start_step): "pid": 0, "ts": end_time, "tid": "GPU" + str(device_id), - "cname": color_map[job_type], } + if job_type in color_map: + event_start["cname"] = (color_map[job_type],) + event_end["cname"] = (color_map[job_type],) + events.append(event_start) events.append(event_end) From 4b318fcf7d24d1a25db62e65038bd9c7fa4010f4 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 14:36:32 +0000 Subject: [PATCH 29/52] fix color map logic --- .../auto_parallel/static/profiler_helper_static.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index 25430c9688b6f..eeb4f6ef51250 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -84,8 +84,8 @@ def process_job_log(log_data, device_id, start_step): "tid": "GPU" + str(device_id), } if job_type in color_map: - event_start["cname"] = (color_map[job_type],) - event_end["cname"] = (color_map[job_type],) + event_start["cname"] = color_map[job_type] + event_end["cname"] = color_map[job_type] events.append(event_start) events.append(event_end) From 9f949f24c09ca511d14d3a4a2f73cbec60f489fe Mon Sep 17 00:00:00 2001 From: AndSonder Date: Wed, 8 Nov 2023 10:58:01 +0000 Subject: [PATCH 30/52] fix bug when log step does not start from 0 --- .../new_executor/program_interpreter.cc | 53 +++----- .../new_executor/program_interpreter.h | 2 +- 
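
(Editorial sketch, not part of this patch series.) The main() changes above fold each
device's step windows into one global span per step (earliest start, latest end) and
then clamp a step's start to the previous step's end so the step lane never overlaps.
A minimal version of that folding, with invented (start_ms, end_ms) inputs:

    step_times_per_device = {
        0: [(1000.0, 1040.0), (1041.0, 1080.0)],
        1: [(1002.0, 1043.0), (1043.5, 1084.0)],
    }

    num_steps = max(len(v) for v in step_times_per_device.values())
    merged = [[float("inf"), float("-inf")] for _ in range(num_steps)]
    for windows in step_times_per_device.values():
        for i, (start, end) in enumerate(windows):
            merged[i][0] = min(merged[i][0], start)
            merged[i][1] = max(merged[i][1], end)

    # Mirrors "start_time = max(start_time, step_infos[i - 1][1])" above.
    for i in range(1, num_steps):
        merged[i][0] = max(merged[i][0], merged[i - 1][1])

    print(merged)  # [[1000.0, 1043.0], [1043.0, 1084.0]]
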
.../new_executor/standalone_executor.cc | 9 ++ paddle/fluid/pybind/pybind.cc | 1 - paddle/phi/kernels/autotune/gpu_timer.h | 113 +++++++++++------- .../static/profiler_helper_static.py | 48 ++++---- 6 files changed, 117 insertions(+), 109 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 2cd47903bacce..93206d1f257c5 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -138,6 +138,17 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool need_fetch, bool enable_auto_parallel_profiler) { enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; + + if (enable_auto_parallel_profiler_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + gpuStream_t calculated_stream = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + calculated_stream_timer_.SetStream(calculated_stream); +#endif + } + std::vector op_func_nodes; Build(feed_names, &op_func_nodes); @@ -639,34 +650,9 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time, end_time; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // double remaining_time = std::numeric_limits::max(); - // double event_time, wait_time; start_time = calculated_stream_timer_.StartTime(); end_time = calculated_stream_timer_.EndTime(); -// gpuEvent_t stop_event = calculated_stream_timer_.StopEvent(); - -// for (auto& inst : vec_instruction_) { -// for (auto event_inter : inst.EventsToWait()) { -// auto* wrapper = static_cast( -// event_inter.event_->GetEvent().get()); - -// gpuEvent_t start_event = wrapper->inner_event_.GetRawCudaEvent(); -// event_time = 0; -// PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime( -// reinterpret_cast(&event_time), start_event, stop_event)); -// remaining_time = std::min(remaining_time, event_time); -// VLOG(0) << "event time: " << std::to_string(event_time) -// << " remaining: " << std::to_string(remaining_time); -// } -// } -// if (remaining_time == std::numeric_limits::max()) { -// remaining_time = 0; -// } -// wait_time = end_time - start_time - remaining_time; -// start_time = start_time + wait_time; -// VLOG(0) << "wait time: " << std::to_string(wait_time) -// << " start time: " << std::to_string(start_time) -// << " end time: " << std::to_string(end_time); + calculated_stream_timer_.ResetTime(); #endif return std::make_tuple(start_time, end_time); } @@ -711,16 +697,6 @@ void ProgramInterpreter::Convert( #endif } - if (enable_auto_parallel_profiler_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - gpuStream_t calculate_stream = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); - calculated_stream_timer_.SetStream(calculate_stream); -#endif - } - BuildOperatorDependences(); // NOTE(Ruibiao): For cross-step stream synchronization, an event may be @@ -1091,7 +1067,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!interpreter::IsCommunicationOp(instr_node) && - !calculated_stream_timer_.IsStart()) { + !calculated_stream_timer_.IsStarted()) { VLOG(3) << "Start calculated stream timer from op: " << op->Type(); calculated_stream_timer_.Start(); } @@ -1275,7 +1251,8 @@ void ProgramInterpreter::RunInstructionAsync(size_t 
instr_id) { if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (instr_id == last_calculated_instr_id) { + if (instr_id == last_calculated_instr_id && + calculated_stream_timer_.IsStarted()) { VLOG(3) << "Stop calculated stream timer from op: " << instr_node.OpBase()->Type(); calculated_stream_timer_.Stop(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 379edbfab8f59..ca4d1c0d21157 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -221,7 +221,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - phi::GpuTimer calculated_stream_timer_; + phi::CalculatedStreamTimer calculated_stream_timer_; #endif size_t last_calculated_instr_id; bool enable_auto_parallel_profiler_; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index aa378bd626cd6..f9667c18fd659 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -269,6 +269,15 @@ paddle::framework::FetchList StandaloneExecutor::Run( void StandaloneExecutor::SetEnableAutoParallelProfiler( bool enable_auto_parallel_profiler) { + gpuStream_t calculated_stream = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); +#endif enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58c578b9ae575..a8e5e4015af6e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2023,7 +2023,6 @@ All parameter, weight, gradient are variables in Paddle. .def("set_enable_auto_parallel_profiler", [](StandaloneExecutor &self, bool enable_auto_parallel_profiler) { - pybind11::gil_scoped_release release; self.SetEnableAutoParallelProfiler(enable_auto_parallel_profiler); }); diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 25403abbe1578..482df368af8a2 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -52,7 +52,6 @@ class GpuTimer { start_, phi::errors::PreconditionNotMet("Start Event is not ready.")); PADDLE_ENFORCE_NOT_NULL( stop_, phi::errors::PreconditionNotMet("Stop Event is not ready.")); - is_start_ = false; } ~GpuTimer() { @@ -71,7 +70,6 @@ class GpuTimer { #else cudaEventRecord(start_, stream); #endif - is_start_ = true; } void Stop(gpuStream_t stream) { @@ -80,83 +78,110 @@ class GpuTimer { #else cudaEventRecord(stop_, stream); #endif - is_start_ = false; } - void Start() { - // Note(sonder): Since it is not possible to directly obtain the start time - // of the event, "gettimeofday" is used here to retrieve it. The callback is - // used to record the start time of the event. 
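
(Editorial sketch, not part of this patch series.) The host-millisecond (start, end)
pairs captured by these callbacks are eventually logged as "Profiler Info" lines and
turned by profiler_helper_static.py into begin/end ("ph": "B"/"E") events in a Chrome
trace JSON, with "ts" in microseconds and one named lane per device. A minimal,
self-contained example of that output format (names and values are illustrative only):

    import json

    events = [
        # Lane label that chrome://tracing / Perfetto shows for tid 0 of pid 0.
        {"args": {"name": "GPU:0"}, "cat": "__metadata", "name": "thread_name",
         "ph": "M", "pid": 0, "tid": 0, "ts": 0},
        # One begin/end pair per job; "ts" is in microseconds.
        {"name": "forward_0", "cat": "forward", "ph": "B", "ts": 1000000, "pid": 0, "tid": 0},
        {"name": "forward_0", "cat": "forward", "ph": "E", "ts": 1040000, "pid": 0, "tid": 0},
    ]

    with open("minimal_trace.json", "w") as f:
        json.dump({"traceEvents": events}, f)
    # Open minimal_trace.json in chrome://tracing or ui.perfetto.dev: one 40 ms slice.
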
- Start(stream_); + float ElapsedTime() { + float milliseconds = 0; #ifdef PADDLE_WITH_HIP - hipStreamAddCallback(stream_, - RecordEventTimerCallback, - reinterpret_cast(&start_time_), - 0); + hipEventSynchronize(stop_); + hipEventElapsedTime(&milliseconds, start_, stop_); #else - cudaStreamAddCallback(stream_, - RecordEventTimerCallback, - reinterpret_cast(&start_time_), - 0); + cudaEventSynchronize(stop_); + cudaEventElapsedTime(&milliseconds, start_, stop_); #endif + return milliseconds; } - void Stop() { - Stop(stream_); + private: + gpuEvent_t start_; + gpuEvent_t stop_; +}; + +class CalculatedStreamTimer { + public: + CalculatedStreamTimer() + : calculated_stream_(nullptr), + start_time_(0), + end_time_(0), + is_started_(false) {} + + void Start() { + // Note(sonder): Since it is not possible to directly obtain the start time + // of the event, "gettimeofday" is used here to retrieve it. The callback is + // used to record the start time of the event. + if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP - hipStreamAddCallback(stream_, - RecordEventTimerCallback, - reinterpret_cast(&end_time_), - 0); + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamAddCallback(calculated_stream_, + RecordEventTimerCallback, + reinterpret_cast(&start_time_), + 0)); #else - cudaStreamAddCallback(stream_, - RecordEventTimerCallback, - reinterpret_cast(&end_time_), - 0); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamAddCallback(calculated_stream_, + RecordEventTimerCallback, + reinterpret_cast(&start_time_), + 0)); #endif + is_started_ = true; + } } - void SetStream(gpuStream_t stream) { stream_ = stream; } - - double ElapsedTime() { - float milliseconds = 0; + void Stop() { + if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP - hipEventSynchronize(stop_); - hipEventElapsedTime(&milliseconds, start_, stop_); + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamAddCallback(calculated_stream_, + RecordEventTimerCallback, + reinterpret_cast(&end_time_), + 0)); #else - cudaEventSynchronize(stop_); - cudaEventElapsedTime(&milliseconds, start_, stop_); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamAddCallback(calculated_stream_, + RecordEventTimerCallback, + reinterpret_cast(&end_time_), + 0)); #endif - return milliseconds; + is_started_ = false; + } } double StartTime() { + if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP - hipStreamSynchronize(stream_); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream_)); #else - cudaStreamSynchronize(stream_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream_)); #endif + } return start_time_; } double EndTime() { + if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP - hipStreamSynchronize(stream_); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream_)); #else - cudaStreamSynchronize(stream_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream_)); #endif + } return end_time_; } - bool IsStart() { return is_start_; } + bool IsStarted() { return is_started_; } + + void SetStream(gpuStream_t stream) { calculated_stream_ = stream; } + + void ResetTime() { + start_time_ = 0; + end_time_ = 0; + } private: - gpuEvent_t start_; - gpuEvent_t stop_; - gpuStream_t stream_; + gpuStream_t calculated_stream_; double start_time_; double end_time_; - bool is_start_; + bool is_started_; }; } // namespace phi diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index eeb4f6ef51250..08a048c0bb68e 100644 --- 
a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -44,7 +44,7 @@ def parse_args(): return args -def process_job_log(log_data, device_id, start_step): +def process_job_log(log_data, device_id): log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)' matches = re.findall(log_pattern, log_data) events = [] @@ -66,34 +66,32 @@ def process_job_log(log_data, device_id, start_step): step_start_time = start_time step_end_time = end_time - if len(step_times) >= start_step: - event_start = { - "name": job_type + "_" + str(job_id), - "cat": job_type, - "ph": "B", - "ts": start_time, - "pid": 0, - "tid": "GPU" + str(device_id), - } - event_end = { - "name": job_type + "_" + str(job_id), - "cat": job_type, - "ph": "E", - "pid": 0, - "ts": end_time, - "tid": "GPU" + str(device_id), - } - if job_type in color_map: - event_start["cname"] = color_map[job_type] - event_end["cname"] = color_map[job_type] + event_start = { + "name": job_type + "_" + str(job_id), + "cat": job_type, + "ph": "B", + "ts": start_time, + "pid": 0, + "tid": "GPU" + str(device_id), + } + event_end = { + "name": job_type + "_" + str(job_id), + "cat": job_type, + "ph": "E", + "pid": 0, + "ts": end_time, + "tid": "GPU" + str(device_id), + } + if job_type in color_map: + event_start["cname"] = color_map[job_type] + event_end["cname"] = color_map[job_type] - events.append(event_start) - events.append(event_end) + events.append(event_start) + events.append(event_end) last_end_time = end_time step_times.append([step_start_time, step_end_time]) - step_times = step_times[start_step:] return events, step_times @@ -118,7 +116,7 @@ def main(): int(start_step_match[0]) if len(start_step_match) > 0 else 0 ) - events, step_times = process_job_log(log_data, device_id, start_step) + events, step_times = process_job_log(log_data, device_id) all_events.extend(events) for i, info in enumerate(step_times): if len(step_infos) <= i: From ffc7b3974f7e3471567e0dca7620c71084974468 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 02:43:30 +0000 Subject: [PATCH 31/52] fix --- .../fluid/framework/new_executor/standalone_executor.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index f9667c18fd659..d1fbc030e6abe 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -269,16 +269,18 @@ paddle::framework::FetchList StandaloneExecutor::Run( void StandaloneExecutor::SetEnableAutoParallelProfiler( bool enable_auto_parallel_profiler) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t calculated_stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream)); -#else +#if defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); +#elif defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(calculated_stream)); #endif enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; +#endif } } // namespace framework From 1925dd7a3d9ab4713a0a697f5263b6079dfe3e5a Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 05:23:12 +0000 Subject: 
[PATCH 32/52] fix --- paddle/fluid/framework/new_executor/standalone_executor.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index d1fbc030e6abe..eb879e3970919 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -274,10 +274,10 @@ void StandaloneExecutor::SetEnableAutoParallelProfiler( dynamic_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); -#elif defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(calculated_stream)); +#elif defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); #endif enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; #endif From d299723c87da29b8f81ba43f4ef9c2d8ec34cca4 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 06:26:47 +0000 Subject: [PATCH 33/52] don't use set_enable_auto_parallel_profiler --- python/paddle/base/executor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index ea4a95e8b1064..c9e2f3d6aa4db 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -1895,10 +1895,6 @@ def _run_impl( else: tensor._copy_from(cpu_tensor, self.place) - new_exe.set_enable_auto_parallel_profiler( - self.enable_auto_parallel_profiler - ) - ret = new_exe.run(list(feed.keys()), return_numpy) set_flags(stored_flag) return ret From 5297b7aed0cc07ba6ce1c06c8cef400367840ffe Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 06:30:25 +0000 Subject: [PATCH 34/52] fix bug --- python/paddle/base/executor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index c9e2f3d6aa4db..65c1c4403416d 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -1895,6 +1895,11 @@ def _run_impl( else: tensor._copy_from(cpu_tensor, self.place) + if self.enable_auto_parallel_profiler: + new_exe.set_enable_auto_parallel_profiler( + self.enable_auto_parallel_profiler + ) + ret = new_exe.run(list(feed.keys()), return_numpy) set_flags(stored_flag) return ret From 8bfb6c07b94e04c5d44e99707b7240fde621f46b Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 06:51:36 +0000 Subject: [PATCH 35/52] disable auto_parallel_profiler when not open flag by command line --- python/paddle/base/executor.py | 4 ++-- python/paddle/distributed/auto_parallel/static/engine.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index 65c1c4403416d..72c58eb6dca01 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -1184,7 +1184,7 @@ def __init__(self, place=None): self.op_role_key = core.op_proto_and_checker_maker.kOpRoleAttrName() - self.enable_auto_parallel_profiler = False + self.enable_auto_parallel_profiler = None def _is_optimizer_op(self, op): return self.op_role_key in op.attr_names and int( @@ -1895,7 +1895,7 @@ def _run_impl( else: tensor._copy_from(cpu_tensor, self.place) - if self.enable_auto_parallel_profiler: + if self.enable_auto_parallel_profiler is not None: new_exe.set_enable_auto_parallel_profiler( 
self.enable_auto_parallel_profiler ) diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 972f435c69462..a88ccac384639 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -1485,9 +1485,10 @@ def run(self, data=None, feed=None, fetch_list=None, mode=None): ): self._prepare_reader() - self._executor.enable_auto_parallel_profiler = ( - self.enable_auto_parallel_profiler - ) + if self._strategy.pipeline.auto_parallel_profiler: + self._executor.enable_auto_parallel_profiler = ( + self.enable_auto_parallel_profiler + ) outs = self._executor.run( self.main_program, From 13b14d1755b75eef068791e1509d934969038f2c Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 13:55:15 +0000 Subject: [PATCH 36/52] fix bug --- python/paddle/distributed/auto_parallel/static/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index a88ccac384639..0052e3dc075fb 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -1485,7 +1485,7 @@ def run(self, data=None, feed=None, fetch_list=None, mode=None): ): self._prepare_reader() - if self._strategy.pipeline.auto_parallel_profiler: + if self._strategy.pipeline.schedule_profiler: self._executor.enable_auto_parallel_profiler = ( self.enable_auto_parallel_profiler ) From 5bb55e16b6bff78ff03c18f9c7de04ebe4bf351e Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 10 Nov 2023 03:17:02 +0000 Subject: [PATCH 37/52] remove resettime --- paddle/phi/kernels/autotune/gpu_timer.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 482df368af8a2..5e214482652cc 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -172,11 +172,6 @@ class CalculatedStreamTimer { void SetStream(gpuStream_t stream) { calculated_stream_ = stream; } - void ResetTime() { - start_time_ = 0; - end_time_ = 0; - } - private: gpuStream_t calculated_stream_; double start_time_; From f422b339ac2b1e7a49f36220ea428ece2c4b9323 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 13 Nov 2023 01:57:56 +0000 Subject: [PATCH 38/52] fix build bug --- paddle/fluid/framework/new_executor/program_interpreter.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 93206d1f257c5..9ff8cab1845e8 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -652,7 +652,6 @@ std::tuple ProgramInterpreter::InterpreterRunTime() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) start_time = calculated_stream_timer_.StartTime(); end_time = calculated_stream_timer_.EndTime(); - calculated_stream_timer_.ResetTime(); #endif return std::make_tuple(start_time, end_time); } From ed5f7fca663a0caf303ebce82700086efa09edbe Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 13 Nov 2023 09:59:02 +0000 Subject: [PATCH 39/52] fix --- .../new_executor/interpreter_base_impl.h | 2 +- .../framework/new_executor/interpretercore.cc | 8 +--- .../framework/new_executor/interpretercore.h | 2 +- 
.../framework/new_executor/pir_interpreter.cc | 2 +- .../framework/new_executor/pir_interpreter.h | 2 +- .../new_executor/program_interpreter.cc | 46 ++++++++----------- .../new_executor/program_interpreter.h | 8 ++-- .../new_executor/standalone_executor.cc | 16 +++---- .../new_executor/standalone_executor.h | 4 +- paddle/fluid/pybind/pybind.cc | 6 +-- paddle/phi/kernels/autotune/gpu_timer.h | 18 +++++++- python/paddle/base/executor.py | 14 +++--- .../auto_parallel/static/engine.py | 6 +-- 13 files changed, 69 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 9e441f9c9bdf2..d23246a784cc8 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -73,7 +73,7 @@ class InterpreterBaseImpl { virtual paddle::framework::FetchList Run( const std::vector& feed_names, bool need_fetch = true, - bool enable_auto_parallel_profiler = false) = 0; + bool enable_job_schedule_profiler = false) = 0; virtual void ShareWorkQueueFrom(InterpreterBaseImpl* src) = 0; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 7423d43ac4502..4f929a709345a 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -34,10 +34,6 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); -PADDLE_DEFINE_EXPORTED_bool(auto_parallel_profiler, - false, - "Enable auto parallel profiler, collecting the " - "runtime of jobs in different devices"); namespace paddle { namespace framework { @@ -76,8 +72,8 @@ FetchList InterpreterCore::Run( FetchList InterpreterCore::Run(const std::vector& feed_names, bool need_fetch, - bool enable_auto_parallel_profiler) { - return impl_->Run(feed_names, need_fetch, enable_auto_parallel_profiler); + bool enable_job_schedule_profiler) { + return impl_->Run(feed_names, need_fetch, enable_job_schedule_profiler); } void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr src) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 8fbec0e296786..77ad1b8cbc361 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -52,7 +52,7 @@ class InterpreterCore { paddle::framework::FetchList Run(const std::vector& feed_names, bool need_fetch = true, - bool enable_auto_parallel_profiler = false); + bool enable_job_schedule_profiler = false); void ShareWorkQueueFrom(std::shared_ptr src); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index ae38858139e92..ec0b4aa78920c 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1188,7 +1188,7 @@ paddle::framework::FetchList PirInterpreter::Run( FetchList PirInterpreter::Run(const std::vector& feed_names, bool need_fetch, - bool enable_auto_parallel_profiler) { + bool enable_job_schedule_profiler) { SetDeviceId(place_); CheckCUDAGraphBeforeRun(feed_names); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h index 6ff147002df12..586a750cbb08e 100644 --- 
a/paddle/fluid/framework/new_executor/pir_interpreter.h +++ b/paddle/fluid/framework/new_executor/pir_interpreter.h @@ -57,7 +57,7 @@ class PirInterpreter : public InterpreterBaseImpl { paddle::framework::FetchList Run( const std::vector& feed_names, bool need_fetch = true, - bool enable_auto_parallel_profiler = false) override; + bool enable_job_schedule_profiler = false) override; void ShareWorkQueueFrom(InterpreterBaseImpl* src) override; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 9ff8cab1845e8..89fb4da41949a 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -90,6 +90,10 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, }; PrepareForCUDAGraphCapture(); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + calculate_stream_timer_ = phi::CalculateStreamTimer(place_); +#endif } ProgramInterpreter::~ProgramInterpreter() { @@ -136,18 +140,8 @@ void ProgramInterpreter::RunImpl() { FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool need_fetch, - bool enable_auto_parallel_profiler) { - enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; - - if (enable_auto_parallel_profiler_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - gpuStream_t calculated_stream = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); - calculated_stream_timer_.SetStream(calculated_stream); -#endif - } + bool enable_job_schedule_profiler) { + enable_job_schedule_profiler_ = enable_job_schedule_profiler; std::vector op_func_nodes; Build(feed_names, &op_func_nodes); @@ -650,8 +644,8 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time, end_time; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - start_time = calculated_stream_timer_.StartTime(); - end_time = calculated_stream_timer_.EndTime(); + start_time = calculate_stream_timer_.StartTime(); + end_time = calculate_stream_timer_.EndTime(); #endif return std::make_tuple(start_time, end_time); } @@ -1063,15 +1057,15 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); - if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!interpreter::IsCommunicationOp(instr_node) && - !calculated_stream_timer_.IsStarted()) { + if (enable_job_schedule_profiler_) { + if (!calculate_stream_timer_.IsStarted() && + !interpreter::IsCommunicationOp(instr_node)) { VLOG(3) << "Start calculated stream timer from op: " << op->Type(); - calculated_stream_timer_.Start(); + calculate_stream_timer_.Start(); } -#endif } +#endif if (!instr_node.IsArtificial()) { RunOperator(instr_node); @@ -1126,12 +1120,12 @@ void ProgramInterpreter::ExecuteInstructionList( exception_holder_.Clear(); - if (enable_auto_parallel_profiler_) { + if (enable_job_schedule_profiler_) { for (int i = vec_instr.size() - 1; i >= 0; --i) { auto& instr_node = vec_instr[i]; if (!interpreter::IsCommunicationOp(instr_node)) { VLOG(3) << "Last calculated op type: " << instr_node.OpBase()->Type(); - last_calculated_instr_id = i; + last_calculate_instr_id_ = i; break; } } @@ -1248,16 +1242,16 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { RunInstruction(instr_node); - if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) 
|| defined(PADDLE_WITH_HIP) - if (instr_id == last_calculated_instr_id && - calculated_stream_timer_.IsStarted()) { + if (enable_job_schedule_profiler_) { + if (instr_id == last_calculate_instr_id_ && + calculate_stream_timer_.IsStarted()) { VLOG(3) << "Stop calculated stream timer from op: " << instr_node.OpBase()->Type(); - calculated_stream_timer_.Stop(); + calculate_stream_timer_.Stop(); } -#endif } +#endif if (UNLIKELY(exception_holder_.IsCaught())) { VLOG(4) << "Exception caught"; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index ca4d1c0d21157..a6fa780dc1bda 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -53,7 +53,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { paddle::framework::FetchList Run( const std::vector& feed_names, bool need_fetch = true, - bool enable_auto_parallel_profiler = false) override; + bool enable_job_schedule_profiler = false) override; void Build( const std::vector& feed_names, @@ -221,10 +221,10 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - phi::CalculatedStreamTimer calculated_stream_timer_; + phi::CalculateStreamTimer calculate_stream_timer_; #endif - size_t last_calculated_instr_id; - bool enable_auto_parallel_profiler_; + size_t last_calculate_instr_id_; + bool enable_job_schedule_profiler_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index eb879e3970919..4c0a6ee380c25 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -49,7 +49,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, ss << scope << ", "; } VLOG(6) << ss.str(); - enable_auto_parallel_profiler_ = false; + enable_job_schedule_profiler_ = false; const auto& jobs = plan_.JobList(); for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { @@ -217,20 +217,20 @@ paddle::framework::FetchList StandaloneExecutor::Run( const std::vector tmp_feed_names = {}; interpretercores_[job_idx]->Run(tmp_feed_names, /*need_fetch = */ false, - /*enable_auto_parallel_profiler = */ - enable_auto_parallel_profiler_); + /*enable_job_schedule_profiler = */ + enable_job_schedule_profiler_); } else { interpretercores_[job_idx]->Run(feed_names, /*need_fetch = */ false, - /*enable_auto_parallel_profiler = */ - enable_auto_parallel_profiler_); + /*enable_job_schedule_profiler = */ + enable_job_schedule_profiler_); } } } // record each job's run time #if defined(PADDLE_WITH_CUDA) - if (enable_auto_parallel_profiler_) { + if (enable_job_schedule_profiler_) { for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); @@ -268,7 +268,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( } void StandaloneExecutor::SetEnableAutoParallelProfiler( - bool enable_auto_parallel_profiler) { + bool enable_job_schedule_profiler) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t calculated_stream = dynamic_cast( @@ -279,7 +279,7 @@ void StandaloneExecutor::SetEnableAutoParallelProfiler( #elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); #endif - enable_auto_parallel_profiler_ = 
enable_auto_parallel_profiler; + enable_job_schedule_profiler_ = enable_job_schedule_profiler; #endif } diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 01c3a197f298c..8624b7c93b41c 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -41,7 +41,7 @@ class StandaloneExecutor { paddle::framework::FetchList Run(const std::vector& feed_names); - void SetEnableAutoParallelProfiler(bool enable_auto_parallel_profiler); + void SetEnableAutoParallelProfiler(bool enable_job_schedule_profiler); private: bool is_interpretercore_build_result_shared_{false}; @@ -58,7 +58,7 @@ class StandaloneExecutor { std::vector>> vec_force_events_to_wait_; - bool enable_auto_parallel_profiler_; + bool enable_job_schedule_profiler_; }; } // namespace framework diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a8e5e4015af6e..e0b615442cbb2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2021,9 +2021,9 @@ All parameter, weight, gradient are variables in Paddle. return py::cast(std::move(ret)); }) - .def("set_enable_auto_parallel_profiler", - [](StandaloneExecutor &self, bool enable_auto_parallel_profiler) { - self.SetEnableAutoParallelProfiler(enable_auto_parallel_profiler); + .def("set_enable_job_schedule_profiler", + [](StandaloneExecutor &self, bool enable_job_schedule_profiler) { + self.SetEnableAutoParallelProfiler(enable_job_schedule_profiler); }); py::class_( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + } + void Start() { // Note(sonder): Since it is not possible to directly obtain the start time // of the event, "gettimeofday" is used here to retrieve it. 
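The program_interpreter.cc hunks above reduce to a simple bracketing rule: the calculate-stream timer starts at the first non-communication instruction and stops after the instruction whose index was recorded as the last non-communication one, so communication ops at the edges of a job never widen the measured span. A pure-Python sketch of that rule (the op names and the is_comm predicate are illustrative stand-ins for interpreter::IsCommunicationOp):

    def calculation_span(op_types, is_comm):
        # Mirror of the interpreter logic: Start() fires at the first
        # non-communication op; Stop() fires after the op whose index equals
        # last_calculate_instr_id_, i.e. the last non-communication op.
        first = next((i for i, t in enumerate(op_types) if not is_comm(t)), None)
        last = next(
            (i for i in range(len(op_types) - 1, -1, -1) if not is_comm(op_types[i])),
            None,
        )
        return None if first is None else (first, last)

    ops = ["recv_v2", "matmul_v2", "elementwise_add", "c_allreduce_sum", "matmul_v2", "send_v2"]
    print(calculation_span(ops, lambda t: t.startswith(("recv_", "send_", "c_"))))  # (1, 4)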
The callback is diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index 72c58eb6dca01..c7f04df8f12a4 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -823,9 +823,9 @@ def run(self, feed_names, return_numpy=True): ) return tensors - def set_enable_auto_parallel_profiler(self, enable_auto_parallel_profiler): - self._new_exe.set_enable_auto_parallel_profiler( - enable_auto_parallel_profiler + def set_enable_job_schedule_profiler(self, enable_job_schedule_profiler): + self._new_exe.set_enable_job_schedule_profiler( + enable_job_schedule_profiler ) def _create_new_executor(self): @@ -1184,7 +1184,7 @@ def __init__(self, place=None): self.op_role_key = core.op_proto_and_checker_maker.kOpRoleAttrName() - self.enable_auto_parallel_profiler = None + self.enable_job_schedule_profiler = None def _is_optimizer_op(self, op): return self.op_role_key in op.attr_names and int( @@ -1895,9 +1895,9 @@ def _run_impl( else: tensor._copy_from(cpu_tensor, self.place) - if self.enable_auto_parallel_profiler is not None: - new_exe.set_enable_auto_parallel_profiler( - self.enable_auto_parallel_profiler + if self.enable_job_schedule_profiler is not None: + new_exe.set_enable_job_schedule_profiler( + self.enable_job_schedule_profiler ) ret = new_exe.run(list(feed.keys()), return_numpy) diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 0052e3dc075fb..452ed114b2230 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -251,7 +251,7 @@ def __init__( paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1}) paddle.framework.set_flags({'FLAGS_new_executor_static_build': 1}) - self.enable_auto_parallel_profiler = False + self.enable_job_schedule_profiler = False def _prepare_data_spec(self, data, split, batch_size): inputs_spec = [] @@ -1486,8 +1486,8 @@ def run(self, data=None, feed=None, fetch_list=None, mode=None): self._prepare_reader() if self._strategy.pipeline.schedule_profiler: - self._executor.enable_auto_parallel_profiler = ( - self.enable_auto_parallel_profiler + self._executor.enable_job_schedule_profiler = ( + self.enable_job_schedule_profiler ) outs = self._executor.run( From 718cf1779daaed80df2fbbdd391a9dcfd616915e Mon Sep 17 00:00:00 2001 From: andsonder Date: Tue, 14 Nov 2023 11:01:05 +0000 Subject: [PATCH 40/52] remove set enable --- .../new_executor/standalone_executor.cc | 25 ++++--------------- .../new_executor/standalone_executor.h | 8 +++--- paddle/fluid/pybind/pybind.cc | 15 +++++------ python/paddle/base/executor.py | 9 ++++--- 4 files changed, 19 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 4c0a6ee380c25..1037f3da0e5c9 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -157,7 +157,8 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, } paddle::framework::FetchList StandaloneExecutor::Run( - const std::vector& feed_names) { + const std::vector& feed_names, + const bool enable_job_schedule_profiler) { platform::RecordEvent record_event( "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); @@ -218,19 +219,19 @@ paddle::framework::FetchList StandaloneExecutor::Run( interpretercores_[job_idx]->Run(tmp_feed_names, 
/*need_fetch = */ false, /*enable_job_schedule_profiler = */ - enable_job_schedule_profiler_); + enable_job_schedule_profiler); } else { interpretercores_[job_idx]->Run(feed_names, /*need_fetch = */ false, /*enable_job_schedule_profiler = */ - enable_job_schedule_profiler_); + enable_job_schedule_profiler); } } } // record each job's run time #if defined(PADDLE_WITH_CUDA) - if (enable_job_schedule_profiler_) { + if (enable_job_schedule_profiler) { for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); @@ -267,21 +268,5 @@ paddle::framework::FetchList StandaloneExecutor::Run( } } -void StandaloneExecutor::SetEnableAutoParallelProfiler( - bool enable_job_schedule_profiler) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - gpuStream_t calculated_stream = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); -#if defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(calculated_stream)); -#elif defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); -#endif - enable_job_schedule_profiler_ = enable_job_schedule_profiler; -#endif -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 8624b7c93b41c..621cbf9335dfc 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -39,9 +39,9 @@ class StandaloneExecutor { ~StandaloneExecutor() {} - paddle::framework::FetchList Run(const std::vector& feed_names); - - void SetEnableAutoParallelProfiler(bool enable_job_schedule_profiler); + paddle::framework::FetchList Run( + const std::vector& feed_names, + const bool enable_job_schedule_profiler = false); private: bool is_interpretercore_build_result_shared_{false}; @@ -57,8 +57,6 @@ class StandaloneExecutor { std::vector>> vec_force_events_to_wait_; - - bool enable_job_schedule_profiler_; }; } // namespace framework diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e0b615442cbb2..2951a0ba8bf65 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2012,22 +2012,19 @@ All parameter, weight, gradient are variables in Paddle. 
const interpreter::Plan &, Scope *>()) .def("run", - [](StandaloneExecutor &self, std::vector feed_names) { + [](StandaloneExecutor &self, + std::vector feed_names, + constbool enable_job_schedule_profiler = false) { paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; - ret = self.Run(feed_names); + ret = self.Run(feed_names, enable_job_schedule_profiler); } return py::cast(std::move(ret)); }) - .def("set_enable_job_schedule_profiler", - [](StandaloneExecutor &self, bool enable_job_schedule_profiler) { - self.SetEnableAutoParallelProfiler(enable_job_schedule_profiler); - }); - - py::class_>(m, "Job") + py::class_>(m, "Job") .def(py::init(), py::arg("type")) .def("micro_batch_id", &framework::interpreter::Job::MicroBatchId) .def("type", &framework::interpreter::Job::Type) diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index c7f04df8f12a4..317b92218832e 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -797,6 +797,7 @@ def __init__(self, place, plan, scope): self._plan = plan self._scope = scope self._new_exe = self._create_new_executor() + self._enable_job_schedule_profiler = False def run(self, feed_names, return_numpy=True): """ @@ -808,7 +809,9 @@ def run(self, feed_names, return_numpy=True): (the Tensor specified in the fetch list) to numpy.ndarray. if it is False, the type of the return value is a list of :code:`LoDTensor`. The default is True. """ - tensors = self._new_exe.run(feed_names)._move_to_list() + tensors = self._new_exe.run( + feed_names, self._enable_job_schedule_profiler + )._move_to_list() if return_numpy: tensors = as_numpy(tensors, copy=True) if not get_flags("FLAGS_enable_pir_in_executor")[ @@ -824,9 +827,7 @@ def run(self, feed_names, return_numpy=True): return tensors def set_enable_job_schedule_profiler(self, enable_job_schedule_profiler): - self._new_exe.set_enable_job_schedule_profiler( - enable_job_schedule_profiler - ) + self._enable_job_schedule_profiler = enable_job_schedule_profiler def _create_new_executor(self): new_exe = core.StandaloneExecutor(self._place, self._plan, self._scope) From f36b57b61916f9ee8c98f98f28ee775835b5e926 Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 03:15:58 +0000 Subject: [PATCH 41/52] fix build error --- paddle/fluid/framework/new_executor/program_interpreter.cc | 2 +- paddle/fluid/framework/new_executor/standalone_executor.cc | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 89fb4da41949a..ced72c782aace 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -642,7 +642,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { } std::tuple ProgramInterpreter::InterpreterRunTime() { - double start_time, end_time; + double start_time = 0, end_time = 0; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) start_time = calculate_stream_timer_.StartTime(); end_time = calculate_stream_timer_.EndTime(); diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 1037f3da0e5c9..5dac646b82dcb 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -49,7 +49,6 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, ss << scope 
<< ", "; } VLOG(6) << ss.str(); - enable_job_schedule_profiler_ = false; const auto& jobs = plan_.JobList(); for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { From 444b7a7c7ab1439c625ec61093550d736e176cbb Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 05:44:39 +0000 Subject: [PATCH 42/52] fix build error --- paddle/fluid/pybind/pybind.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2951a0ba8bf65..336c00b6ab4f2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2014,7 +2014,7 @@ All parameter, weight, gradient are variables in Paddle. .def("run", [](StandaloneExecutor &self, std::vector feed_names, - constbool enable_job_schedule_profiler = false) { + const bool enable_job_schedule_profiler = false) { paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; From f49491674829cbfba45a761eaf20af9e7ae738f5 Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 05:46:26 +0000 Subject: [PATCH 43/52] fix build error --- paddle/fluid/pybind/pybind.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 336c00b6ab4f2..1cb0bb718aea0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2021,10 +2021,10 @@ All parameter, weight, gradient are variables in Paddle. ret = self.Run(feed_names, enable_job_schedule_profiler); } return py::cast(std::move(ret)); - }) + }); - py::class_>(m, "Job") + py::class_>(m, "Job") .def(py::init(), py::arg("type")) .def("micro_batch_id", &framework::interpreter::Job::MicroBatchId) .def("type", &framework::interpreter::Job::Type) From a2b598841e8c7e0c5b6712ee4ea2f3c474792d95 Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 14:44:26 +0000 Subject: [PATCH 44/52] fix ci error --- paddle/phi/kernels/autotune/gpu_timer.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index e93315495f983..bfe421d380eff 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -113,10 +113,12 @@ class CalculateStreamTimer { start_time_(0), end_time_(0), is_started_(false) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) calculated_stream_ = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); +#endif } void Start() { From fb748d9f512dc34eea5f13325a5f2396f385b0cf Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 16:25:45 +0000 Subject: [PATCH 45/52] fix --- .../fluid/framework/new_executor/program_interpreter.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index ced72c782aace..7e15497a4263b 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -91,7 +91,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, PrepareForCUDAGraphCapture(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) calculate_stream_timer_ = phi::CalculateStreamTimer(place_); #endif } @@ -643,7 +643,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time = 0, end_time = 0; 
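The (start_time, end_time) pair returned by InterpreterRunTime() above is what ultimately becomes a matched "B"/"E" event pair in the chrome-trace output that profiler_helper_static.py assembles earlier in this series. A small sketch of that mapping; the helper builds the dictionaries inline, so the function name and the sample values below are illustrative only:

    def job_to_trace_events(job_type, job_id, start_time, end_time, device_id, cname=None):
        # Same fields as the events built in process_job_log: one "B" (begin)
        # and one "E" (end) event sharing name/cat/pid/tid, with the recorded
        # timestamps passed through unchanged.
        base = {
            "name": f"{job_type}_{job_id}",
            "cat": job_type,
            "pid": 0,
            "tid": "GPU" + str(device_id),
        }
        begin, end = dict(base, ph="B", ts=start_time), dict(base, ph="E", ts=end_time)
        if cname is not None:  # optional color, as with the helper's color_map lookup
            begin["cname"] = end["cname"] = cname
        return begin, end

    print(job_to_trace_events("forward", 0, 1700000000.10, 1700000000.25, device_id=3))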
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) start_time = calculate_stream_timer_.StartTime(); end_time = calculate_stream_timer_.EndTime(); #endif @@ -1057,7 +1057,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) if (enable_job_schedule_profiler_) { if (!calculate_stream_timer_.IsStarted() && !interpreter::IsCommunicationOp(instr_node)) { @@ -1242,7 +1242,7 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { RunInstruction(instr_node); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) if (enable_job_schedule_profiler_) { if (instr_id == last_calculate_instr_id_ && calculate_stream_timer_.IsStarted()) { From aa5570d950ab8d87d58fa52420cd36edfa6ff092 Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 19:43:46 +0000 Subject: [PATCH 46/52] fix run error --- paddle/phi/kernels/autotune/gpu_timer.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index bfe421d380eff..f8f4b6bf849cd 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -108,17 +108,15 @@ class CalculateStreamTimer { end_time_(0), is_started_(false) {} - explicit CalculateStreamTimer(const paddle::platform::Place place) + explicit CalculateStreamTimer(const paddle::platform::Place &place) : calculated_stream_(nullptr), start_time_(0), end_time_(0), is_started_(false) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) calculated_stream_ = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); -#endif } void Start() { From 6b18e10b5eabcc7b41d2170bc2d8f33595a25a0a Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 21:11:06 +0000 Subject: [PATCH 47/52] fix --- .../framework/new_executor/program_interpreter.cc | 6 +++++- paddle/phi/kernels/autotune/gpu_timer.h | 11 ----------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 7e15497a4263b..bdd9a6b452e11 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -92,7 +92,11 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, PrepareForCUDAGraphCapture(); #if defined(PADDLE_WITH_CUDA) - calculate_stream_timer_ = phi::CalculateStreamTimer(place_); + gpuStream_t calculated_stream = + dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + calculate_stream_timer_.SetStream(calculated_stream); #endif } diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index f8f4b6bf849cd..338920a1ae363 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -108,17 +108,6 @@ class CalculateStreamTimer { end_time_(0), is_started_(false) {} - explicit CalculateStreamTimer(const paddle::platform::Place &place) - : calculated_stream_(nullptr), - start_time_(0), - end_time_(0), - is_started_(false) { - calculated_stream_ = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - } - void Start() { // Note(sonder): Since it is not possible to directly 
obtain the start time // of the event, "gettimeofday" is used here to retrieve it. The callback is From f096253f7bc47a2c62ce343ae85c2755f6c81965 Mon Sep 17 00:00:00 2001 From: andsonder Date: Thu, 16 Nov 2023 00:55:05 +0000 Subject: [PATCH 48/52] fix --- .../new_executor/program_interpreter.cc | 18 ++++++++------- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/base/executor.py | 23 ++++++++----------- .../auto_parallel/static/engine.py | 7 +++--- 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index bdd9a6b452e11..55763d7f021b7 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -90,14 +90,6 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, }; PrepareForCUDAGraphCapture(); - -#if defined(PADDLE_WITH_CUDA) - gpuStream_t calculated_stream = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - calculate_stream_timer_.SetStream(calculated_stream); -#endif } ProgramInterpreter::~ProgramInterpreter() { @@ -147,6 +139,16 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool enable_job_schedule_profiler) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; +#if defined(PADDLE_WITH_CUDA) + if (enable_job_schedule_profiler_) { + gpuStream_t calculated_stream = + dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + calculate_stream_timer_.SetStream(calculated_stream); + } +#endif + std::vector op_func_nodes; Build(feed_names, &op_func_nodes); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d7c98ec112481..e9877b5325357 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2014,7 +2014,7 @@ All parameter, weight, gradient are variables in Paddle. .def("run", [](StandaloneExecutor &self, std::vector feed_names, - const bool enable_job_schedule_profiler = false) { + bool enable_job_schedule_profiler = false) { paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index ed6a3cb03ab86..be8dda2b0bf08 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -807,9 +807,10 @@ def __init__(self, place, plan, scope): self._plan = plan self._scope = scope self._new_exe = self._create_new_executor() - self._enable_job_schedule_profiler = False - def run(self, feed_names, return_numpy=True): + def run( + self, feed_names, return_numpy=True, enable_job_schedule_profiler=False + ): """ Args: feed_names(list): This parameter represents the input names of the model. @@ -820,7 +821,7 @@ def run(self, feed_names, return_numpy=True): the type of the return value is a list of :code:`LoDTensor`. The default is True. 
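Once the renamed flag reaches the C++ executor, each job's span ends up in the worker log, and profiler_helper_static.py recovers it with the log_pattern regex shown earlier in this series. A minimal parsing sketch; the literal log text below is an assumption reconstructed from that regex, since the VLOG call that prints it is not part of this excerpt:

    import re

    log_pattern = (
        r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), '
        r'job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)'
    )
    sample = (
        "I1117 10:00:00.000000 12345 standalone_executor.cc] "
        "Profiler Info: Job (3), type = backward, micro_batch_id = 1, "
        "job_start_time = 1700000000.123456, job_end_time = 1700000000.234567"
    )
    for job_id, job_type, micro_batch_id, start, end in re.findall(log_pattern, sample):
        print(job_id, job_type, micro_batch_id, float(end) - float(start))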
""" tensors = self._new_exe.run( - feed_names, self._enable_job_schedule_profiler + feed_names, enable_job_schedule_profiler )._move_to_list() if return_numpy: tensors = as_numpy(tensors, copy=True) @@ -836,9 +837,6 @@ def run(self, feed_names, return_numpy=True): ) return tensors - def set_enable_job_schedule_profiler(self, enable_job_schedule_profiler): - self._enable_job_schedule_profiler = enable_job_schedule_profiler - def _create_new_executor(self): new_exe = core.StandaloneExecutor(self._place, self._plan, self._scope) @@ -1207,7 +1205,7 @@ def __init__(self, place=None): self.op_role_key = core.op_proto_and_checker_maker.kOpRoleAttrName() - self.enable_job_schedule_profiler = None + self.enable_job_schedule_profiler = False def _is_optimizer_op(self, op): return self.op_role_key in op.attr_names and int( @@ -1929,12 +1927,11 @@ def _run_impl( else: tensor._copy_from(cpu_tensor, self.place) - if self.enable_job_schedule_profiler is not None: - new_exe.set_enable_job_schedule_profiler( - self.enable_job_schedule_profiler - ) - - ret = new_exe.run(list(feed.keys()), return_numpy) + ret = new_exe.run( + list(feed.keys()), + return_numpy, + self.enable_job_schedule_profiler, + ) set_flags(stored_flag) return ret diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 16bc6b74e432e..93f1d477baf5c 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -1495,10 +1495,9 @@ def run(self, data=None, feed=None, fetch_list=None, mode=None): ): self._prepare_reader() - if self._strategy.pipeline.schedule_profiler: - self._executor.enable_job_schedule_profiler = ( - self.enable_job_schedule_profiler - ) + self._executor.enable_job_schedule_profiler = ( + self.enable_job_schedule_profiler + ) outs = self._executor.run( self.main_program, From 560fb61b077f56c2b498ddec8d46a2bacc871e81 Mon Sep 17 00:00:00 2001 From: andsonder Date: Thu, 16 Nov 2023 02:35:54 +0000 Subject: [PATCH 49/52] fix calculate_stream_timer logic --- .../new_executor/program_interpreter.cc | 26 +++++++------------ .../new_executor/program_interpreter.h | 2 +- paddle/phi/kernels/autotune/gpu_timer.h | 14 ++++++++++ 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 55763d7f021b7..65f865a862d6d 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -90,6 +90,10 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, }; PrepareForCUDAGraphCapture(); + +#if defined(PADDLE_WITH_CUDA) + calculate_stream_timer_ = std::make_unique(place); +#endif } ProgramInterpreter::~ProgramInterpreter() { @@ -139,16 +143,6 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool enable_job_schedule_profiler) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; -#if defined(PADDLE_WITH_CUDA) - if (enable_job_schedule_profiler_) { - gpuStream_t calculated_stream = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); - calculate_stream_timer_.SetStream(calculated_stream); - } -#endif - std::vector op_func_nodes; Build(feed_names, &op_func_nodes); @@ -650,8 +644,8 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double 
start_time = 0, end_time = 0; #if defined(PADDLE_WITH_CUDA) - start_time = calculate_stream_timer_.StartTime(); - end_time = calculate_stream_timer_.EndTime(); + start_time = calculate_stream_timer_->StartTime(); + end_time = calculate_stream_timer_->EndTime(); #endif return std::make_tuple(start_time, end_time); } @@ -1065,10 +1059,10 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { instr_node.WaitEvent(place_); #if defined(PADDLE_WITH_CUDA) if (enable_job_schedule_profiler_) { - if (!calculate_stream_timer_.IsStarted() && + if (!calculate_stream_timer_->IsStarted() && !interpreter::IsCommunicationOp(instr_node)) { VLOG(3) << "Start calculated stream timer from op: " << op->Type(); - calculate_stream_timer_.Start(); + calculate_stream_timer_->Start(); } } #endif @@ -1251,10 +1245,10 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { #if defined(PADDLE_WITH_CUDA) if (enable_job_schedule_profiler_) { if (instr_id == last_calculate_instr_id_ && - calculate_stream_timer_.IsStarted()) { + calculate_stream_timer_->IsStarted()) { VLOG(3) << "Stop calculated stream timer from op: " << instr_node.OpBase()->Type(); - calculate_stream_timer_.Stop(); + calculate_stream_timer_->Stop(); } } #endif diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index a6fa780dc1bda..5b8b4dbb36a81 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -221,7 +221,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - phi::CalculateStreamTimer calculate_stream_timer_; + std::unique_ptr calculate_stream_timer_; #endif size_t last_calculate_instr_id_; bool enable_job_schedule_profiler_; diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 338920a1ae363..2d06de4f4cac3 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -108,10 +108,23 @@ class CalculateStreamTimer { end_time_(0), is_started_(false) {} + explicit CalculateStreamTimer(const paddle::platform::Place &place) + : calculated_stream_(nullptr), + start_time_(0), + end_time_(0), + is_started_(false), + place_(place) {} + void Start() { // Note(sonder): Since it is not possible to directly obtain the start time // of the event, "gettimeofday" is used here to retrieve it. The callback is // used to record the start time of the event. 
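The Note(sonder) comment above captures the constraint behind this design: a CUDA event only yields an elapsed time, while the pipeline timeline wants absolute, comparable timestamps, so the timer enqueues host callbacks on the calculation stream and reads the wall clock inside them. A deliberately simplified, CPU-only model of that idea (ToyStream and the sleeps are stand-ins, not Paddle or CUDA APIs):

    import queue
    import threading
    import time

    class ToyStream:
        """Runs enqueued callables in order on a worker thread, like a stream."""

        def __init__(self):
            self._work = queue.Queue()
            threading.Thread(target=self._run, daemon=True).start()

        def _run(self):
            while True:
                self._work.get()()

        def launch(self, fn):        # stand-in for launching a kernel
            self._work.put(fn)

        def add_callback(self, fn):  # host callback: runs after prior work drains
            self._work.put(fn)

        def synchronize(self):
            done = threading.Event()
            self._work.put(done.set)
            done.wait()

    stream, times = ToyStream(), {}
    stream.add_callback(lambda: times.setdefault("start", time.time()))  # Start()
    for _ in range(3):
        stream.launch(lambda: time.sleep(0.01))                          # "kernels"
    stream.add_callback(lambda: times.setdefault("end", time.time()))    # Stop()
    stream.synchronize()
    print("job span:", times["end"] - times["start"], "seconds")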
+ if (!is_started_) { + calculated_stream_ = + dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + } if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( @@ -180,6 +193,7 @@ class CalculateStreamTimer { double start_time_; double end_time_; bool is_started_; + const paddle::platform::Place place_; }; } // namespace phi From bbb30715ded07fe80aa12c8a2874ed20e7b38f03 Mon Sep 17 00:00:00 2001 From: andsonder Date: Fri, 17 Nov 2023 02:07:24 +0000 Subject: [PATCH 50/52] remove fluid head --- paddle/phi/kernels/autotune/gpu_timer.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 2d06de4f4cac3..e160b3567e3ac 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -14,11 +14,11 @@ #pragma once -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" @@ -108,7 +108,7 @@ class CalculateStreamTimer { end_time_(0), is_started_(false) {} - explicit CalculateStreamTimer(const paddle::platform::Place &place) + explicit CalculateStreamTimer(const phi::Place &place) : calculated_stream_(nullptr), start_time_(0), end_time_(0), @@ -120,10 +120,9 @@ class CalculateStreamTimer { // of the event, "gettimeofday" is used here to retrieve it. The callback is // used to record the start time of the event. if (!is_started_) { - calculated_stream_ = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); + calculated_stream_ = dynamic_cast( + phi::DeviceContextPool::Instance().Get(place_)) + ->stream(); } if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP @@ -193,7 +192,7 @@ class CalculateStreamTimer { double start_time_; double end_time_; bool is_started_; - const paddle::platform::Place place_; + const phi::Place place_; }; } // namespace phi From e15c19e36616da20d94e1b35f18d8c9481fa82b6 Mon Sep 17 00:00:00 2001 From: andsonder Date: Fri, 17 Nov 2023 03:32:10 +0000 Subject: [PATCH 51/52] fix build error --- paddle/phi/kernels/autotune/gpu_timer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index e160b3567e3ac..c50a571a7fd95 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" From 989348c33dc9a7912b0a345680558bcec42de497 Mon Sep 17 00:00:00 2001 From: andsonder Date: Fri, 17 Nov 2023 15:02:48 +0000 Subject: [PATCH 52/52] set default value for enable_job_schedule_profiler --- paddle/fluid/framework/new_executor/program_interpreter.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 65f865a862d6d..04d116a53a525 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ 
b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -54,7 +54,8 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place,
       block_(block),
       stream_analyzer_(place),
       execution_config_(execution_config),
-      var_scope_(scope) {
+      var_scope_(scope),
+      enable_job_schedule_profiler_(false) {
   VLOG(4) << "ProgramInterpreter(): " << this << " on " << place_;
   exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
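Taken together, the series leaves profiler_helper_static.py with one event list per device, which only needs the standard chrome-trace envelope to become a timeline viewable in chrome://tracing or Perfetto. A closing sketch of that last step; the {"traceEvents": ...} wrapper is the common Chrome trace format and an assumption here, since the script's own output code is not part of this excerpt:

    import json

    def write_chrome_trace(all_events, path="pipeline_profile.json"):
        # all_events is the concatenation of the per-device "B"/"E" events
        # gathered in main(); the trace viewer groups them by pid/tid.
        with open(path, "w") as f:
            json.dump({"traceEvents": all_events}, f)

    write_chrome_trace([
        {"name": "forward_0", "cat": "forward", "ph": "B", "ts": 100.0, "pid": 0, "tid": "GPU0"},
        {"name": "forward_0", "cat": "forward", "ph": "E", "ts": 150.0, "pid": 0, "tid": "GPU0"},
    ])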