From c514fbd803fff4ca53b11738f818d5ca9cc1eed6 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Wed, 18 Oct 2023 04:43:03 +0000 Subject: [PATCH 01/52] merge from openvino master --- .../new_executor/interpreter_base_impl.h | 1 + .../framework/new_executor/interpretercore.cc | 4 ++ .../new_executor/program_interpreter.cc | 43 +++++++++++++++++++ .../new_executor/program_interpreter.h | 4 ++ paddle/phi/kernels/autotune/gpu_timer.h | 18 ++++++++ 5 files changed, 70 insertions(+) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 369216e0078c4..fd10265312c33 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -37,6 +37,7 @@ #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" #include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/kernels/autotune/gpu_timer.h" PD_DECLARE_bool(new_executor_serial_run); PD_DECLARE_bool(new_executor_static_build); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 8e052d3b2685e..c6a9209db19dc 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -34,6 +34,10 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); +PADDLE_DEFINE_EXPORTED_bool(auto_parallel_profiler, + true, + "Enable auto parallel profiler, collecting the " + "runtime of jobs in different devices"); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index f1646f50471a4..10170fedb3815 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -39,6 +39,7 @@ #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +PHI_DECLARE_bool(auto_parallel_profiler); namespace paddle { namespace framework { @@ -103,6 +104,12 @@ ProgramInterpreter::~ProgramInterpreter() { } void ProgramInterpreter::RunImpl() { + if (FLAGS_auto_parallel_profiler) { + for (auto& timer : gpu_stream_timer_) { + timer.Start(); + } + } + // lazy initialization of gc, do not create gc is the program only run once if (!gc_) { gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); @@ -127,6 +134,11 @@ void ProgramInterpreter::RunImpl() { platform::DeviceContextPool::Instance().Get(place_)->Wait(); } #endif + if (FLAGS_auto_parallel_profiler) { + for (auto& timer : gpu_stream_timer_) { + timer.Stop(); + } + } } FetchList ProgramInterpreter::Run(const std::vector& feed_names, @@ -153,6 +165,12 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, ClearLoDTensorArrayInLocalScope(); } + if (FLAGS_auto_parallel_profiler) { + for (auto& timer : gpu_stream_timer_) { + VLOG(3) << "GPU stream event time: " << timer.ElapsedTime(); + } + } + // return Fetch Tensors Scope* inner_scope = HasLocalScope() ? 
local_scope_ : var_scope_.GetMutableScope(); @@ -622,6 +640,27 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { } } +void ProgramInterpreter::AddGpuStreamEvents() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + for (auto& instr : vec_instruction_) { + if (instr.KernelType() != OpFuncType::kGpuAsync) { + return; + } + + if (instr.DeviceContext().GetPlace().GetType() == + phi::AllocationType::CUSTOM) { + return; + } + + gpuStream_t stream = + reinterpret_cast(instr.DeviceContext()) + .stream(); + + gpu_stream_timer_.emplace_back(stream); + } +#endif +} + void ProgramInterpreter::Convert( std::vector* op_func_nodes) { auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); @@ -658,6 +697,10 @@ void ProgramInterpreter::Convert( vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); } + if (FLAGS_auto_parallel_profiler) { + AddGpuStreamEvents(); + } + BuildOperatorDependences(); // NOTE(Ruibiao): For cross-step stream synchronization, an event may be diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index bef6385c211fb..79f2be4ca5785 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -149,6 +149,8 @@ class ProgramInterpreter : public InterpreterBaseImpl { // For log and debug std::string GetDepsString() const; + void AddGpuStreamEvents(); + bool is_build_{false}; bool static_build_{false}; // Note(sonder): share the op dependency and event analysis procedure. @@ -210,6 +212,8 @@ class ProgramInterpreter : public InterpreterBaseImpl { InstructionSchedulingPriorityLess instruction_scheduling_priority_less; std::vector hookfuncs_; + + std::vector gpu_stream_timer_; }; } // namespace framework diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 87eca2613a7b5..33f8eb8291b7f 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -42,6 +42,20 @@ class GpuTimer { stop_, phi::errors::PreconditionNotMet("Stop Event is not ready.")); } + explicit GpuTimer(gpuStream_t stream) : stream_(stream) { +#ifdef PADDLE_WITH_HIP + hipEventCreate(&start_); + hipEventCreate(&stop_); +#else + cudaEventCreate(&start_); + cudaEventCreate(&stop_); +#endif + PADDLE_ENFORCE_NOT_NULL( + start_, phi::errors::PreconditionNotMet("Start Event is not ready.")); + PADDLE_ENFORCE_NOT_NULL( + stop_, phi::errors::PreconditionNotMet("Stop Event is not ready.")); + } + ~GpuTimer() { #ifdef PADDLE_WITH_HIP hipEventDestroy(start_); @@ -68,6 +82,9 @@ class GpuTimer { #endif } + void Start() { Start(stream_); } + void Stop() { Stop(stream_); } + float ElapsedTime() { float milliseconds = 0; #ifdef PADDLE_WITH_HIP @@ -83,6 +100,7 @@ class GpuTimer { private: gpuEvent_t start_; gpuEvent_t stop_; + gpuStream_t stream_; }; } // namespace phi From 0147f70b06e9eeb68d823f4b4e83c6e0940abaf2 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 20 Oct 2023 05:53:58 +0000 Subject: [PATCH 02/52] add InterpreterRunTime() to record interpreter's run time --- .../new_executor/interpreter_base_impl.h | 2 + .../framework/new_executor/interpretercore.cc | 4 ++ .../framework/new_executor/interpretercore.h | 2 + .../new_executor/new_ir_interpreter.cc | 5 ++ .../new_executor/new_ir_interpreter.h | 2 + .../new_executor/program_interpreter.cc | 65 ++++++++++++++----- .../new_executor/program_interpreter.h | 4 +- .../new_executor/standalone_executor.cc | 16 
+++++ paddle/phi/kernels/autotune/gpu_timer.h | 33 +++++----- 9 files changed, 98 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index fd10265312c33..d78988a244946 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -104,6 +104,8 @@ class InterpreterBaseImpl { std::vector* op_func_nodes) = 0; virtual bool IsStaticBuild() const = 0; + + virtual std::tuple InterpreterRunTime() = 0; }; inline void SetDeviceId(const platform::Place& place) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index c6a9209db19dc..4da225d9068d1 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -133,5 +133,9 @@ void InterpreterCore::Build( bool InterpreterCore::IsStaticBuild() const { return impl_->IsStaticBuild(); } +std::tuple InterpreterCore::InterpreterRunTime() { + return impl_->InterpreterRunTime(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index d21bd9e1fc378..cd03eb46a6707 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -79,6 +79,8 @@ class InterpreterCore { bool IsStaticBuild() const; + std::tuple InterpreterRunTime(); + private: DISABLE_COPY_AND_ASSIGN(InterpreterCore); diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 50af034414d6f..9fde156b9f11b 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -276,6 +276,11 @@ void NewIRInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) { << ") to InterpreterCore(" << this << ")"; } +std::tuple NewIRInterpreter::InterpreterRunTime() { + PADDLE_THROW(platform::errors::Unimplemented( + "NewIRInterpreter::InterpreterRunTime is not implemented.")); +} + const interpreter::NewIrDependencyBuilder& NewIRInterpreter::GetNewIrDependencyBuilder() const { return ir_dependency_builder_; diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index 3a128791cdfce..aae29829c3cf4 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -60,6 +60,8 @@ class NewIRInterpreter : public InterpreterBaseImpl { void ShareBuildResultsFrom(const InterpreterBaseImpl& src) override; + std::tuple InterpreterRunTime() override; + std::shared_ptr> GetDependencyCount() const override; bool IsSharedResultsBuild() const override; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 10170fedb3815..8889a41cac6cc 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -105,8 +105,10 @@ ProgramInterpreter::~ProgramInterpreter() { void ProgramInterpreter::RunImpl() { if (FLAGS_auto_parallel_profiler) { - for (auto& timer : gpu_stream_timer_) { - timer.Start(); + // Note(sonder): Record the start time of the each stream. 
+ for (size_t i = 0; i < stream_timers_.size(); ++i) { + auto& stream_timer = stream_timers_[i]; + stream_timer.Start(); } } @@ -135,8 +137,9 @@ void ProgramInterpreter::RunImpl() { } #endif if (FLAGS_auto_parallel_profiler) { - for (auto& timer : gpu_stream_timer_) { - timer.Stop(); + for (size_t i = 0; i < stream_timers_.size(); ++i) { + auto& stream_timer = stream_timers_[i]; + stream_timer.Stop(); } } } @@ -165,12 +168,6 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, ClearLoDTensorArrayInLocalScope(); } - if (FLAGS_auto_parallel_profiler) { - for (auto& timer : gpu_stream_timer_) { - VLOG(3) << "GPU stream event time: " << timer.ElapsedTime(); - } - } - // return Fetch Tensors Scope* inner_scope = HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope(); @@ -642,25 +639,57 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { void ProgramInterpreter::AddGpuStreamEvents() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (auto& instr : vec_instruction_) { - if (instr.KernelType() != OpFuncType::kGpuAsync) { - return; - } + stream_timers_.clear(); + std::vector streams; + bool has_default_stream = false; - if (instr.DeviceContext().GetPlace().GetType() == - phi::AllocationType::CUSTOM) { - return; + for (size_t i = 0; i < vec_instruction_.size(); ++i) { + auto& instr = vec_instruction_[i]; + if ((instr.KernelType() != OpFuncType::kGpuAsync) || + (instr.DeviceContext().GetPlace().GetType() == + phi::AllocationType::CUSTOM)) { + continue; } gpuStream_t stream = reinterpret_cast(instr.DeviceContext()) .stream(); - gpu_stream_timer_.emplace_back(stream); + if (stream != nullptr) { + has_default_stream = true; + } + } + size_t timers_size = has_default_stream ? streams.size() + 1 : streams.size(); + stream_timers_.resize(timers_size); + for (size_t i = 0; i < streams.size(); ++i) { + stream_timers_[i].SetStream(streams[i]); + } + if (has_default_stream) { + stream_timers_.back().SetStream(nullptr); } + #endif } +std::tuple ProgramInterpreter::InterpreterRunTime() { + double min_start_time = DBL_MAX, max_end_time = DBL_MIN; + for (size_t i = 0; i < stream_timers_.size(); ++i) { + auto& stream_timer = stream_timers_[i]; + double start_time = stream_timer.StartTime(); + double end_time = stream_timer.EndTime(); + + min_start_time = std::min(min_start_time, start_time); + max_end_time = std::max(max_end_time, end_time); + + VLOG(3) << "ProgramInterpreter::InterpreterRunTime:" + << "start_time: " << std::to_string(start_time) + << ", end_time: " << std::to_string(end_time) << ", min_start_time" + << std::to_string(min_start_time) + << ", max_end_time: " << std::to_string(max_end_time); + } + return std::make_tuple(min_start_time, max_end_time); +} + void ProgramInterpreter::Convert( std::vector* op_func_nodes) { auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 79f2be4ca5785..98aac21bac880 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -98,6 +98,8 @@ class ProgramInterpreter : public InterpreterBaseImpl { bool IsStaticBuild() const override { return static_build_; } + std::tuple InterpreterRunTime() override; + private: // build graph void Convert(std::vector* op_func_nodes); @@ -213,7 +215,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; - std::vector gpu_stream_timer_; + 
std::vector<phi::GpuTimer> stream_timers_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index f06bee2c884e3..5b12a70fe6c53 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -30,6 +30,7 @@
 PHI_DECLARE_bool(enable_new_ir_in_executor);
 PHI_DECLARE_bool(enable_pir_api);
 PHI_DECLARE_bool(new_ir_apply_inplace_pass);
+PHI_DECLARE_bool(auto_parallel_profiler);
 
 namespace paddle {
 namespace framework {
@@ -205,6 +206,21 @@ paddle::framework::FetchList StandaloneExecutor::Run(
     }
   }
 
+  // record each job's run time
+  if (FLAGS_auto_parallel_profiler) {
+    for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) {
+      const auto& job = jobs[job_idx];
+      const std::string& job_type = job->Type();
+      double start_time, end_time;
+      std::tie(start_time, end_time) =
+          interpretercores_[job_idx]->InterpreterRunTime();
+      VLOG(3) << "Profiler Info: Job (" << job_idx << "), type = " << job_type
+              << ", micro_batch_id = " << job->MicroBatchId()
+              << ", job_start_time = " << std::to_string(start_time)
+              << ", job_end_time = " << std::to_string(end_time);
+    }
+  }
+
   // return Fetch Tensors
   if (FLAGS_enable_new_ir_in_executor) {
     framework::FetchList fetch_res;
diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h
index 33f8eb8291b7f..e23613f94a9d0 100644
--- a/paddle/phi/kernels/autotune/gpu_timer.h
+++ b/paddle/phi/kernels/autotune/gpu_timer.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <sys/time.h>
+
 #include "paddle/phi/backends/gpu/gpu_decls.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/errors.h"
@@ -42,20 +44,6 @@ class GpuTimer {
         stop_, phi::errors::PreconditionNotMet("Stop Event is not ready."));
   }
 
-  explicit GpuTimer(gpuStream_t stream) : stream_(stream) {
-#ifdef PADDLE_WITH_HIP
-    hipEventCreate(&start_);
-    hipEventCreate(&stop_);
-#else
-    cudaEventCreate(&start_);
-    cudaEventCreate(&stop_);
-#endif
-    PADDLE_ENFORCE_NOT_NULL(
-        start_, phi::errors::PreconditionNotMet("Start Event is not ready."));
-    PADDLE_ENFORCE_NOT_NULL(
-        stop_, phi::errors::PreconditionNotMet("Stop Event is not ready."));
-  }
-
   ~GpuTimer() {
 #ifdef PADDLE_WITH_HIP
     hipEventDestroy(start_);
@@ -82,10 +70,18 @@
 #endif
   }
 
-  void Start() { Start(stream_); }
+  void Start() {
+    struct timeval time_now {};
+    gettimeofday(&time_now, nullptr);
+    start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
+    Start(stream_);
+  }
+
   void Stop() { Stop(stream_); }
 
-  float ElapsedTime() {
+  void SetStream(gpuStream_t stream) { stream_ = stream; }
+
+  double ElapsedTime() {
     float milliseconds = 0;
 #ifdef PADDLE_WITH_HIP
     hipEventSynchronize(stop_);
@@ -97,10 +93,15 @@
     return milliseconds;
   }
 
+  double StartTime() { return start_time_; }
+
+  double EndTime() { return ElapsedTime() + start_time_; }
+
  private:
   gpuEvent_t start_;
   gpuEvent_t stop_;
   gpuStream_t stream_;
+  double start_time_;
 };
 
 }  // namespace phi
From 6d1dc3d07bf6dc4c8cc475cd6380558705968f98 Mon Sep 17 00:00:00 2001
From: AndSonder
Date: Fri, 20 Oct 2023 09:50:00 +0000
Subject: [PATCH 03/52] add profiler helper static to produce json file

---
 .../new_executor/standalone_executor.cc       |  2 +-
 paddle/phi/kernels/autotune/gpu_timer.h       |  2 +-
 .../pp_utils/profiler_helper_static.py        | 85 +++++++++++++++++++
 ...est_distributed_fused_lamb_op_with_clip.py |  4 +-
 4 files changed, 89 insertions(+), 4 deletions(-)
 create mode 100644
python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 5b12a70fe6c53..ea11bd1427758 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -214,7 +214,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( double start_time, end_time; std::tie(start_time, end_time) = interpretercores_[job_idx]->InterpreterRunTime(); - VLOG(3) << "Profiler Info: Job (" << job_idx << "), type = " << job_type + VLOG(0) << "Profiler Info: Job (" << job_idx << "), type = " << job_type << ", micro_batch_id = " << job->MicroBatchId() << ", job_start_time = " << std::to_string(start_time) << ", job_end_time = " << std::to_string(end_time); diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index e23613f94a9d0..9ef228906d1d6 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -73,7 +73,7 @@ class GpuTimer { void Start() { struct timeval time_now {}; gettimeofday(&time_now, nullptr); - start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000); + start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); Start(stream_); } diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py new file mode 100644 index 0000000000000..e6055524870d0 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py @@ -0,0 +1,85 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import re +from argparse import ArgumentParser + +import paddle + + +def parse_args(): + parser = ArgumentParser() + device_count = paddle.device.cuda.device_count() + all_devices = ",".join([str(i) for i in range(device_count)]) + parser.add_argument("--devices", type=str, default=all_devices) + parser.add_argument( + "--log_dir", type=str, default="build/Testing/Temporary/" + ) + args = parser.parse_args() + return args + + +def process_log_data(log_data, device_id): + log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)' + matches = re.findall(log_pattern, log_data) + events = [] + for match in matches: + job_id, job_type, micro_batch_id, job_start_time, job_end_time = match + if job_type in ["lr"]: + continue + + event_start = { + "name": job_type[0].upper() + "_" + str(job_id), + "cat": job_type, + "ph": "B", + "ts": float(job_start_time.strip()) * 1000, + "pid": "Main", + "tid": "GPU: " + str(device_id), + } + event_end = { + "name": job_type[0].upper() + "_" + str(job_id), + "cat": job_type, + "ph": "E", + "pid": "Main", + "ts": float(job_end_time.strip()) * 1000, + "tid": "GPU: " + str(device_id), + } + events.append(event_start) + events.append(event_end) + + return events + + +def main(): + args = parse_args() + all_events = [] + for device_id in args.devices.split(","): + print(f"Process device {device_id}") + device_id = int(device_id) + log_file = os.path.join(args.log_dir, "workerlog." + str(device_id)) + with open(log_file, "r") as f: + log_data = f.read() + events = process_log_data(log_data, device_id) + all_events.extend(events) + + save_path = os.path.join(args.log_dir, "pipeline_profile.json") + with open(save_path, "w") as f: + f.write(json.dumps({"traceEvents": all_events})) + print(f"Save pipeline profile to {save_path}") + + +if __name__ == "__main__": + main() diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py index 62a94832d1ae9..118d4f4a4ffb1 100644 --- a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py +++ b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py @@ -49,14 +49,14 @@ def run_test( if os.name == 'nt': return args = locals() - log_dir = os.path.join(temp_dir.name, f'log_{os.getpid()}') + log_dir = "/home/root/Paddle/build/Testing/Temporary/" cmd = [ sys.executable, '-u', '-m', 'paddle.distributed.launch', '--devices', - '0,1', + '2,3', '--log_dir', log_dir, get_test_file(), From 6f4f67ce65b800e92b0725a4d42f5884c1e87619 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 23 Oct 2023 06:30:12 +0000 Subject: [PATCH 04/52] add color map and support perfetto format --- .../pp_utils/profiler_helper_static.py | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py index e6055524870d0..704fa0c6b9624 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py @@ -36,6 +36,12 @@ def process_log_data(log_data, device_id): log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)' matches = re.findall(log_pattern, log_data) events = [] + color_map = { + 
"forward": "thread_state_running", # RGB: 126, 200, 148 + "backward": "rail_idle", # RGB: 238, 142, 0 + "optimizer": "rail_response", # RGB: 238, 142, 0 + "default": "thread_state_unknown", # RGB: 199, 155, 125 + } for match in matches: job_id, job_type, micro_batch_id, job_start_time, job_end_time = match if job_type in ["lr"]: @@ -47,7 +53,8 @@ def process_log_data(log_data, device_id): "ph": "B", "ts": float(job_start_time.strip()) * 1000, "pid": "Main", - "tid": "GPU: " + str(device_id), + "tid": "GPU" + str(device_id), + "cname": color_map[job_type], } event_end = { "name": job_type[0].upper() + "_" + str(job_id), @@ -55,7 +62,8 @@ def process_log_data(log_data, device_id): "ph": "E", "pid": "Main", "ts": float(job_end_time.strip()) * 1000, - "tid": "GPU: " + str(device_id), + "tid": "GPU" + str(device_id), + "cname": color_map[job_type], } events.append(event_start) events.append(event_end) @@ -80,6 +88,38 @@ def main(): f.write(json.dumps({"traceEvents": all_events})) print(f"Save pipeline profile to {save_path}") + # support Perfetto format + save_path = os.path.join(args.log_dir, "pipeline_profile_perfetto.json") + all_events.extend( + [ + { + "args": {"name": "GPU"}, + "cat": "__metadata", + "name": "thread_name", + "ph": "M", + "pid": "Main", + "tid": 0, + "ts": 0, + }, + { + "args": {"name": "GPU"}, + "cat": "__metadata", + "name": "thread_name", + "ph": "M", + "pid": "Main", + "tid": 1, + "ts": 0, + }, + ] + ) + json_str = json.dumps({"traceEvents": all_events}) + for i in range(len(args.devices.split(","))): + json_str = json_str.replace(f'"GPU{i}"', f'{i}') + + with open(save_path, "w") as f: + f.write(json_str) + print(f"Save pipeline profile to {save_path}") + if __name__ == "__main__": main() From 4d51610a9cec4793d1bb953b13bbccc9dffa020d Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 23 Oct 2023 07:03:24 +0000 Subject: [PATCH 05/52] recover codes --- test/legacy_test/test_distributed_fused_lamb_op_with_clip.py | 4 ++-- third_party/mkldnn | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py index 118d4f4a4ffb1..62a94832d1ae9 100644 --- a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py +++ b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py @@ -49,14 +49,14 @@ def run_test( if os.name == 'nt': return args = locals() - log_dir = "/home/root/Paddle/build/Testing/Temporary/" + log_dir = os.path.join(temp_dir.name, f'log_{os.getpid()}') cmd = [ sys.executable, '-u', '-m', 'paddle.distributed.launch', '--devices', - '2,3', + '0,1', '--log_dir', log_dir, get_test_file(), diff --git a/third_party/mkldnn b/third_party/mkldnn index 01204edbda1c2..403667673f61a 160000 --- a/third_party/mkldnn +++ b/third_party/mkldnn @@ -1 +1 @@ -Subproject commit 01204edbda1c2a4ff0cccd40476ed6bd2fb62d56 +Subproject commit 403667673f61a56289622fd5bc587b1856296fbc From c70d9f9690ee11d517d5545c0b6bb07aa6630bc2 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 23 Oct 2023 07:46:38 +0000 Subject: [PATCH 06/52] control include env for gpu_timer.h --- paddle/fluid/framework/new_executor/interpreter_base_impl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index d78988a244946..e0063c4b7cdf4 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ 
b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -37,7 +37,10 @@ #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/device_event.h" #include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_CUDA) #include "paddle/phi/kernels/autotune/gpu_timer.h" +#endif PD_DECLARE_bool(new_executor_serial_run); PD_DECLARE_bool(new_executor_static_build); From ad0f17a2f2867c649d29a90e824a82b30a0884c1 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 23 Oct 2023 07:50:37 +0000 Subject: [PATCH 07/52] fix logic for profiler_helper_static.py --- .../pp_utils/profiler_helper_static.py | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py index 704fa0c6b9624..6567b813adc92 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py @@ -90,28 +90,20 @@ def main(): # support Perfetto format save_path = os.path.join(args.log_dir, "pipeline_profile_perfetto.json") - all_events.extend( - [ - { - "args": {"name": "GPU"}, - "cat": "__metadata", - "name": "thread_name", - "ph": "M", - "pid": "Main", - "tid": 0, - "ts": 0, - }, - { - "args": {"name": "GPU"}, - "cat": "__metadata", - "name": "thread_name", - "ph": "M", - "pid": "Main", - "tid": 1, - "ts": 0, - }, - ] - ) + for i in range(len(args.devices.split(","))): + all_events.extend( + [ + { + "args": {"name": "GPU"}, + "cat": "__metadata", + "name": "thread_name", + "ph": "M", + "pid": "Main", + "tid": i, + "ts": 0, + } + ] + ) json_str = json.dumps({"traceEvents": all_events}) for i in range(len(args.devices.split(","))): json_str = json_str.replace(f'"GPU{i}"', f'{i}') From e0442c6803d307df4cf9486f5b5dd879e44c4839 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 23 Oct 2023 10:00:57 +0000 Subject: [PATCH 08/52] fix build error --- paddle/fluid/framework/new_executor/program_interpreter.cc | 5 +++++ paddle/fluid/framework/new_executor/program_interpreter.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index a89afc7d09a82..78c0adc98e125 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -104,6 +104,7 @@ ProgramInterpreter::~ProgramInterpreter() { } void ProgramInterpreter::RunImpl() { +#if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { // Note(sonder): Record the start time of the each stream. 
     for (size_t i = 0; i < stream_timers_.size(); ++i) {
@@ -111,6 +112,7 @@ void ProgramInterpreter::RunImpl() {
       stream_timer.Start();
     }
   }
+#endif
 
   // lazy initialization of gc, do not create gc is the program only run once
   if (!gc_) {
@@ -136,12 +138,15 @@ void ProgramInterpreter::RunImpl() {
     platform::DeviceContextPool::Instance().Get(place_)->Wait();
   }
 #endif
+
+#if defined(PADDLE_WITH_CUDA)
   if (FLAGS_auto_parallel_profiler) {
     for (size_t i = 0; i < stream_timers_.size(); ++i) {
       auto& stream_timer = stream_timers_[i];
       stream_timer.Stop();
    }
  }
+#endif
 }
 
 FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h
index 98aac21bac880..79809a4e8edb7 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.h
+++ b/paddle/fluid/framework/new_executor/program_interpreter.h
@@ -215,7 +215,9 @@ class ProgramInterpreter : public InterpreterBaseImpl {
 
   std::vector hookfuncs_;
 
+#if defined(PADDLE_WITH_CUDA)
   std::vector<phi::GpuTimer> stream_timers_;
+#endif
 };
 
 }  // namespace framework
From a8a37bb123867b4f7697b39886368b5ccadda02e Mon Sep 17 00:00:00 2001
From: AndSonder
Date: Mon, 23 Oct 2023 14:18:46 +0000
Subject: [PATCH 09/52] fix build error

---
 paddle/fluid/framework/new_executor/program_interpreter.cc | 5 ++++-
 paddle/fluid/framework/new_executor/standalone_executor.cc | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc
index 78c0adc98e125..ca2e67d5aeb4d 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -677,7 +677,9 @@ void ProgramInterpreter::AddGpuStreamEvents() {
 }
 
 std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
-  double min_start_time = DBL_MAX, max_end_time = DBL_MIN;
+  double min_start_time = std::numeric_limits<double>::max(),
+         max_end_time = std::numeric_limits<double>::min();
+#if defined(PADDLE_WITH_CUDA)
   for (size_t i = 0; i < stream_timers_.size(); ++i) {
     auto& stream_timer = stream_timers_[i];
     double start_time = stream_timer.StartTime();
@@ -692,6 +694,7 @@ std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
             << std::to_string(min_start_time)
             << ", max_end_time: " << std::to_string(max_end_time);
  }
+#endif
  return std::make_tuple(min_start_time, max_end_time);
 }
 
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index ea11bd1427758..60c132110f1d4 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -207,6 +207,7 @@ paddle::framework::FetchList StandaloneExecutor::Run(
   }
 
   // record each job's run time
+#if defined(PADDLE_WITH_CUDA)
   if (FLAGS_auto_parallel_profiler) {
     for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) {
       const auto& job = jobs[job_idx];
@@ -220,6 +221,7 @@ paddle::framework::FetchList StandaloneExecutor::Run(
               << ", job_end_time = " << std::to_string(end_time);
     }
   }
+#endif
 
   // return Fetch Tensors
   if (FLAGS_enable_new_ir_in_executor) {
From a20e6ce1a80e91392362689335c898de05b975f3 Mon Sep 17 00:00:00 2001
From: AndSonder
Date: Mon, 23 Oct 2023 14:50:22 +0000
Subject: [PATCH 10/52] recover thirdparty

---
 third_party/mkldnn | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/mkldnn b/third_party/mkldnn
index
403667673f61a..01204edbda1c2 160000 --- a/third_party/mkldnn +++ b/third_party/mkldnn @@ -1 +1 @@ -Subproject commit 403667673f61a56289622fd5bc587b1856296fbc +Subproject commit 01204edbda1c2a4ff0cccd40476ed6bd2fb62d56 From 3e10a6d2b2e35bd61bfb08d075cff5cc5d1a3a24 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 24 Oct 2023 10:19:36 +0000 Subject: [PATCH 11/52] add flag control: not support new ir now --- paddle/fluid/framework/new_executor/standalone_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 60c132110f1d4..45f6f103627e4 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -208,7 +208,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( // record each job's run time #if defined(PADDLE_WITH_CUDA) - if (FLAGS_auto_parallel_profiler) { + if (FLAGS_auto_parallel_profiler && !FLAGS_enable_new_ir_in_executor) { for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); From 59b425eefad053a84f6d08d3e34c70dfca2e41b0 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Wed, 25 Oct 2023 00:51:04 +0000 Subject: [PATCH 12/52] set auto_parallel_profiler flag to false --- paddle/fluid/framework/new_executor/interpretercore.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 4da225d9068d1..3b28974efb5c6 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -35,7 +35,7 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); PADDLE_DEFINE_EXPORTED_bool(auto_parallel_profiler, - true, + false, "Enable auto parallel profiler, collecting the " "runtime of jobs in different devices"); From ddc5038b0293f9942304ebd148056e4262c802a8 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 26 Oct 2023 06:40:25 +0000 Subject: [PATCH 13/52] fix --- .../new_executor/interpreter_base_impl.h | 4 ---- .../framework/new_executor/program_interpreter.cc | 15 +++------------ .../framework/new_executor/program_interpreter.h | 4 ++++ .../framework/new_executor/standalone_executor.cc | 7 ++++++- paddle/phi/kernels/autotune/gpu_timer.h | 5 +++++ .../static}/profiler_helper_static.py | 0 6 files changed, 18 insertions(+), 17 deletions(-) rename python/paddle/distributed/{fleet/meta_parallel/pp_utils => auto_parallel/static}/profiler_helper_static.py (100%) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index e0063c4b7cdf4..beab995a2c0bf 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -38,10 +38,6 @@ #include "paddle/fluid/platform/device_event.h" #include "paddle/phi/backends/device_manager.h" -#if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/kernels/autotune/gpu_timer.h" -#endif - PD_DECLARE_bool(new_executor_serial_run); PD_DECLARE_bool(new_executor_static_build); PD_DECLARE_bool(new_executor_use_inplace); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc 
b/paddle/fluid/framework/new_executor/program_interpreter.cc index ca2e67d5aeb4d..9a5f8c8b255a9 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -106,7 +106,6 @@ ProgramInterpreter::~ProgramInterpreter() { void ProgramInterpreter::RunImpl() { #if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { - // Note(sonder): Record the start time of the each stream. for (size_t i = 0; i < stream_timers_.size(); ++i) { auto& stream_timer = stream_timers_[i]; stream_timer.Start(); @@ -646,7 +645,6 @@ void ProgramInterpreter::AddGpuStreamEvents() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) stream_timers_.clear(); std::vector streams; - bool has_default_stream = false; for (size_t i = 0; i < vec_instruction_.size(); ++i) { auto& instr = vec_instruction_[i]; @@ -659,20 +657,13 @@ void ProgramInterpreter::AddGpuStreamEvents() { gpuStream_t stream = reinterpret_cast(instr.DeviceContext()) .stream(); - - if (stream != nullptr) { - has_default_stream = true; - } + streams.emplace_back(stream); } - size_t timers_size = has_default_stream ? streams.size() + 1 : streams.size(); - stream_timers_.resize(timers_size); + + stream_timers_.resize(streams.size()); for (size_t i = 0; i < streams.size(); ++i) { stream_timers_[i].SetStream(streams[i]); } - if (has_default_stream) { - stream_timers_.back().SetStream(nullptr); - } - #endif } diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 79809a4e8edb7..5a80f71d423a7 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -16,6 +16,10 @@ #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" +#if defined(PADDLE_WITH_CUDA) +#include "paddle/phi/kernels/autotune/gpu_timer.h" +#endif + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 45f6f103627e4..a6156c40927b7 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -208,13 +208,18 @@ paddle::framework::FetchList StandaloneExecutor::Run( // record each job's run time #if defined(PADDLE_WITH_CUDA) - if (FLAGS_auto_parallel_profiler && !FLAGS_enable_new_ir_in_executor) { + if (FLAGS_auto_parallel_profiler) { for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); double start_time, end_time; std::tie(start_time, end_time) = interpretercores_[job_idx]->InterpreterRunTime(); + // Note(sonder): Used to record the runtime of each job in order to + // generate a parallel pipeline timeline. Job runtime information can be + // extracted from the logs using the scripts "profiler_helper_static.py". + // Do not modify, as it may affect the results of regular expression + // matching. 
VLOG(0) << "Profiler Info: Job (" << job_idx << "), type = " << job_type << ", micro_batch_id = " << job->MicroBatchId() << ", job_start_time = " << std::to_string(start_time) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 9ef228906d1d6..3b99b27447472 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -71,6 +71,11 @@ class GpuTimer { } void Start() { + // Note(sonder): Since it is not possible to directly obtain the start time + // of the event, "gettimeofday" is used here to retrieve it. Subsequently, + // the end time of the event is calculated by adding the start time on the + // CPU with ElapsedTime(). Using the CPU time ensures that the timing is + // synchronized across multiple GPUs. struct timeval time_now {}; gettimeofday(&time_now, nullptr); start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py similarity index 100% rename from python/paddle/distributed/fleet/meta_parallel/pp_utils/profiler_helper_static.py rename to python/paddle/distributed/auto_parallel/static/profiler_helper_static.py From 1dfc8162f4363c5067abe6cddb2ee51f74898180 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 26 Oct 2023 09:04:39 +0000 Subject: [PATCH 14/52] add auto_parallel_profiler as command parameter --- paddle/fluid/framework/new_executor/pir_interpreter.cc | 4 ++-- python/paddle/distributed/auto_parallel/constants.py | 1 + python/paddle/distributed/auto_parallel/static/engine.py | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 27522076dd77b..ec2ee2d365adc 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -281,9 +281,9 @@ void PirInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) { << ") to InterpreterCore(" << this << ")"; } -std::tuple NewIRInterpreter::InterpreterRunTime() { +std::tuple PirInterpreter::InterpreterRunTime() { PADDLE_THROW(platform::errors::Unimplemented( - "NewIRInterpreter::InterpreterRunTime is not implemented.")); + "PirInterpreter::InterpreterRunTime is not implemented.")); } const interpreter::NewIrDependencyBuilder& diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 9e41f5f732760..d78e944706905 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -112,6 +112,7 @@ def set_field_default_config(category, field, default_value): set_field_default_config(PIPELINE, "accumulate_steps", 1) set_field_default_config(PIPELINE, "generation_batch_size", 1) set_field_default_config(PIPELINE, "enable_send_recv_overlap", False) +set_field_default_config(PIPELINE, "auto_parallel_profiler", False) ######################################### # quantization configuration diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 999ad0cf94f92..6eeb583bd8294 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -251,6 +251,9 @@ def __init__( 
paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1}) paddle.framework.set_flags({'FLAGS_new_executor_static_build': 1}) + if self._strategy.pipline.auto_parallel_profiler: + paddle.framework.set_flags({'FLAGS_auto_parallel_profiler': 1}) + def _prepare_data_spec(self, data, split, batch_size): inputs_spec = [] labels_spec = [] From 9f271ef183dd2c1724ce7229b4c379666dd11963 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 26 Oct 2023 10:05:46 +0000 Subject: [PATCH 15/52] fix value name --- python/paddle/distributed/auto_parallel/static/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 6eeb583bd8294..bea78ea32c835 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -251,7 +251,7 @@ def __init__( paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1}) paddle.framework.set_flags({'FLAGS_new_executor_static_build': 1}) - if self._strategy.pipline.auto_parallel_profiler: + if self._strategy.pipeline.auto_parallel_profiler: paddle.framework.set_flags({'FLAGS_auto_parallel_profiler': 1}) def _prepare_data_spec(self, data, split, batch_size): From dabf964b4b55ab43bedf4644e501ebe729be6cc0 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 27 Oct 2023 04:49:00 +0000 Subject: [PATCH 16/52] support gettimeofday for win env --- paddle/phi/kernels/autotune/gpu_timer.h | 56 ++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 3b99b27447472..6d21cc529a1e3 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -14,11 +14,10 @@ #pragma once -#include - #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" + #ifdef PADDLE_WITH_CUDA #include #endif @@ -26,6 +25,59 @@ #include #endif +#ifdef WIN32 +#include +#include +#else +#include +#endif + +#ifdef WIN32 + +#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 +#else +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +struct timezone { + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +int gettimeofday(struct timeval *tv, struct timezone *tz) { + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag = 0; + + if (NULL != tv) { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + tmpres /= 10; /*convert into microseconds*/ + /*converting file time to unix epoch*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = static_cast(tmpres / 1000000UL); + tv->tv_usec = static_cast(tmpres % 1000000UL); + } + + if (NULL != tz) { + if (!tzflag) { + _tzset(); + tzflag++; + } + tz->tz_minuteswest = _timezone / 60; + tz->tz_dsttime = _daylight; + } + + return 0; +} + +#endif + namespace phi { class GpuTimer { From 6ad6f36f4d2c52f4fcad5d815b84d30705cc45e8 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 27 Oct 2023 06:54:36 +0000 Subject: [PATCH 17/52] fix win build error --- paddle/phi/kernels/autotune/gpu_timer.h | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 
6d21cc529a1e3..7466848d156a9 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -40,12 +40,7 @@ #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL #endif -struct timezone { - int tz_minuteswest; /* minutes W of Greenwich */ - int tz_dsttime; /* type of dst correction */ -}; - -int gettimeofday(struct timeval *tv, struct timezone *tz) { +int gettimeofday(struct timeval *tv, void *tz) { FILETIME ft; unsigned __int64 tmpres = 0; static int tzflag = 0; @@ -64,15 +59,6 @@ int gettimeofday(struct timeval *tv, struct timezone *tz) { tv->tv_usec = static_cast(tmpres % 1000000UL); } - if (NULL != tz) { - if (!tzflag) { - _tzset(); - tzflag++; - } - tz->tz_minuteswest = _timezone / 60; - tz->tz_dsttime = _daylight; - } - return 0; } From d58cc94d2e188cb2aa1207794d93bfd5147c5f59 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 27 Oct 2023 07:27:30 +0000 Subject: [PATCH 18/52] fix win build error --- paddle/phi/kernels/autotune/gpu_timer.h | 40 +------------------------ 1 file changed, 1 insertion(+), 39 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 7466848d156a9..fec1233ff7185 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" @@ -25,45 +26,6 @@ #include #endif -#ifdef WIN32 -#include -#include -#else -#include -#endif - -#ifdef WIN32 - -#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 -#else -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag = 0; - - if (NULL != tv) { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - tmpres /= 10; /*convert into microseconds*/ - /*converting file time to unix epoch*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = static_cast(tmpres / 1000000UL); - tv->tv_usec = static_cast(tmpres % 1000000UL); - } - - return 0; -} - -#endif - namespace phi { class GpuTimer { From e9886aef6ee178abe0e1b9a8cfa504c467fef611 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 27 Oct 2023 08:19:23 +0000 Subject: [PATCH 19/52] use job_type_to_id --- .../fluid/framework/new_executor/standalone_executor.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 3581510771b13..dc2534f7d3e26 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -205,18 +205,25 @@ paddle::framework::FetchList StandaloneExecutor::Run( // record each job's run time #if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { + std::unordered_map job_type_to_id; for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); double start_time, end_time; std::tie(start_time, end_time) = interpretercores_[job_idx]->InterpreterRunTime(); + if (job_type_to_id.count(job_type) == 0) { + job_type_to_id[job_type] = 0; + } else { + job_type_to_id[job_type] += 1; + } // Note(sonder): Used to record the 
runtime of each job in order to // generate a parallel pipeline timeline. Job runtime information can be // extracted from the logs using the scripts "profiler_helper_static.py". // Do not modify, as it may affect the results of regular expression // matching. - VLOG(0) << "Profiler Info: Job (" << job_idx << "), type = " << job_type + VLOG(0) << "Profiler Info: Job (" << job_type_to_id[job_type] + << "), type = " << job_type << ", micro_batch_id = " << job->MicroBatchId() << ", job_start_time = " << std::to_string(start_time) << ", job_end_time = " << std::to_string(end_time); From 282285bc92c439677e31fe342622b7dbf7cc1102 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 27 Oct 2023 08:43:13 +0000 Subject: [PATCH 20/52] Fixed repeatedly timing the same stream --- .../fluid/framework/new_executor/program_interpreter.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index b0f74bab2d291..f2592e1aaee5e 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -644,7 +644,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { void ProgramInterpreter::AddGpuStreamEvents() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) stream_timers_.clear(); - std::vector streams; + std::unordered_set streams; for (size_t i = 0; i < vec_instruction_.size(); ++i) { auto& instr = vec_instruction_[i]; @@ -657,12 +657,13 @@ void ProgramInterpreter::AddGpuStreamEvents() { gpuStream_t stream = reinterpret_cast(instr.DeviceContext()) .stream(); - streams.emplace_back(stream); + streams.insert(stream); } stream_timers_.resize(streams.size()); - for (size_t i = 0; i < streams.size(); ++i) { - stream_timers_[i].SetStream(streams[i]); + int i = 0; + for (auto& stream : streams) { + stream_timers_[i++].SetStream(stream); } #endif } From fdc3f6ddf1396576c94fc6cc5862404dc1fcdf69 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Wed, 1 Nov 2023 12:02:15 +0000 Subject: [PATCH 21/52] add step line for timeline --- .../new_executor/program_interpreter.cc | 67 ++++--------------- .../new_executor/program_interpreter.h | 4 +- .../new_executor/standalone_executor.cc | 9 +-- .../static/profiler_helper_static.py | 54 +++++++++++++-- 4 files changed, 67 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index eeb79a13f976e..7c43cd324bbdf 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -106,11 +106,9 @@ ProgramInterpreter::~ProgramInterpreter() { void ProgramInterpreter::RunImpl() { #if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { - for (size_t i = 0; i < stream_timers_.size(); ++i) { - auto& stream_timer = stream_timers_[i]; - stream_timer.Start(); - } + calculate_stream_timer_.Start(); } + #endif // lazy initialization of gc, do not create gc is the program only run once @@ -141,10 +139,7 @@ void ProgramInterpreter::RunImpl() { #if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { - for (size_t i = 0; i < stream_timers_.size(); ++i) { - auto& stream_timer = stream_timers_[i]; - stream_timer.Stop(); - } + calculate_stream_timer_.Stop(); } #endif } @@ -642,53 +637,13 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { } } -void 
ProgramInterpreter::AddGpuStreamEvents() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - stream_timers_.clear(); - std::unordered_set streams; - - for (size_t i = 0; i < vec_instruction_.size(); ++i) { - auto& instr = vec_instruction_[i]; - if ((instr.KernelType() != OpFuncType::kGpuAsync) || - (instr.DeviceContext().GetPlace().GetType() == - phi::AllocationType::CUSTOM)) { - continue; - } - - gpuStream_t stream = - reinterpret_cast(instr.DeviceContext()) - .stream(); - streams.insert(stream); - } - - stream_timers_.resize(streams.size()); - int i = 0; - for (auto& stream : streams) { - stream_timers_[i++].SetStream(stream); - } -#endif -} - std::tuple ProgramInterpreter::InterpreterRunTime() { - double min_start_time = std::numeric_limits::max(), - max_end_time = std::numeric_limits::min(); + double start_time, end_time; #if defined(PADDLE_WITH_CUDA) - for (size_t i = 0; i < stream_timers_.size(); ++i) { - auto& stream_timer = stream_timers_[i]; - double start_time = stream_timer.StartTime(); - double end_time = stream_timer.EndTime(); - - min_start_time = std::min(min_start_time, start_time); - max_end_time = std::max(max_end_time, end_time); - - VLOG(3) << "ProgramInterpreter::InterpreterRunTime:" - << "start_time: " << std::to_string(start_time) - << ", end_time: " << std::to_string(end_time) << ", min_start_time" - << std::to_string(min_start_time) - << ", max_end_time: " << std::to_string(max_end_time); - } + start_time = calculate_stream_timer_.StartTime(); + end_time = calculate_stream_timer_.EndTime(); #endif - return std::make_tuple(min_start_time, max_end_time); + return std::make_tuple(start_time, end_time); } void ProgramInterpreter::Convert( @@ -728,7 +683,13 @@ void ProgramInterpreter::Convert( } if (FLAGS_auto_parallel_profiler) { - AddGpuStreamEvents(); +#if defined(PADDLE_WITH_CUDA) + gpuStream_t calculate_stream = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + calculate_stream_timer_.SetStream(calculate_stream); +#endif } BuildOperatorDependences(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 5a80f71d423a7..e770b1e9abcbf 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -155,8 +155,6 @@ class ProgramInterpreter : public InterpreterBaseImpl { // For log and debug std::string GetDepsString() const; - void AddGpuStreamEvents(); - bool is_build_{false}; bool static_build_{false}; // Note(sonder): share the op dependency and event analysis procedure. 
@@ -220,7 +218,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; #if defined(PADDLE_WITH_CUDA) - std::vector stream_timers_; + phi::GpuTimer calculate_stream_timer_; #endif }; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 3807639e53279..42e6a701eae3e 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -219,24 +219,19 @@ paddle::framework::FetchList StandaloneExecutor::Run( // record each job's run time #if defined(PADDLE_WITH_CUDA) if (FLAGS_auto_parallel_profiler) { - std::unordered_map job_type_to_id; for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); double start_time, end_time; std::tie(start_time, end_time) = interpretercores_[job_idx]->InterpreterRunTime(); - if (job_type_to_id.count(job_type) == 0) { - job_type_to_id[job_type] = 0; - } else { - job_type_to_id[job_type] += 1; - } + // Note(sonder): Used to record the runtime of each job in order to // generate a parallel pipeline timeline. Job runtime information can be // extracted from the logs using the scripts "profiler_helper_static.py". // Do not modify, as it may affect the results of regular expression // matching. - VLOG(0) << "Profiler Info: Job (" << job_type_to_id[job_type] + VLOG(0) << "Profiler Info: Job (" << job->MicroBatchId() << "), type = " << job_type << ", micro_batch_id = " << job->MicroBatchId() << ", job_start_time = " << std::to_string(start_time) diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index 6567b813adc92..aecf7c3ae684d 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -42,11 +42,38 @@ def process_log_data(log_data, device_id): "optimizer": "rail_response", # RGB: 238, 142, 0 "default": "thread_state_unknown", # RGB: 199, 155, 125 } + step = 0 for match in matches: job_id, job_type, micro_batch_id, job_start_time, job_end_time = match if job_type in ["lr"]: continue + if job_type == "forward" and int(micro_batch_id) == 0: + if step > 0: + event_step_stop = { + "name": "Step " + str(step - 1), + "cat": "Step", + "ph": "E", + "ts": float(job_start_time.strip()) * 1000, + "pid": "Main", + "tid": "Step" + str(device_id), + "cname": color_map["default"], + } + events.append(event_step_stop) + + event_step_start = { + "name": "Step " + str(step), + "cat": "Step", + "ph": "B", + "ts": float(job_start_time.strip()) * 1000, + "pid": "Main", + "tid": "Step" + str(device_id), + "cname": color_map["default"], + } + events.append(event_step_start) + + step += 1 + event_start = { "name": job_type[0].upper() + "_" + str(job_id), "cat": job_type, @@ -68,6 +95,15 @@ def process_log_data(log_data, device_id): events.append(event_start) events.append(event_end) + event_step_end = { + "name": "Step " + str(step), + "cat": "Step", + "ph": "E", + "ts": events[-1]["ts"], + "pid": "Main", + "tid": "Step" + str(device_id), + "cname": color_map["default"], + } return events @@ -94,19 +130,29 @@ def main(): all_events.extend( [ { - "args": {"name": "GPU"}, + "args": {"name": "Step"}, "cat": "__metadata", "name": "thread_name", "ph": "M", "pid": "Main", - "tid": i, + "tid": i * 2, "ts": 0, - } + }, + { + 
"args": {"name": f"GPU:{i}"}, + "cat": "__metadata", + "name": "thread_name", + "ph": "M", + "pid": "Main", + "tid": i * 2 + 1, + "ts": 0, + }, ] ) json_str = json.dumps({"traceEvents": all_events}) for i in range(len(args.devices.split(","))): - json_str = json_str.replace(f'"GPU{i}"', f'{i}') + json_str = json_str.replace(f'"Step{i}"', f'{i * 2}') + json_str = json_str.replace(f'"GPU{i}"', f'{i * 2 + 1}') with open(save_path, "w") as f: f.write(json_str) From 1ceadc5fa003c674246ff82f8af79f38ef08e354 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 2 Nov 2023 11:37:24 +0000 Subject: [PATCH 22/52] add step timeline and fix logic when job overlap --- .../platform/device/gpu/cuda/cuda_profiler.cc | 19 +- .../static/profiler_helper_static.py | 162 +++++++++++------- 2 files changed, 118 insertions(+), 63 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index a49d9013fb6d0..609f49a38b6be 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +PHI_DECLARE_bool(auto_parallel_profiler); + namespace paddle { namespace platform { @@ -46,10 +48,25 @@ void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) { eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; eventAttrib.message.ascii = name.c_str(); + if (FLAGS_auto_parallel_profiler) { + struct timeval time_now {}; + gettimeofday(&time_now, nullptr); + double push_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); + VLOG(0) << "NVTX range push: " << name + << ", time: " << std::to_string(push_time); + } dynload::nvtxRangePushEx(&eventAttrib); } -void CudaNvtxRangePop() { dynload::nvtxRangePop(); } +void CudaNvtxRangePop() { + if (FLAGS_auto_parallel_profiler) { + struct timeval time_now {}; + gettimeofday(&time_now, nullptr); + double pop_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); + VLOG(0) << "NVTX range pop, time: " << std::to_string(pop_time); + } + dynload::nvtxRangePop(); +} #endif } // namespace platform diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index aecf7c3ae684d..619811368b6b8 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -19,6 +19,13 @@ import paddle +color_map = { + "forward": "thread_state_running", # RGB: 126, 200, 148 + "backward": "rail_idle", # RGB: 238, 142, 0 + "optimizer": "rail_response", # RGB: 238, 142, 0 + "default": "thread_state_unknown", # RGB: 199, 155, 125 +} + def parse_args(): parser = ArgumentParser() @@ -32,93 +39,120 @@ def parse_args(): return args -def process_log_data(log_data, device_id): +def process_job_log(log_data, device_id): log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)' matches = re.findall(log_pattern, log_data) events = [] - color_map = { - "forward": "thread_state_running", # RGB: 126, 200, 148 - "backward": "rail_idle", # RGB: 238, 142, 0 - "optimizer": "rail_response", # RGB: 238, 142, 0 - "default": "thread_state_unknown", # RGB: 199, 155, 125 - } - step = 0 + last_end_time = None + for match in matches: job_id, job_type, micro_batch_id, job_start_time, job_end_time = match if 
job_type in ["lr"]: continue - if job_type == "forward" and int(micro_batch_id) == 0: - if step > 0: - event_step_stop = { - "name": "Step " + str(step - 1), - "cat": "Step", - "ph": "E", - "ts": float(job_start_time.strip()) * 1000, - "pid": "Main", - "tid": "Step" + str(device_id), - "cname": color_map["default"], - } - events.append(event_step_stop) - - event_step_start = { - "name": "Step " + str(step), - "cat": "Step", - "ph": "B", - "ts": float(job_start_time.strip()) * 1000, - "pid": "Main", - "tid": "Step" + str(device_id), - "cname": color_map["default"], - } - events.append(event_step_start) + start_time = float(job_start_time.strip()) * 1000 + end_time = float(job_end_time.strip()) * 1000 - step += 1 + if last_end_time is not None: + if start_time < last_end_time: + end_time = end_time + last_end_time - start_time + start_time = last_end_time event_start = { - "name": job_type[0].upper() + "_" + str(job_id), + "name": job_type + "_" + str(job_id), "cat": job_type, "ph": "B", - "ts": float(job_start_time.strip()) * 1000, - "pid": "Main", + "ts": start_time, + "pid": 0, "tid": "GPU" + str(device_id), "cname": color_map[job_type], } event_end = { - "name": job_type[0].upper() + "_" + str(job_id), + "name": job_type + "_" + str(job_id), "cat": job_type, "ph": "E", - "pid": "Main", - "ts": float(job_end_time.strip()) * 1000, + "pid": 0, + "ts": end_time, "tid": "GPU" + str(device_id), "cname": color_map[job_type], } events.append(event_start) events.append(event_end) - event_step_end = { - "name": "Step " + str(step), - "cat": "Step", - "ph": "E", - "ts": events[-1]["ts"], - "pid": "Main", - "tid": "Step" + str(device_id), - "cname": color_map["default"], - } + last_end_time = end_time + return events +def process_step_log(log_data, device_id): + start_pattern = r'.*?NVTX range push: (\d+), time: (\d+.\d+)' + end_pattern = r'.*?NVTX range pop, time: (\d+.\d+)' + start_matches = re.findall(start_pattern, log_data) + end_matches = re.findall(end_pattern, log_data) + + step_info = [] + for start_match, stop_match in zip(start_matches, end_matches): + step_id, start_time = start_match + stop_time = stop_match + if int(step_id) >= len(step_info): + for _ in range(int(step_id) - len(step_info) + 1): + step_info.append([float('inf'), 0]) + step_info[int(step_id)] = [start_time, stop_time] + return step_info + + def main(): args = parse_args() all_events = [] + step_infos = [] for device_id in args.devices.split(","): print(f"Process device {device_id}") device_id = int(device_id) log_file = os.path.join(args.log_dir, "workerlog." 
+ str(device_id)) with open(log_file, "r") as f: log_data = f.read() - events = process_log_data(log_data, device_id) + events = process_job_log(log_data, device_id) + step_info = process_step_log(log_data, device_id) all_events.extend(events) + if len(step_info) > len(step_infos): + for _ in range(len(step_info) - len(step_infos)): + step_infos.append([float('inf'), 0]) + for i, info in enumerate(step_info): + start_time = float(info[0].strip()) * 1000 + stop_time = float(info[1].strip()) * 1000 + + step_infos[i][0] = min(step_infos[i][0], start_time) + step_infos[i][1] = max(step_infos[i][1], stop_time) + + for i, info in enumerate(step_infos): + if info[0] == float('inf'): + continue + start_time = info[0] + if i > 0: + start_time = max(start_time, step_infos[i - 1][1]) + event_start = { + "name": "step" + str(i), + "cat": "step", + "ph": "B", + "ts": start_time, + "pid": 0, + "tid": "Step", + "cname": color_map["default"], + } + event_end = { + "name": "step" + str(i), + "cat": "step", + "ph": "E", + "ts": info[1], + "pid": 0, + "tid": "Step", + "cname": color_map["default"], + } + + all_events.append(event_start) + all_events.append(event_end) + save_path = os.path.join(args.log_dir, "pipeline_profile.json") with open(save_path, "w") as f: f.write(json.dumps({"traceEvents": all_events})) @@ -126,33 +160,37 @@ def main(): # support Perfetto format save_path = os.path.join(args.log_dir, "pipeline_profile_perfetto.json") + all_events.extend( + [ + { + "args": {"name": "STEP"}, + "cat": "__metadata", + "name": "thread_name", + "ph": "M", + "pid": 0, + "tid": 2333, + "ts": 0, + } + ] + ) for i in range(len(args.devices.split(","))): all_events.extend( [ - { - "args": {"name": "Step"}, - "cat": "__metadata", - "name": "thread_name", - "ph": "M", - "pid": "Main", - "tid": i * 2, - "ts": 0, - }, { "args": {"name": f"GPU:{i}"}, "cat": "__metadata", "name": "thread_name", "ph": "M", - "pid": "Main", - "tid": i * 2 + 1, + "pid": 0, + "tid": i + 2334, "ts": 0, - }, + } ] ) json_str = json.dumps({"traceEvents": all_events}) for i in range(len(args.devices.split(","))): - json_str = json_str.replace(f'"Step{i}"', f'{i * 2}') - json_str = json_str.replace(f'"GPU{i}"', f'{i * 2 + 1}') + json_str = json_str.replace('"Step"', '2333') + json_str = json_str.replace(f'"GPU{i}"', f'{i + 2334}') with open(save_path, "w") as f: f.write(json_str) From 679cc393c8745fee85bff9b072081bc307903a93 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 6 Nov 2023 08:10:13 +0000 Subject: [PATCH 23/52] update time record logic --- .../new_executor/program_interpreter.cc | 80 ++++++++++++++----- .../new_executor/program_interpreter.h | 5 +- paddle/phi/kernels/autotune/gpu_timer.h | 70 +++++++++++++--- .../static/profiler_helper_static.py | 5 -- 4 files changed, 125 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 7c43cd324bbdf..febd68d33d821 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -104,13 +104,6 @@ ProgramInterpreter::~ProgramInterpreter() { } void ProgramInterpreter::RunImpl() { -#if defined(PADDLE_WITH_CUDA) - if (FLAGS_auto_parallel_profiler) { - calculate_stream_timer_.Start(); - } - -#endif - // lazy initialization of gc, do not create gc is the program only run once if (!gc_) { gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); @@ -131,17 +124,12 @@ void 
ProgramInterpreter::RunImpl() { async_work_queue_ = GetWorkQueue(); ExecuteInstructionList(vec_instruction_); } + #ifdef PADDLE_WITH_CUSTOM_DEVICE if (platform::is_custom_place(place_)) { platform::DeviceContextPool::Instance().Get(place_)->Wait(); } #endif - -#if defined(PADDLE_WITH_CUDA) - if (FLAGS_auto_parallel_profiler) { - calculate_stream_timer_.Stop(); - } -#endif } FetchList ProgramInterpreter::Run(const std::vector& feed_names, @@ -639,9 +627,35 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time, end_time; -#if defined(PADDLE_WITH_CUDA) - start_time = calculate_stream_timer_.StartTime(); - end_time = calculate_stream_timer_.EndTime(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // double remaining_time = std::numeric_limits::max(); + // double event_time, wait_time; + start_time = calculated_stream_timer_.StartTime(); + end_time = calculated_stream_timer_.EndTime(); +// gpuEvent_t stop_event = calculated_stream_timer_.StopEvent(); + +// for (auto& inst : vec_instruction_) { +// for (auto event_inter : inst.EventsToWait()) { +// auto* wrapper = static_cast( +// event_inter.event_->GetEvent().get()); + +// gpuEvent_t start_event = wrapper->inner_event_.GetRawCudaEvent(); +// event_time = 0; +// PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime( +// reinterpret_cast(&event_time), start_event, stop_event)); +// remaining_time = std::min(remaining_time, event_time); +// VLOG(0) << "event time: " << std::to_string(event_time) +// << " remaining: " << std::to_string(remaining_time); +// } +// } +// if (remaining_time == std::numeric_limits::max()) { +// remaining_time = 0; +// } +// wait_time = end_time - start_time - remaining_time; +// start_time = start_time + wait_time; +// VLOG(0) << "wait time: " << std::to_string(wait_time) +// << " start time: " << std::to_string(start_time) +// << " end time: " << std::to_string(end_time); #endif return std::make_tuple(start_time, end_time); } @@ -683,12 +697,12 @@ void ProgramInterpreter::Convert( } if (FLAGS_auto_parallel_profiler) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t calculate_stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); - calculate_stream_timer_.SetStream(calculate_stream); + calculated_stream_timer_.SetStream(calculate_stream); #endif } @@ -1050,6 +1064,15 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); + if (FLAGS_auto_parallel_profiler) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!interpreter::IsCommunicationOp(instr_node) && + !calculated_stream_timer_.IsStart()) { + VLOG(3) << "Start calculated stream timer from op: " << op->Type(); + calculated_stream_timer_.Start(); + } +#endif + } if (!instr_node.IsArtificial()) { RunOperator(instr_node); @@ -1102,6 +1125,17 @@ void ProgramInterpreter::ExecuteInstructionList( exception_holder_.Clear(); + if (FLAGS_auto_parallel_profiler) { + for (int i = vec_instr.size() - 1; i >= 0; --i) { + auto& instr_node = vec_instr[i]; + if (!interpreter::IsCommunicationOp(instr_node)) { + VLOG(3) << "Last calculated op type: " << instr_node.OpBase()->Type(); + last_calculated_instr_id = i; + break; + } + } + } + for (size_t i = 0; i < dependecy_count_->size(); ++i) { if ((*dependecy_count_)[i] == 0) { // NOTE(zhiqiu): hot fix for jit input var @@ -1213,6 +1247,16 @@ void ProgramInterpreter::RunInstructionAsync(size_t 
instr_id) { RunInstruction(instr_node); + if (FLAGS_auto_parallel_profiler) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (instr_id == last_calculated_instr_id) { + VLOG(3) << "Stop calculated stream timer from op: " + << instr_node.OpBase()->Type(); + calculated_stream_timer_.Stop(); + } +#endif + } + if (UNLIKELY(exception_holder_.IsCaught())) { VLOG(4) << "Exception caught"; if (exception_notifier_ != nullptr) { diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index e770b1e9abcbf..c92c97529932b 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -217,9 +217,10 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; -#if defined(PADDLE_WITH_CUDA) - phi::GpuTimer calculate_stream_timer_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + phi::GpuTimer calculated_stream_timer_; #endif + size_t last_calculated_instr_id; }; } // namespace framework diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index fec1233ff7185..25403abbe1578 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -28,6 +28,16 @@ namespace phi { +static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream, + cudaError_t status, + void *user_data) { + struct timeval time_now {}; + gettimeofday(&time_now, nullptr); + double *cpu_time = static_cast(user_data); + *cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); + VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time); +} + class GpuTimer { public: GpuTimer() { @@ -42,6 +52,7 @@ class GpuTimer { start_, phi::errors::PreconditionNotMet("Start Event is not ready.")); PADDLE_ENFORCE_NOT_NULL( stop_, phi::errors::PreconditionNotMet("Stop Event is not ready.")); + is_start_ = false; } ~GpuTimer() { @@ -60,6 +71,7 @@ class GpuTimer { #else cudaEventRecord(start_, stream); #endif + is_start_ = true; } void Stop(gpuStream_t stream) { @@ -68,21 +80,41 @@ class GpuTimer { #else cudaEventRecord(stop_, stream); #endif + is_start_ = false; } void Start() { // Note(sonder): Since it is not possible to directly obtain the start time - // of the event, "gettimeofday" is used here to retrieve it. Subsequently, - // the end time of the event is calculated by adding the start time on the - // CPU with ElapsedTime(). Using the CPU time ensures that the timing is - // synchronized across multiple GPUs. - struct timeval time_now {}; - gettimeofday(&time_now, nullptr); - start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); + // of the event, "gettimeofday" is used here to retrieve it. The callback is + // used to record the start time of the event. 
Start(stream_); +#ifdef PADDLE_WITH_HIP + hipStreamAddCallback(stream_, + RecordEventTimerCallback, + reinterpret_cast(&start_time_), + 0); +#else + cudaStreamAddCallback(stream_, + RecordEventTimerCallback, + reinterpret_cast(&start_time_), + 0); +#endif } - void Stop() { Stop(stream_); } + void Stop() { + Stop(stream_); +#ifdef PADDLE_WITH_HIP + hipStreamAddCallback(stream_, + RecordEventTimerCallback, + reinterpret_cast(&end_time_), + 0); +#else + cudaStreamAddCallback(stream_, + RecordEventTimerCallback, + reinterpret_cast(&end_time_), + 0); +#endif + } void SetStream(gpuStream_t stream) { stream_ = stream; } @@ -98,15 +130,33 @@ class GpuTimer { return milliseconds; } - double StartTime() { return start_time_; } + double StartTime() { +#ifdef PADDLE_WITH_HIP + hipStreamSynchronize(stream_); +#else + cudaStreamSynchronize(stream_); +#endif + return start_time_; + } + + double EndTime() { +#ifdef PADDLE_WITH_HIP + hipStreamSynchronize(stream_); +#else + cudaStreamSynchronize(stream_); +#endif + return end_time_; + } - double EndTime() { return ElapsedTime() + start_time_; } + bool IsStart() { return is_start_; } private: gpuEvent_t start_; gpuEvent_t stop_; gpuStream_t stream_; double start_time_; + double end_time_; + bool is_start_; }; } // namespace phi diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index 619811368b6b8..3118e74af8c5c 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -53,11 +53,6 @@ def process_job_log(log_data, device_id): start_time = float(job_start_time.strip()) * 1000 end_time = float(job_end_time.strip()) * 1000 - if last_end_time is not None: - if start_time < last_end_time: - end_time = end_time + last_end_time - start_time - start_time = last_end_time - event_start = { "name": job_type + "_" + str(job_id), "cat": job_type, From 1a04fea1322797a9b50bf3aec17d50a5eaf7384c Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 02:34:33 +0000 Subject: [PATCH 24/52] fix bug when start profile start from none zero step --- .../static/profiler_helper_static.py | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index 3118e74af8c5c..e541977ac7251 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -39,7 +39,7 @@ def parse_args(): return args -def process_job_log(log_data, device_id): +def process_job_log(log_data, device_id, log_start_time): log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)' matches = re.findall(log_pattern, log_data) events = [] @@ -53,6 +53,9 @@ def process_job_log(log_data, device_id): start_time = float(job_start_time.strip()) * 1000 end_time = float(job_end_time.strip()) * 1000 + if log_start_time > start_time: + continue + event_start = { "name": job_type + "_" + str(job_id), "cat": job_type, @@ -84,6 +87,7 @@ def process_step_log(log_data, device_id): end_pattern = r'.*?NVTX range pop, time: (\d+.\d+)' start_matches = re.findall(start_pattern, log_data) end_matches = re.findall(end_pattern, log_data) + end_matches = 
end_matches[len(end_matches) - len(start_matches) :] step_info = [] for start_match, stop_match in zip(start_matches, end_matches): @@ -93,7 +97,12 @@ def process_step_log(log_data, device_id): for _ in range(int(step_id) - len(step_info) + 1): step_info.append([float('inf'), 0]) step_info[int(step_id)] = [start_time, stop_time] - return step_info + + start_step = 0 + for info in step_info: + if info[0] == float('inf'): + start_step += 1 + return step_info, start_step def main(): @@ -106,20 +115,24 @@ def main(): log_file = os.path.join(args.log_dir, "workerlog." + str(device_id)) with open(log_file, "r") as f: log_data = f.read() - events = process_job_log(log_data, device_id) - step_info = process_step_log(log_data, device_id) - all_events.extend(events) + + step_info, start_step = process_step_log(log_data, device_id) if len(step_info) > len(step_infos): for _ in range(len(step_info) - len(step_infos)): step_infos.append([float('inf'), 0]) for i, info in enumerate(step_info): + if info[0] == float('inf'): + continue start_time = float(info[0].strip()) * 1000 stop_time = float(info[1].strip()) * 1000 step_infos[i][0] = min(step_infos[i][0], start_time) step_infos[i][1] = max(step_infos[i][1], stop_time) + events = process_job_log(log_data, device_id, step_infos[start_step][0]) + all_events.extend(events) + for i, info in enumerate(step_infos): if info[0] == float('inf'): continue From e1c619da29411ac35751e8b5f9d8a2557bcf31f9 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 06:00:52 +0000 Subject: [PATCH 25/52] fix note --- paddle/fluid/framework/new_executor/standalone_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 42e6a701eae3e..7c978fe064198 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -195,7 +195,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( VLOG(6) << "Run job (" << job_idx << "), type = " << job_type << ", micro_batch_id =" << job->MicroBatchId(); - // Note(sonder): Share build results don't work for new IR now. + // NOTE(sonder): Share build results don't work for new IR now. 
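
(Editorial sketch, not part of this patch series.) The helper script recovers per-job
windows from each rank's worker log with the "Profiler Info: Job (...)" regex shown
above. A self-contained illustration of that parsing step, on an invented log line in
the same format:

    import re

    LOG_PATTERN = (
        r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), '
        r'job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)'
    )

    sample_log = (
        "Profiler Info: Job (0), type = forward, micro_batch_id = 0, "
        "job_start_time = 100.125, job_end_time = 145.750"
    )

    jobs = []
    for job_id, job_type, micro_batch_id, start_ms, end_ms in re.findall(
        LOG_PATTERN, sample_log
    ):
        # Times are host milliseconds; the script multiplies them by 1000 to get the
        # microsecond "ts" field that chrome://tracing expects.
        jobs.append(
            (int(job_id), job_type, int(micro_batch_id), float(start_ms), float(end_ms))
        )

    print(jobs)  # [(0, 'forward', 0, 100.125, 145.75)]
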
if (type_to_first_id.count(job_type) != 0 && !FLAGS_enable_new_ir_in_executor) { interpretercores_[job_idx]->ShareBuildResultsFrom( From 9c8b7400c1a7169b94ae9b9d1d3b24a7b4a31d39 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 06:55:59 +0000 Subject: [PATCH 26/52] remove FLAGS_auto_parallel_profiler --- .../framework/new_executor/interpretercore.cc | 5 +++-- .../framework/new_executor/interpretercore.h | 3 ++- .../framework/new_executor/pir_interpreter.cc | 3 ++- .../framework/new_executor/pir_interpreter.h | 6 ++++-- .../new_executor/program_interpreter.cc | 13 +++++++------ .../new_executor/program_interpreter.h | 7 +++++-- .../new_executor/standalone_executor.cc | 4 ++-- .../new_executor/standalone_executor.h | 2 ++ .../platform/device/gpu/cuda/cuda_profiler.cc | 19 +------------------ .../distributed/auto_parallel/constants.py | 2 +- .../auto_parallel/static/engine.py | 4 ++-- .../static/profiler_helper_static.py | 17 +++++++++++------ 12 files changed, 42 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index db6cbfbfd8e09..7423d43ac4502 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -75,8 +75,9 @@ FetchList InterpreterCore::Run( } FetchList InterpreterCore::Run(const std::vector& feed_names, - bool need_fetch) { - return impl_->Run(feed_names, need_fetch); + bool need_fetch, + bool enable_auto_parallel_profiler) { + return impl_->Run(feed_names, need_fetch, enable_auto_parallel_profiler); } void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr src) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index a4b2e0a6d67cf..8fbec0e296786 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -51,7 +51,8 @@ class InterpreterCore { bool need_fetch = true); paddle::framework::FetchList Run(const std::vector& feed_names, - bool need_fetch = true); + bool need_fetch = true, + bool enable_auto_parallel_profiler = false); void ShareWorkQueueFrom(std::shared_ptr src); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 43492d0871758..ae38858139e92 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1187,7 +1187,8 @@ paddle::framework::FetchList PirInterpreter::Run( } FetchList PirInterpreter::Run(const std::vector& feed_names, - bool need_fetch) { + bool need_fetch, + bool enable_auto_parallel_profiler) { SetDeviceId(place_); CheckCUDAGraphBeforeRun(feed_names); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h index 48e1e6c4bde63..6ff147002df12 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.h +++ b/paddle/fluid/framework/new_executor/pir_interpreter.h @@ -54,8 +54,10 @@ class PirInterpreter : public InterpreterBaseImpl { const std::vector& feed_tensors, bool need_fetch = true) override; - paddle::framework::FetchList Run(const std::vector& feed_names, - bool need_fetch = true) override; + paddle::framework::FetchList Run( + const std::vector& feed_names, + bool need_fetch = true, + bool enable_auto_parallel_profiler = false) override; void ShareWorkQueueFrom(InterpreterBaseImpl* src) override; diff 
--git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 89a7da9e5c0fe..87c2e9f7fc9b7 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -39,7 +39,6 @@ #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -PHI_DECLARE_bool(auto_parallel_profiler); namespace paddle { namespace framework { @@ -133,7 +132,9 @@ void ProgramInterpreter::RunImpl() { } FetchList ProgramInterpreter::Run(const std::vector& feed_names, - bool need_fetch) { + bool need_fetch, + bool enable_auto_parallel_profiler) { + enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; std::vector op_func_nodes; Build(feed_names, &op_func_nodes); @@ -703,7 +704,7 @@ void ProgramInterpreter::Convert( vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); } - if (FLAGS_auto_parallel_profiler) { + if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t calculate_stream = dynamic_cast( @@ -1075,7 +1076,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); - if (FLAGS_auto_parallel_profiler) { + if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!interpreter::IsCommunicationOp(instr_node) && !calculated_stream_timer_.IsStart()) { @@ -1136,7 +1137,7 @@ void ProgramInterpreter::ExecuteInstructionList( exception_holder_.Clear(); - if (FLAGS_auto_parallel_profiler) { + if (enable_auto_parallel_profiler_) { for (int i = vec_instr.size() - 1; i >= 0; --i) { auto& instr_node = vec_instr[i]; if (!interpreter::IsCommunicationOp(instr_node)) { @@ -1258,7 +1259,7 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { RunInstruction(instr_node); - if (FLAGS_auto_parallel_profiler) { + if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (instr_id == last_calculated_instr_id) { VLOG(3) << "Stop calculated stream timer from op: " diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 9ae1e75e19b5d..379edbfab8f59 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -50,8 +50,10 @@ class ProgramInterpreter : public InterpreterBaseImpl { const std::vector& feed_tensors, bool need_fetch = true) override; - paddle::framework::FetchList Run(const std::vector& feed_names, - bool need_fetch = true) override; + paddle::framework::FetchList Run( + const std::vector& feed_names, + bool need_fetch = true, + bool enable_auto_parallel_profiler = false) override; void Build( const std::vector& feed_names, @@ -222,6 +224,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { phi::GpuTimer calculated_stream_timer_; #endif size_t last_calculated_instr_id; + bool enable_auto_parallel_profiler_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 7311d190ad4d9..47bd4375d4952 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -30,7 +30,6 @@ PHI_DECLARE_bool(enable_pir_in_executor); PHI_DECLARE_bool(enable_pir_api); PHI_DECLARE_bool(pir_apply_inplace_pass); 
-PHI_DECLARE_bool(auto_parallel_profiler); namespace paddle { namespace framework { @@ -50,6 +49,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, ss << scope << ", "; } VLOG(6) << ss.str(); + enable_auto_parallel_profiler_ = false; const auto& jobs = plan_.JobList(); for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { @@ -225,7 +225,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( // record each job's run time #if defined(PADDLE_WITH_CUDA) - if (FLAGS_auto_parallel_profiler) { + if (enable_auto_parallel_profiler_) { for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 8feef6e5b2f91..e73fa65375bab 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -55,6 +55,8 @@ class StandaloneExecutor { std::vector>> vec_force_events_to_wait_; + + bool enable_auto_parallel_profiler_; }; } // namespace framework diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc index 609f49a38b6be..a49d9013fb6d0 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_profiler.cc @@ -14,8 +14,6 @@ #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" -PHI_DECLARE_bool(auto_parallel_profiler); - namespace paddle { namespace platform { @@ -48,25 +46,10 @@ void CudaNvtxRangePush(const std::string& name, const NvtxRangeColor color) { eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; eventAttrib.message.ascii = name.c_str(); - if (FLAGS_auto_parallel_profiler) { - struct timeval time_now {}; - gettimeofday(&time_now, nullptr); - double push_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); - VLOG(0) << "NVTX range push: " << name - << ", time: " << std::to_string(push_time); - } dynload::nvtxRangePushEx(&eventAttrib); } -void CudaNvtxRangePop() { - if (FLAGS_auto_parallel_profiler) { - struct timeval time_now {}; - gettimeofday(&time_now, nullptr); - double pop_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); - VLOG(0) << "NVTX range pop, time: " << std::to_string(pop_time); - } - dynload::nvtxRangePop(); -} +void CudaNvtxRangePop() { dynload::nvtxRangePop(); } #endif } // namespace platform diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 5e8040e851b8d..7d8587315c7a8 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -114,7 +114,7 @@ def set_field_default_config(category, field, default_value): set_field_default_config(PIPELINE, "accumulate_steps", 1) set_field_default_config(PIPELINE, "generation_batch_size", 1) set_field_default_config(PIPELINE, "enable_send_recv_overlap", False) -set_field_default_config(PIPELINE, "auto_parallel_profiler", False) +set_field_default_config(PIPELINE, "schedule_profiler", False) ######################################### # quantization configuration diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 17d86130fab8a..8a875eee3024a 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ 
b/python/paddle/distributed/auto_parallel/static/engine.py @@ -251,8 +251,8 @@ def __init__( paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1}) paddle.framework.set_flags({'FLAGS_new_executor_static_build': 1}) - if self._strategy.pipeline.auto_parallel_profiler: - paddle.framework.set_flags({'FLAGS_auto_parallel_profiler': 1}) + if self._strategy.pipeline.schedule_profiler: + paddle.framework.set_flags({'schedule_profiler': 1}) def _prepare_data_spec(self, data, split, batch_size): inputs_spec = [] diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index e541977ac7251..35afd2c818612 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -13,11 +13,18 @@ # limitations under the License. import json +import logging import os import re from argparse import ArgumentParser import paddle +from paddle.base.log_helper import get_logger + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' +) + color_map = { "forward": "thread_state_running", # RGB: 126, 200, 148 @@ -32,9 +39,7 @@ def parse_args(): device_count = paddle.device.cuda.device_count() all_devices = ",".join([str(i) for i in range(device_count)]) parser.add_argument("--devices", type=str, default=all_devices) - parser.add_argument( - "--log_dir", type=str, default="build/Testing/Temporary/" - ) + parser.add_argument("--log_dir", type=str, required=True) args = parser.parse_args() return args @@ -110,7 +115,7 @@ def main(): all_events = [] step_infos = [] for device_id in args.devices.split(","): - print(f"Process device {device_id}") + _logger.info(f"Process device {device_id}") device_id = int(device_id) log_file = os.path.join(args.log_dir, "workerlog." 
+ str(device_id)) with open(log_file, "r") as f: @@ -164,7 +169,7 @@ def main(): save_path = os.path.join(args.log_dir, "pipeline_profile.json") with open(save_path, "w") as f: f.write(json.dumps({"traceEvents": all_events})) - print(f"Save pipeline profile to {save_path}") + _logger.info(f"Save pipeline profile to {save_path}") # support Perfetto format save_path = os.path.join(args.log_dir, "pipeline_profile_perfetto.json") @@ -202,7 +207,7 @@ def main(): with open(save_path, "w") as f: f.write(json_str) - print(f"Save pipeline profile to {save_path}") + _logger.info(f"Save pipeline profile to {save_path}") if __name__ == "__main__": From 63de31b57fe7c7fbcaeccf0260913bfba8cce322 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 14:24:05 +0000 Subject: [PATCH 27/52] use run config instead FLAGS_auto_parallelxx --- .../new_executor/interpreter_base_impl.h | 4 +- .../new_executor/standalone_executor.cc | 14 +- .../new_executor/standalone_executor.h | 2 + paddle/fluid/pybind/pybind.cc | 6 + python/paddle/base/executor.py | 11 ++ .../auto_parallel/static/engine.py | 8 +- .../static/profiler_helper_static.py | 122 ++++++++---------- 7 files changed, 93 insertions(+), 74 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 31cb9e1bcff25..9e441f9c9bdf2 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -71,7 +71,9 @@ class InterpreterBaseImpl { bool need_fetch = true) = 0; virtual paddle::framework::FetchList Run( - const std::vector& feed_names, bool need_fetch = true) = 0; + const std::vector& feed_names, + bool need_fetch = true, + bool enable_auto_parallel_profiler = false) = 0; virtual void ShareWorkQueueFrom(InterpreterBaseImpl* src) = 0; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 47bd4375d4952..aa378bd626cd6 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -216,9 +216,14 @@ paddle::framework::FetchList StandaloneExecutor::Run( if (jobs.size() > 1 && job_type != "forward") { const std::vector tmp_feed_names = {}; interpretercores_[job_idx]->Run(tmp_feed_names, - /*need_fetch = */ false); + /*need_fetch = */ false, + /*enable_auto_parallel_profiler = */ + enable_auto_parallel_profiler_); } else { - interpretercores_[job_idx]->Run(feed_names, /*need_fetch = */ false); + interpretercores_[job_idx]->Run(feed_names, + /*need_fetch = */ false, + /*enable_auto_parallel_profiler = */ + enable_auto_parallel_profiler_); } } } @@ -262,5 +267,10 @@ paddle::framework::FetchList StandaloneExecutor::Run( } } +void StandaloneExecutor::SetEnableAutoParallelProfiler( + bool enable_auto_parallel_profiler) { + enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index e73fa65375bab..01c3a197f298c 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -41,6 +41,8 @@ class StandaloneExecutor { paddle::framework::FetchList Run(const std::vector& feed_names); + void SetEnableAutoParallelProfiler(bool enable_auto_parallel_profiler); + private: bool 
is_interpretercore_build_result_shared_{false}; const platform::Place place_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cabefe569a9d7..58c578b9ae575 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2019,6 +2019,12 @@ All parameter, weight, gradient are variables in Paddle. ret = self.Run(feed_names); } return py::cast(std::move(ret)); + }) + + .def("set_enable_auto_parallel_profiler", + [](StandaloneExecutor &self, bool enable_auto_parallel_profiler) { + pybind11::gil_scoped_release release; + self.SetEnableAutoParallelProfiler(enable_auto_parallel_profiler); }); py::class_ start_time: - continue - - event_start = { - "name": job_type + "_" + str(job_id), - "cat": job_type, - "ph": "B", - "ts": start_time, - "pid": 0, - "tid": "GPU" + str(device_id), - "cname": color_map[job_type], - } - event_end = { - "name": job_type + "_" + str(job_id), - "cat": job_type, - "ph": "E", - "pid": 0, - "ts": end_time, - "tid": "GPU" + str(device_id), - "cname": color_map[job_type], - } - events.append(event_start) - events.append(event_end) + if job_type == "forward" and micro_batch_id == "0": + if step_start_time != 0: + step_times.append([step_start_time, step_end_time]) + step_start_time = start_time + step_end_time = end_time + + if len(step_times) >= start_step: + event_start = { + "name": job_type + "_" + str(job_id), + "cat": job_type, + "ph": "B", + "ts": start_time, + "pid": 0, + "tid": "GPU" + str(device_id), + "cname": color_map[job_type], + } + event_end = { + "name": job_type + "_" + str(job_id), + "cat": job_type, + "ph": "E", + "pid": 0, + "ts": end_time, + "tid": "GPU" + str(device_id), + "cname": color_map[job_type], + } + events.append(event_start) + events.append(event_end) last_end_time = end_time - return events - - -def process_step_log(log_data, device_id): - start_pattern = r'.*?NVTX range push: (\d+), time: (\d+.\d+)' - end_pattern = r'.*?NVTX range pop, time: (\d+.\d+)' - start_matches = re.findall(start_pattern, log_data) - end_matches = re.findall(end_pattern, log_data) - end_matches = end_matches[len(end_matches) - len(start_matches) :] - - step_info = [] - for start_match, stop_match in zip(start_matches, end_matches): - step_id, start_time = start_match - stop_time = stop_match - if int(step_id) >= len(step_info): - for _ in range(int(step_id) - len(step_info) + 1): - step_info.append([float('inf'), 0]) - step_info[int(step_id)] = [start_time, stop_time] - - start_step = 0 - for info in step_info: - if info[0] == float('inf'): - start_step += 1 - return step_info, start_step + step_times.append([step_start_time, step_end_time]) + step_times = step_times[start_step:] + return events, step_times def main(): args = parse_args() all_events = [] step_infos = [] + start_step = 0 + for device_id in args.devices.split(","): _logger.info(f"Process device {device_id}") device_id = int(device_id) @@ -121,31 +108,28 @@ def main(): with open(log_file, "r") as f: log_data = f.read() - step_info, start_step = process_step_log(log_data, device_id) - - if len(step_info) > len(step_infos): - for _ in range(len(step_info) - len(step_infos)): - step_infos.append([float('inf'), 0]) - for i, info in enumerate(step_info): - if info[0] == float('inf'): - continue - start_time = float(info[0].strip()) * 1000 - stop_time = float(info[1].strip()) * 1000 - - step_infos[i][0] = min(step_infos[i][0], start_time) - step_infos[i][1] = max(step_infos[i][1], stop_time) + start_step_pattern = ( + r'.*?Schedule Profiler start at step (\d+) 
and end at step.*' + ) + start_step_match = re.findall(start_step_pattern, log_data) + start_step = ( + int(start_step_match[0]) if len(start_step_match) > 0 else 0 + ) - events = process_job_log(log_data, device_id, step_infos[start_step][0]) + events, step_times = process_job_log(log_data, device_id, start_step) all_events.extend(events) + for i, info in enumerate(step_times): + if len(step_infos) <= i: + step_infos.append([float("inf"), float("-inf")]) + step_infos[i][0] = min(step_infos[i][0], info[0]) + step_infos[i][1] = max(step_infos[i][1], info[1]) for i, info in enumerate(step_infos): - if info[0] == float('inf'): - continue start_time = info[0] if i > 0: start_time = max(start_time, step_infos[i - 1][1]) event_start = { - "name": "step" + str(i), + "name": "step" + str(i + start_step), "cat": "step", "ph": "B", "ts": start_time, @@ -154,7 +138,7 @@ def main(): "cname": color_map["default"], } event_end = { - "name": "step" + str(i), + "name": "step" + str(i + start_step), "cat": "step", "ph": "E", "ts": info[1], From 8218ecb7ff677e17a4701224f08bcc58def99da8 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 14:34:27 +0000 Subject: [PATCH 28/52] fix color map logic --- .../auto_parallel/static/profiler_helper_static.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index 69489b172f07f..25430c9688b6f 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -74,7 +74,6 @@ def process_job_log(log_data, device_id, start_step): "ts": start_time, "pid": 0, "tid": "GPU" + str(device_id), - "cname": color_map[job_type], } event_end = { "name": job_type + "_" + str(job_id), @@ -83,8 +82,11 @@ def process_job_log(log_data, device_id, start_step): "pid": 0, "ts": end_time, "tid": "GPU" + str(device_id), - "cname": color_map[job_type], } + if job_type in color_map: + event_start["cname"] = (color_map[job_type],) + event_end["cname"] = (color_map[job_type],) + events.append(event_start) events.append(event_end) From 4b318fcf7d24d1a25db62e65038bd9c7fa4010f4 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Tue, 7 Nov 2023 14:36:32 +0000 Subject: [PATCH 29/52] fix color map logic --- .../auto_parallel/static/profiler_helper_static.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index 25430c9688b6f..eeb4f6ef51250 100644 --- a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -84,8 +84,8 @@ def process_job_log(log_data, device_id, start_step): "tid": "GPU" + str(device_id), } if job_type in color_map: - event_start["cname"] = (color_map[job_type],) - event_end["cname"] = (color_map[job_type],) + event_start["cname"] = color_map[job_type] + event_end["cname"] = color_map[job_type] events.append(event_start) events.append(event_end) From 9f949f24c09ca511d14d3a4a2f73cbec60f489fe Mon Sep 17 00:00:00 2001 From: AndSonder Date: Wed, 8 Nov 2023 10:58:01 +0000 Subject: [PATCH 30/52] fix bug when log step does not start from 0 --- .../new_executor/program_interpreter.cc | 53 +++----- .../new_executor/program_interpreter.h | 2 +- 
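
(Editorial sketch, not part of this patch series.) The main() changes above fold each
device's step windows into one global span per step (earliest start, latest end) and
then clamp a step's start to the previous step's end so the step lane never overlaps.
A minimal version of that folding, with invented (start_ms, end_ms) inputs:

    step_times_per_device = {
        0: [(1000.0, 1040.0), (1041.0, 1080.0)],
        1: [(1002.0, 1043.0), (1043.5, 1084.0)],
    }

    num_steps = max(len(v) for v in step_times_per_device.values())
    merged = [[float("inf"), float("-inf")] for _ in range(num_steps)]
    for windows in step_times_per_device.values():
        for i, (start, end) in enumerate(windows):
            merged[i][0] = min(merged[i][0], start)
            merged[i][1] = max(merged[i][1], end)

    # Mirrors "start_time = max(start_time, step_infos[i - 1][1])" above.
    for i in range(1, num_steps):
        merged[i][0] = max(merged[i][0], merged[i - 1][1])

    print(merged)  # [[1000.0, 1043.0], [1043.0, 1084.0]]
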
.../new_executor/standalone_executor.cc | 9 ++ paddle/fluid/pybind/pybind.cc | 1 - paddle/phi/kernels/autotune/gpu_timer.h | 113 +++++++++++------- .../static/profiler_helper_static.py | 48 ++++---- 6 files changed, 117 insertions(+), 109 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 2cd47903bacce..93206d1f257c5 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -138,6 +138,17 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool need_fetch, bool enable_auto_parallel_profiler) { enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; + + if (enable_auto_parallel_profiler_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + gpuStream_t calculated_stream = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + calculated_stream_timer_.SetStream(calculated_stream); +#endif + } + std::vector op_func_nodes; Build(feed_names, &op_func_nodes); @@ -639,34 +650,9 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time, end_time; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // double remaining_time = std::numeric_limits::max(); - // double event_time, wait_time; start_time = calculated_stream_timer_.StartTime(); end_time = calculated_stream_timer_.EndTime(); -// gpuEvent_t stop_event = calculated_stream_timer_.StopEvent(); - -// for (auto& inst : vec_instruction_) { -// for (auto event_inter : inst.EventsToWait()) { -// auto* wrapper = static_cast( -// event_inter.event_->GetEvent().get()); - -// gpuEvent_t start_event = wrapper->inner_event_.GetRawCudaEvent(); -// event_time = 0; -// PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime( -// reinterpret_cast(&event_time), start_event, stop_event)); -// remaining_time = std::min(remaining_time, event_time); -// VLOG(0) << "event time: " << std::to_string(event_time) -// << " remaining: " << std::to_string(remaining_time); -// } -// } -// if (remaining_time == std::numeric_limits::max()) { -// remaining_time = 0; -// } -// wait_time = end_time - start_time - remaining_time; -// start_time = start_time + wait_time; -// VLOG(0) << "wait time: " << std::to_string(wait_time) -// << " start time: " << std::to_string(start_time) -// << " end time: " << std::to_string(end_time); + calculated_stream_timer_.ResetTime(); #endif return std::make_tuple(start_time, end_time); } @@ -711,16 +697,6 @@ void ProgramInterpreter::Convert( #endif } - if (enable_auto_parallel_profiler_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - gpuStream_t calculate_stream = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); - calculated_stream_timer_.SetStream(calculate_stream); -#endif - } - BuildOperatorDependences(); // NOTE(Ruibiao): For cross-step stream synchronization, an event may be @@ -1091,7 +1067,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!interpreter::IsCommunicationOp(instr_node) && - !calculated_stream_timer_.IsStart()) { + !calculated_stream_timer_.IsStarted()) { VLOG(3) << "Start calculated stream timer from op: " << op->Type(); calculated_stream_timer_.Start(); } @@ -1275,7 +1251,8 @@ void ProgramInterpreter::RunInstructionAsync(size_t 
instr_id) { if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (instr_id == last_calculated_instr_id) { + if (instr_id == last_calculated_instr_id && + calculated_stream_timer_.IsStarted()) { VLOG(3) << "Stop calculated stream timer from op: " << instr_node.OpBase()->Type(); calculated_stream_timer_.Stop(); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 379edbfab8f59..ca4d1c0d21157 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -221,7 +221,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - phi::GpuTimer calculated_stream_timer_; + phi::CalculatedStreamTimer calculated_stream_timer_; #endif size_t last_calculated_instr_id; bool enable_auto_parallel_profiler_; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index aa378bd626cd6..f9667c18fd659 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -269,6 +269,15 @@ paddle::framework::FetchList StandaloneExecutor::Run( void StandaloneExecutor::SetEnableAutoParallelProfiler( bool enable_auto_parallel_profiler) { + gpuStream_t calculated_stream = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); +#endif enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58c578b9ae575..a8e5e4015af6e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2023,7 +2023,6 @@ All parameter, weight, gradient are variables in Paddle. .def("set_enable_auto_parallel_profiler", [](StandaloneExecutor &self, bool enable_auto_parallel_profiler) { - pybind11::gil_scoped_release release; self.SetEnableAutoParallelProfiler(enable_auto_parallel_profiler); }); diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 25403abbe1578..482df368af8a2 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -52,7 +52,6 @@ class GpuTimer { start_, phi::errors::PreconditionNotMet("Start Event is not ready.")); PADDLE_ENFORCE_NOT_NULL( stop_, phi::errors::PreconditionNotMet("Stop Event is not ready.")); - is_start_ = false; } ~GpuTimer() { @@ -71,7 +70,6 @@ class GpuTimer { #else cudaEventRecord(start_, stream); #endif - is_start_ = true; } void Stop(gpuStream_t stream) { @@ -80,83 +78,110 @@ class GpuTimer { #else cudaEventRecord(stop_, stream); #endif - is_start_ = false; } - void Start() { - // Note(sonder): Since it is not possible to directly obtain the start time - // of the event, "gettimeofday" is used here to retrieve it. The callback is - // used to record the start time of the event. 
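
(Editorial sketch, not part of this patch series.) The host-millisecond (start, end)
pairs captured by these callbacks are eventually logged as "Profiler Info" lines and
turned by profiler_helper_static.py into begin/end ("ph": "B"/"E") events in a Chrome
trace JSON, with "ts" in microseconds and one named lane per device. A minimal,
self-contained example of that output format (names and values are illustrative only):

    import json

    events = [
        # Lane label that chrome://tracing / Perfetto shows for tid 0 of pid 0.
        {"args": {"name": "GPU:0"}, "cat": "__metadata", "name": "thread_name",
         "ph": "M", "pid": 0, "tid": 0, "ts": 0},
        # One begin/end pair per job; "ts" is in microseconds.
        {"name": "forward_0", "cat": "forward", "ph": "B", "ts": 1000000, "pid": 0, "tid": 0},
        {"name": "forward_0", "cat": "forward", "ph": "E", "ts": 1040000, "pid": 0, "tid": 0},
    ]

    with open("minimal_trace.json", "w") as f:
        json.dump({"traceEvents": events}, f)
    # Open minimal_trace.json in chrome://tracing or ui.perfetto.dev: one 40 ms slice.
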
- Start(stream_); + float ElapsedTime() { + float milliseconds = 0; #ifdef PADDLE_WITH_HIP - hipStreamAddCallback(stream_, - RecordEventTimerCallback, - reinterpret_cast(&start_time_), - 0); + hipEventSynchronize(stop_); + hipEventElapsedTime(&milliseconds, start_, stop_); #else - cudaStreamAddCallback(stream_, - RecordEventTimerCallback, - reinterpret_cast(&start_time_), - 0); + cudaEventSynchronize(stop_); + cudaEventElapsedTime(&milliseconds, start_, stop_); #endif + return milliseconds; } - void Stop() { - Stop(stream_); + private: + gpuEvent_t start_; + gpuEvent_t stop_; +}; + +class CalculatedStreamTimer { + public: + CalculatedStreamTimer() + : calculated_stream_(nullptr), + start_time_(0), + end_time_(0), + is_started_(false) {} + + void Start() { + // Note(sonder): Since it is not possible to directly obtain the start time + // of the event, "gettimeofday" is used here to retrieve it. The callback is + // used to record the start time of the event. + if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP - hipStreamAddCallback(stream_, - RecordEventTimerCallback, - reinterpret_cast(&end_time_), - 0); + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamAddCallback(calculated_stream_, + RecordEventTimerCallback, + reinterpret_cast(&start_time_), + 0)); #else - cudaStreamAddCallback(stream_, - RecordEventTimerCallback, - reinterpret_cast(&end_time_), - 0); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamAddCallback(calculated_stream_, + RecordEventTimerCallback, + reinterpret_cast(&start_time_), + 0)); #endif + is_started_ = true; + } } - void SetStream(gpuStream_t stream) { stream_ = stream; } - - double ElapsedTime() { - float milliseconds = 0; + void Stop() { + if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP - hipEventSynchronize(stop_); - hipEventElapsedTime(&milliseconds, start_, stop_); + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamAddCallback(calculated_stream_, + RecordEventTimerCallback, + reinterpret_cast(&end_time_), + 0)); #else - cudaEventSynchronize(stop_); - cudaEventElapsedTime(&milliseconds, start_, stop_); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamAddCallback(calculated_stream_, + RecordEventTimerCallback, + reinterpret_cast(&end_time_), + 0)); #endif - return milliseconds; + is_started_ = false; + } } double StartTime() { + if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP - hipStreamSynchronize(stream_); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream_)); #else - cudaStreamSynchronize(stream_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream_)); #endif + } return start_time_; } double EndTime() { + if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP - hipStreamSynchronize(stream_); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream_)); #else - cudaStreamSynchronize(stream_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream_)); #endif + } return end_time_; } - bool IsStart() { return is_start_; } + bool IsStarted() { return is_started_; } + + void SetStream(gpuStream_t stream) { calculated_stream_ = stream; } + + void ResetTime() { + start_time_ = 0; + end_time_ = 0; + } private: - gpuEvent_t start_; - gpuEvent_t stop_; - gpuStream_t stream_; + gpuStream_t calculated_stream_; double start_time_; double end_time_; - bool is_start_; + bool is_started_; }; } // namespace phi diff --git a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py index eeb4f6ef51250..08a048c0bb68e 100644 --- 
a/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py +++ b/python/paddle/distributed/auto_parallel/static/profiler_helper_static.py @@ -44,7 +44,7 @@ def parse_args(): return args -def process_job_log(log_data, device_id, start_step): +def process_job_log(log_data, device_id): log_pattern = r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)' matches = re.findall(log_pattern, log_data) events = [] @@ -66,34 +66,32 @@ def process_job_log(log_data, device_id, start_step): step_start_time = start_time step_end_time = end_time - if len(step_times) >= start_step: - event_start = { - "name": job_type + "_" + str(job_id), - "cat": job_type, - "ph": "B", - "ts": start_time, - "pid": 0, - "tid": "GPU" + str(device_id), - } - event_end = { - "name": job_type + "_" + str(job_id), - "cat": job_type, - "ph": "E", - "pid": 0, - "ts": end_time, - "tid": "GPU" + str(device_id), - } - if job_type in color_map: - event_start["cname"] = color_map[job_type] - event_end["cname"] = color_map[job_type] + event_start = { + "name": job_type + "_" + str(job_id), + "cat": job_type, + "ph": "B", + "ts": start_time, + "pid": 0, + "tid": "GPU" + str(device_id), + } + event_end = { + "name": job_type + "_" + str(job_id), + "cat": job_type, + "ph": "E", + "pid": 0, + "ts": end_time, + "tid": "GPU" + str(device_id), + } + if job_type in color_map: + event_start["cname"] = color_map[job_type] + event_end["cname"] = color_map[job_type] - events.append(event_start) - events.append(event_end) + events.append(event_start) + events.append(event_end) last_end_time = end_time step_times.append([step_start_time, step_end_time]) - step_times = step_times[start_step:] return events, step_times @@ -118,7 +116,7 @@ def main(): int(start_step_match[0]) if len(start_step_match) > 0 else 0 ) - events, step_times = process_job_log(log_data, device_id, start_step) + events, step_times = process_job_log(log_data, device_id) all_events.extend(events) for i, info in enumerate(step_times): if len(step_infos) <= i: From ffc7b3974f7e3471567e0dca7620c71084974468 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 02:43:30 +0000 Subject: [PATCH 31/52] fix --- .../fluid/framework/new_executor/standalone_executor.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index f9667c18fd659..d1fbc030e6abe 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -269,16 +269,18 @@ paddle::framework::FetchList StandaloneExecutor::Run( void StandaloneExecutor::SetEnableAutoParallelProfiler( bool enable_auto_parallel_profiler) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t calculated_stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream)); -#else +#if defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); +#elif defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(calculated_stream)); #endif enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; +#endif } } // namespace framework From 1925dd7a3d9ab4713a0a697f5263b6079dfe3e5a Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 05:23:12 +0000 Subject: 
[PATCH 32/52] fix --- paddle/fluid/framework/new_executor/standalone_executor.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index d1fbc030e6abe..eb879e3970919 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -274,10 +274,10 @@ void StandaloneExecutor::SetEnableAutoParallelProfiler( dynamic_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); -#if defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); -#elif defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(calculated_stream)); +#elif defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); #endif enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; #endif From d299723c87da29b8f81ba43f4ef9c2d8ec34cca4 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 06:26:47 +0000 Subject: [PATCH 33/52] don't use set_enable_auto_parallel_profiler --- python/paddle/base/executor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index ea4a95e8b1064..c9e2f3d6aa4db 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -1895,10 +1895,6 @@ def _run_impl( else: tensor._copy_from(cpu_tensor, self.place) - new_exe.set_enable_auto_parallel_profiler( - self.enable_auto_parallel_profiler - ) - ret = new_exe.run(list(feed.keys()), return_numpy) set_flags(stored_flag) return ret From 5297b7aed0cc07ba6ce1c06c8cef400367840ffe Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 06:30:25 +0000 Subject: [PATCH 34/52] fix bug --- python/paddle/base/executor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index c9e2f3d6aa4db..65c1c4403416d 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -1895,6 +1895,11 @@ def _run_impl( else: tensor._copy_from(cpu_tensor, self.place) + if self.enable_auto_parallel_profiler: + new_exe.set_enable_auto_parallel_profiler( + self.enable_auto_parallel_profiler + ) + ret = new_exe.run(list(feed.keys()), return_numpy) set_flags(stored_flag) return ret From 8bfb6c07b94e04c5d44e99707b7240fde621f46b Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 06:51:36 +0000 Subject: [PATCH 35/52] disable auto_parallel_profiler when not open flag by command line --- python/paddle/base/executor.py | 4 ++-- python/paddle/distributed/auto_parallel/static/engine.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index 65c1c4403416d..72c58eb6dca01 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -1184,7 +1184,7 @@ def __init__(self, place=None): self.op_role_key = core.op_proto_and_checker_maker.kOpRoleAttrName() - self.enable_auto_parallel_profiler = False + self.enable_auto_parallel_profiler = None def _is_optimizer_op(self, op): return self.op_role_key in op.attr_names and int( @@ -1895,7 +1895,7 @@ def _run_impl( else: tensor._copy_from(cpu_tensor, self.place) - if self.enable_auto_parallel_profiler: + if self.enable_auto_parallel_profiler is not None: new_exe.set_enable_auto_parallel_profiler( 
self.enable_auto_parallel_profiler ) diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 972f435c69462..a88ccac384639 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -1485,9 +1485,10 @@ def run(self, data=None, feed=None, fetch_list=None, mode=None): ): self._prepare_reader() - self._executor.enable_auto_parallel_profiler = ( - self.enable_auto_parallel_profiler - ) + if self._strategy.pipeline.auto_parallel_profiler: + self._executor.enable_auto_parallel_profiler = ( + self.enable_auto_parallel_profiler + ) outs = self._executor.run( self.main_program, From 13b14d1755b75eef068791e1509d934969038f2c Mon Sep 17 00:00:00 2001 From: AndSonder Date: Thu, 9 Nov 2023 13:55:15 +0000 Subject: [PATCH 36/52] fix bug --- python/paddle/distributed/auto_parallel/static/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index a88ccac384639..0052e3dc075fb 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -1485,7 +1485,7 @@ def run(self, data=None, feed=None, fetch_list=None, mode=None): ): self._prepare_reader() - if self._strategy.pipeline.auto_parallel_profiler: + if self._strategy.pipeline.schedule_profiler: self._executor.enable_auto_parallel_profiler = ( self.enable_auto_parallel_profiler ) From 5bb55e16b6bff78ff03c18f9c7de04ebe4bf351e Mon Sep 17 00:00:00 2001 From: AndSonder Date: Fri, 10 Nov 2023 03:17:02 +0000 Subject: [PATCH 37/52] remove resettime --- paddle/phi/kernels/autotune/gpu_timer.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 482df368af8a2..5e214482652cc 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -172,11 +172,6 @@ class CalculatedStreamTimer { void SetStream(gpuStream_t stream) { calculated_stream_ = stream; } - void ResetTime() { - start_time_ = 0; - end_time_ = 0; - } - private: gpuStream_t calculated_stream_; double start_time_; From f422b339ac2b1e7a49f36220ea428ece2c4b9323 Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 13 Nov 2023 01:57:56 +0000 Subject: [PATCH 38/52] fix build bug --- paddle/fluid/framework/new_executor/program_interpreter.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 93206d1f257c5..9ff8cab1845e8 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -652,7 +652,6 @@ std::tuple ProgramInterpreter::InterpreterRunTime() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) start_time = calculated_stream_timer_.StartTime(); end_time = calculated_stream_timer_.EndTime(); - calculated_stream_timer_.ResetTime(); #endif return std::make_tuple(start_time, end_time); } From ed5f7fca663a0caf303ebce82700086efa09edbe Mon Sep 17 00:00:00 2001 From: AndSonder Date: Mon, 13 Nov 2023 09:59:02 +0000 Subject: [PATCH 39/52] fix --- .../new_executor/interpreter_base_impl.h | 2 +- .../framework/new_executor/interpretercore.cc | 8 +--- .../framework/new_executor/interpretercore.h | 2 +- 
.../framework/new_executor/pir_interpreter.cc | 2 +- .../framework/new_executor/pir_interpreter.h | 2 +- .../new_executor/program_interpreter.cc | 46 ++++++++----------- .../new_executor/program_interpreter.h | 8 ++-- .../new_executor/standalone_executor.cc | 16 +++---- .../new_executor/standalone_executor.h | 4 +- paddle/fluid/pybind/pybind.cc | 6 +-- paddle/phi/kernels/autotune/gpu_timer.h | 18 +++++++- python/paddle/base/executor.py | 14 +++--- .../auto_parallel/static/engine.py | 6 +-- 13 files changed, 69 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 9e441f9c9bdf2..d23246a784cc8 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -73,7 +73,7 @@ class InterpreterBaseImpl { virtual paddle::framework::FetchList Run( const std::vector& feed_names, bool need_fetch = true, - bool enable_auto_parallel_profiler = false) = 0; + bool enable_job_schedule_profiler = false) = 0; virtual void ShareWorkQueueFrom(InterpreterBaseImpl* src) = 0; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 7423d43ac4502..4f929a709345a 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -34,10 +34,6 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); -PADDLE_DEFINE_EXPORTED_bool(auto_parallel_profiler, - false, - "Enable auto parallel profiler, collecting the " - "runtime of jobs in different devices"); namespace paddle { namespace framework { @@ -76,8 +72,8 @@ FetchList InterpreterCore::Run( FetchList InterpreterCore::Run(const std::vector& feed_names, bool need_fetch, - bool enable_auto_parallel_profiler) { - return impl_->Run(feed_names, need_fetch, enable_auto_parallel_profiler); + bool enable_job_schedule_profiler) { + return impl_->Run(feed_names, need_fetch, enable_job_schedule_profiler); } void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr src) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 8fbec0e296786..77ad1b8cbc361 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -52,7 +52,7 @@ class InterpreterCore { paddle::framework::FetchList Run(const std::vector& feed_names, bool need_fetch = true, - bool enable_auto_parallel_profiler = false); + bool enable_job_schedule_profiler = false); void ShareWorkQueueFrom(std::shared_ptr src); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index ae38858139e92..ec0b4aa78920c 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1188,7 +1188,7 @@ paddle::framework::FetchList PirInterpreter::Run( FetchList PirInterpreter::Run(const std::vector& feed_names, bool need_fetch, - bool enable_auto_parallel_profiler) { + bool enable_job_schedule_profiler) { SetDeviceId(place_); CheckCUDAGraphBeforeRun(feed_names); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.h b/paddle/fluid/framework/new_executor/pir_interpreter.h index 6ff147002df12..586a750cbb08e 100644 --- 
a/paddle/fluid/framework/new_executor/pir_interpreter.h +++ b/paddle/fluid/framework/new_executor/pir_interpreter.h @@ -57,7 +57,7 @@ class PirInterpreter : public InterpreterBaseImpl { paddle::framework::FetchList Run( const std::vector& feed_names, bool need_fetch = true, - bool enable_auto_parallel_profiler = false) override; + bool enable_job_schedule_profiler = false) override; void ShareWorkQueueFrom(InterpreterBaseImpl* src) override; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 9ff8cab1845e8..89fb4da41949a 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -90,6 +90,10 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, }; PrepareForCUDAGraphCapture(); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + calculate_stream_timer_ = phi::CalculateStreamTimer(place_); +#endif } ProgramInterpreter::~ProgramInterpreter() { @@ -136,18 +140,8 @@ void ProgramInterpreter::RunImpl() { FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool need_fetch, - bool enable_auto_parallel_profiler) { - enable_auto_parallel_profiler_ = enable_auto_parallel_profiler; - - if (enable_auto_parallel_profiler_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - gpuStream_t calculated_stream = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); - calculated_stream_timer_.SetStream(calculated_stream); -#endif - } + bool enable_job_schedule_profiler) { + enable_job_schedule_profiler_ = enable_job_schedule_profiler; std::vector op_func_nodes; Build(feed_names, &op_func_nodes); @@ -650,8 +644,8 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time, end_time; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - start_time = calculated_stream_timer_.StartTime(); - end_time = calculated_stream_timer_.EndTime(); + start_time = calculate_stream_timer_.StartTime(); + end_time = calculate_stream_timer_.EndTime(); #endif return std::make_tuple(start_time, end_time); } @@ -1063,15 +1057,15 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); - if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!interpreter::IsCommunicationOp(instr_node) && - !calculated_stream_timer_.IsStarted()) { + if (enable_job_schedule_profiler_) { + if (!calculate_stream_timer_.IsStarted() && + !interpreter::IsCommunicationOp(instr_node)) { VLOG(3) << "Start calculated stream timer from op: " << op->Type(); - calculated_stream_timer_.Start(); + calculate_stream_timer_.Start(); } -#endif } +#endif if (!instr_node.IsArtificial()) { RunOperator(instr_node); @@ -1126,12 +1120,12 @@ void ProgramInterpreter::ExecuteInstructionList( exception_holder_.Clear(); - if (enable_auto_parallel_profiler_) { + if (enable_job_schedule_profiler_) { for (int i = vec_instr.size() - 1; i >= 0; --i) { auto& instr_node = vec_instr[i]; if (!interpreter::IsCommunicationOp(instr_node)) { VLOG(3) << "Last calculated op type: " << instr_node.OpBase()->Type(); - last_calculated_instr_id = i; + last_calculate_instr_id_ = i; break; } } @@ -1248,16 +1242,16 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { RunInstruction(instr_node); - if (enable_auto_parallel_profiler_) { #if defined(PADDLE_WITH_CUDA) 
|| defined(PADDLE_WITH_HIP) - if (instr_id == last_calculated_instr_id && - calculated_stream_timer_.IsStarted()) { + if (enable_job_schedule_profiler_) { + if (instr_id == last_calculate_instr_id_ && + calculate_stream_timer_.IsStarted()) { VLOG(3) << "Stop calculated stream timer from op: " << instr_node.OpBase()->Type(); - calculated_stream_timer_.Stop(); + calculate_stream_timer_.Stop(); } -#endif } +#endif if (UNLIKELY(exception_holder_.IsCaught())) { VLOG(4) << "Exception caught"; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index ca4d1c0d21157..a6fa780dc1bda 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -53,7 +53,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { paddle::framework::FetchList Run( const std::vector& feed_names, bool need_fetch = true, - bool enable_auto_parallel_profiler = false) override; + bool enable_job_schedule_profiler = false) override; void Build( const std::vector& feed_names, @@ -221,10 +221,10 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - phi::CalculatedStreamTimer calculated_stream_timer_; + phi::CalculateStreamTimer calculate_stream_timer_; #endif - size_t last_calculated_instr_id; - bool enable_auto_parallel_profiler_; + size_t last_calculate_instr_id_; + bool enable_job_schedule_profiler_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index eb879e3970919..4c0a6ee380c25 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -49,7 +49,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, ss << scope << ", "; } VLOG(6) << ss.str(); - enable_auto_parallel_profiler_ = false; + enable_job_schedule_profiler_ = false; const auto& jobs = plan_.JobList(); for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { @@ -217,20 +217,20 @@ paddle::framework::FetchList StandaloneExecutor::Run( const std::vector tmp_feed_names = {}; interpretercores_[job_idx]->Run(tmp_feed_names, /*need_fetch = */ false, - /*enable_auto_parallel_profiler = */ - enable_auto_parallel_profiler_); + /*enable_job_schedule_profiler = */ + enable_job_schedule_profiler_); } else { interpretercores_[job_idx]->Run(feed_names, /*need_fetch = */ false, - /*enable_auto_parallel_profiler = */ - enable_auto_parallel_profiler_); + /*enable_job_schedule_profiler = */ + enable_job_schedule_profiler_); } } } // record each job's run time #if defined(PADDLE_WITH_CUDA) - if (enable_auto_parallel_profiler_) { + if (enable_job_schedule_profiler_) { for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); @@ -268,7 +268,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( } void StandaloneExecutor::SetEnableAutoParallelProfiler( - bool enable_auto_parallel_profiler) { + bool enable_job_schedule_profiler) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t calculated_stream = dynamic_cast( @@ -279,7 +279,7 @@ void StandaloneExecutor::SetEnableAutoParallelProfiler( #elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); #endif - enable_auto_parallel_profiler_ = 
enable_auto_parallel_profiler; + enable_job_schedule_profiler_ = enable_job_schedule_profiler; #endif } diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 01c3a197f298c..8624b7c93b41c 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -41,7 +41,7 @@ class StandaloneExecutor { paddle::framework::FetchList Run(const std::vector& feed_names); - void SetEnableAutoParallelProfiler(bool enable_auto_parallel_profiler); + void SetEnableAutoParallelProfiler(bool enable_job_schedule_profiler); private: bool is_interpretercore_build_result_shared_{false}; @@ -58,7 +58,7 @@ class StandaloneExecutor { std::vector>> vec_force_events_to_wait_; - bool enable_auto_parallel_profiler_; + bool enable_job_schedule_profiler_; }; } // namespace framework diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a8e5e4015af6e..e0b615442cbb2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2021,9 +2021,9 @@ All parameter, weight, gradient are variables in Paddle. return py::cast(std::move(ret)); }) - .def("set_enable_auto_parallel_profiler", - [](StandaloneExecutor &self, bool enable_auto_parallel_profiler) { - self.SetEnableAutoParallelProfiler(enable_auto_parallel_profiler); + .def("set_enable_job_schedule_profiler", + [](StandaloneExecutor &self, bool enable_job_schedule_profiler) { + self.SetEnableAutoParallelProfiler(enable_job_schedule_profiler); }); py::class_( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + } + void Start() { // Note(sonder): Since it is not possible to directly obtain the start time // of the event, "gettimeofday" is used here to retrieve it. 
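The program_interpreter.cc hunks above reduce to a simple bracketing rule: the calculate-stream timer starts at the first non-communication instruction and stops after the instruction whose index was recorded as the last non-communication one, so communication ops at the edges of a job never widen the measured span. A pure-Python sketch of that rule (the op names and the is_comm predicate are illustrative stand-ins for interpreter::IsCommunicationOp):

    def calculation_span(op_types, is_comm):
        # Mirror of the interpreter logic: Start() fires at the first
        # non-communication op; Stop() fires after the op whose index equals
        # last_calculate_instr_id_, i.e. the last non-communication op.
        first = next((i for i, t in enumerate(op_types) if not is_comm(t)), None)
        last = next(
            (i for i in range(len(op_types) - 1, -1, -1) if not is_comm(op_types[i])),
            None,
        )
        return None if first is None else (first, last)

    ops = ["recv_v2", "matmul_v2", "elementwise_add", "c_allreduce_sum", "matmul_v2", "send_v2"]
    print(calculation_span(ops, lambda t: t.startswith(("recv_", "send_", "c_"))))  # (1, 4)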
The callback is diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index 72c58eb6dca01..c7f04df8f12a4 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -823,9 +823,9 @@ def run(self, feed_names, return_numpy=True): ) return tensors - def set_enable_auto_parallel_profiler(self, enable_auto_parallel_profiler): - self._new_exe.set_enable_auto_parallel_profiler( - enable_auto_parallel_profiler + def set_enable_job_schedule_profiler(self, enable_job_schedule_profiler): + self._new_exe.set_enable_job_schedule_profiler( + enable_job_schedule_profiler ) def _create_new_executor(self): @@ -1184,7 +1184,7 @@ def __init__(self, place=None): self.op_role_key = core.op_proto_and_checker_maker.kOpRoleAttrName() - self.enable_auto_parallel_profiler = None + self.enable_job_schedule_profiler = None def _is_optimizer_op(self, op): return self.op_role_key in op.attr_names and int( @@ -1895,9 +1895,9 @@ def _run_impl( else: tensor._copy_from(cpu_tensor, self.place) - if self.enable_auto_parallel_profiler is not None: - new_exe.set_enable_auto_parallel_profiler( - self.enable_auto_parallel_profiler + if self.enable_job_schedule_profiler is not None: + new_exe.set_enable_job_schedule_profiler( + self.enable_job_schedule_profiler ) ret = new_exe.run(list(feed.keys()), return_numpy) diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 0052e3dc075fb..452ed114b2230 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -251,7 +251,7 @@ def __init__( paddle.framework.set_flags({'FLAGS_new_executor_sequential_run': 1}) paddle.framework.set_flags({'FLAGS_new_executor_static_build': 1}) - self.enable_auto_parallel_profiler = False + self.enable_job_schedule_profiler = False def _prepare_data_spec(self, data, split, batch_size): inputs_spec = [] @@ -1486,8 +1486,8 @@ def run(self, data=None, feed=None, fetch_list=None, mode=None): self._prepare_reader() if self._strategy.pipeline.schedule_profiler: - self._executor.enable_auto_parallel_profiler = ( - self.enable_auto_parallel_profiler + self._executor.enable_job_schedule_profiler = ( + self.enable_job_schedule_profiler ) outs = self._executor.run( From 718cf1779daaed80df2fbbdd391a9dcfd616915e Mon Sep 17 00:00:00 2001 From: andsonder Date: Tue, 14 Nov 2023 11:01:05 +0000 Subject: [PATCH 40/52] remove set enable --- .../new_executor/standalone_executor.cc | 25 ++++--------------- .../new_executor/standalone_executor.h | 8 +++--- paddle/fluid/pybind/pybind.cc | 15 +++++------ python/paddle/base/executor.py | 9 ++++--- 4 files changed, 19 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 4c0a6ee380c25..1037f3da0e5c9 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -157,7 +157,8 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, } paddle::framework::FetchList StandaloneExecutor::Run( - const std::vector& feed_names) { + const std::vector& feed_names, + const bool enable_job_schedule_profiler) { platform::RecordEvent record_event( "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); @@ -218,19 +219,19 @@ paddle::framework::FetchList StandaloneExecutor::Run( interpretercores_[job_idx]->Run(tmp_feed_names, 
/*need_fetch = */ false, /*enable_job_schedule_profiler = */ - enable_job_schedule_profiler_); + enable_job_schedule_profiler); } else { interpretercores_[job_idx]->Run(feed_names, /*need_fetch = */ false, /*enable_job_schedule_profiler = */ - enable_job_schedule_profiler_); + enable_job_schedule_profiler); } } } // record each job's run time #if defined(PADDLE_WITH_CUDA) - if (enable_job_schedule_profiler_) { + if (enable_job_schedule_profiler) { for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { const auto& job = jobs[job_idx]; const std::string& job_type = job->Type(); @@ -267,21 +268,5 @@ paddle::framework::FetchList StandaloneExecutor::Run( } } -void StandaloneExecutor::SetEnableAutoParallelProfiler( - bool enable_job_schedule_profiler) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - gpuStream_t calculated_stream = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); -#if defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(calculated_stream)); -#elif defined(PADDLE_WITH_CUDA) - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream)); -#endif - enable_job_schedule_profiler_ = enable_job_schedule_profiler; -#endif -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 8624b7c93b41c..621cbf9335dfc 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -39,9 +39,9 @@ class StandaloneExecutor { ~StandaloneExecutor() {} - paddle::framework::FetchList Run(const std::vector& feed_names); - - void SetEnableAutoParallelProfiler(bool enable_job_schedule_profiler); + paddle::framework::FetchList Run( + const std::vector& feed_names, + const bool enable_job_schedule_profiler = false); private: bool is_interpretercore_build_result_shared_{false}; @@ -57,8 +57,6 @@ class StandaloneExecutor { std::vector>> vec_force_events_to_wait_; - - bool enable_job_schedule_profiler_; }; } // namespace framework diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e0b615442cbb2..2951a0ba8bf65 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2012,22 +2012,19 @@ All parameter, weight, gradient are variables in Paddle. 
const interpreter::Plan &, Scope *>()) .def("run", - [](StandaloneExecutor &self, std::vector feed_names) { + [](StandaloneExecutor &self, + std::vector feed_names, + constbool enable_job_schedule_profiler = false) { paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; - ret = self.Run(feed_names); + ret = self.Run(feed_names, enable_job_schedule_profiler); } return py::cast(std::move(ret)); }) - .def("set_enable_job_schedule_profiler", - [](StandaloneExecutor &self, bool enable_job_schedule_profiler) { - self.SetEnableAutoParallelProfiler(enable_job_schedule_profiler); - }); - - py::class_>(m, "Job") + py::class_>(m, "Job") .def(py::init(), py::arg("type")) .def("micro_batch_id", &framework::interpreter::Job::MicroBatchId) .def("type", &framework::interpreter::Job::Type) diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index c7f04df8f12a4..317b92218832e 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -797,6 +797,7 @@ def __init__(self, place, plan, scope): self._plan = plan self._scope = scope self._new_exe = self._create_new_executor() + self._enable_job_schedule_profiler = False def run(self, feed_names, return_numpy=True): """ @@ -808,7 +809,9 @@ def run(self, feed_names, return_numpy=True): (the Tensor specified in the fetch list) to numpy.ndarray. if it is False, the type of the return value is a list of :code:`LoDTensor`. The default is True. """ - tensors = self._new_exe.run(feed_names)._move_to_list() + tensors = self._new_exe.run( + feed_names, self._enable_job_schedule_profiler + )._move_to_list() if return_numpy: tensors = as_numpy(tensors, copy=True) if not get_flags("FLAGS_enable_pir_in_executor")[ @@ -824,9 +827,7 @@ def run(self, feed_names, return_numpy=True): return tensors def set_enable_job_schedule_profiler(self, enable_job_schedule_profiler): - self._new_exe.set_enable_job_schedule_profiler( - enable_job_schedule_profiler - ) + self._enable_job_schedule_profiler = enable_job_schedule_profiler def _create_new_executor(self): new_exe = core.StandaloneExecutor(self._place, self._plan, self._scope) From f36b57b61916f9ee8c98f98f28ee775835b5e926 Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 03:15:58 +0000 Subject: [PATCH 41/52] fix build error --- paddle/fluid/framework/new_executor/program_interpreter.cc | 2 +- paddle/fluid/framework/new_executor/standalone_executor.cc | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 89fb4da41949a..ced72c782aace 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -642,7 +642,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { } std::tuple ProgramInterpreter::InterpreterRunTime() { - double start_time, end_time; + double start_time = 0, end_time = 0; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) start_time = calculate_stream_timer_.StartTime(); end_time = calculate_stream_timer_.EndTime(); diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 1037f3da0e5c9..5dac646b82dcb 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -49,7 +49,6 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, ss << scope 
<< ", "; } VLOG(6) << ss.str(); - enable_job_schedule_profiler_ = false; const auto& jobs = plan_.JobList(); for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { From 444b7a7c7ab1439c625ec61093550d736e176cbb Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 05:44:39 +0000 Subject: [PATCH 42/52] fix build error --- paddle/fluid/pybind/pybind.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2951a0ba8bf65..336c00b6ab4f2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2014,7 +2014,7 @@ All parameter, weight, gradient are variables in Paddle. .def("run", [](StandaloneExecutor &self, std::vector feed_names, - constbool enable_job_schedule_profiler = false) { + const bool enable_job_schedule_profiler = false) { paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; From f49491674829cbfba45a761eaf20af9e7ae738f5 Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 05:46:26 +0000 Subject: [PATCH 43/52] fix build error --- paddle/fluid/pybind/pybind.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 336c00b6ab4f2..1cb0bb718aea0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2021,10 +2021,10 @@ All parameter, weight, gradient are variables in Paddle. ret = self.Run(feed_names, enable_job_schedule_profiler); } return py::cast(std::move(ret)); - }) + }); - py::class_>(m, "Job") + py::class_>(m, "Job") .def(py::init(), py::arg("type")) .def("micro_batch_id", &framework::interpreter::Job::MicroBatchId) .def("type", &framework::interpreter::Job::Type) From a2b598841e8c7e0c5b6712ee4ea2f3c474792d95 Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 14:44:26 +0000 Subject: [PATCH 44/52] fix ci error --- paddle/phi/kernels/autotune/gpu_timer.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index e93315495f983..bfe421d380eff 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -113,10 +113,12 @@ class CalculateStreamTimer { start_time_(0), end_time_(0), is_started_(false) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) calculated_stream_ = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); +#endif } void Start() { From fb748d9f512dc34eea5f13325a5f2396f385b0cf Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 16:25:45 +0000 Subject: [PATCH 45/52] fix --- .../fluid/framework/new_executor/program_interpreter.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index ced72c782aace..7e15497a4263b 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -91,7 +91,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, PrepareForCUDAGraphCapture(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) calculate_stream_timer_ = phi::CalculateStreamTimer(place_); #endif } @@ -643,7 +643,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time = 0, end_time = 0; 
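The (start_time, end_time) pair returned by InterpreterRunTime() above is what ultimately becomes a matched "B"/"E" event pair in the chrome-trace output that profiler_helper_static.py assembles earlier in this series. A small sketch of that mapping; the helper builds the dictionaries inline, so the function name and the sample values below are illustrative only:

    def job_to_trace_events(job_type, job_id, start_time, end_time, device_id, cname=None):
        # Same fields as the events built in process_job_log: one "B" (begin)
        # and one "E" (end) event sharing name/cat/pid/tid, with the recorded
        # timestamps passed through unchanged.
        base = {
            "name": f"{job_type}_{job_id}",
            "cat": job_type,
            "pid": 0,
            "tid": "GPU" + str(device_id),
        }
        begin, end = dict(base, ph="B", ts=start_time), dict(base, ph="E", ts=end_time)
        if cname is not None:  # optional color, as with the helper's color_map lookup
            begin["cname"] = end["cname"] = cname
        return begin, end

    print(job_to_trace_events("forward", 0, 1700000000.10, 1700000000.25, device_id=3))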
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) start_time = calculate_stream_timer_.StartTime(); end_time = calculate_stream_timer_.EndTime(); #endif @@ -1057,7 +1057,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) if (enable_job_schedule_profiler_) { if (!calculate_stream_timer_.IsStarted() && !interpreter::IsCommunicationOp(instr_node)) { @@ -1242,7 +1242,7 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { RunInstruction(instr_node); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) if (enable_job_schedule_profiler_) { if (instr_id == last_calculate_instr_id_ && calculate_stream_timer_.IsStarted()) { From aa5570d950ab8d87d58fa52420cd36edfa6ff092 Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 19:43:46 +0000 Subject: [PATCH 46/52] fix run error --- paddle/phi/kernels/autotune/gpu_timer.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index bfe421d380eff..f8f4b6bf849cd 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -108,17 +108,15 @@ class CalculateStreamTimer { end_time_(0), is_started_(false) {} - explicit CalculateStreamTimer(const paddle::platform::Place place) + explicit CalculateStreamTimer(const paddle::platform::Place &place) : calculated_stream_(nullptr), start_time_(0), end_time_(0), is_started_(false) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) calculated_stream_ = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); -#endif } void Start() { From 6b18e10b5eabcc7b41d2170bc2d8f33595a25a0a Mon Sep 17 00:00:00 2001 From: andsonder Date: Wed, 15 Nov 2023 21:11:06 +0000 Subject: [PATCH 47/52] fix --- .../framework/new_executor/program_interpreter.cc | 6 +++++- paddle/phi/kernels/autotune/gpu_timer.h | 11 ----------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 7e15497a4263b..bdd9a6b452e11 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -92,7 +92,11 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, PrepareForCUDAGraphCapture(); #if defined(PADDLE_WITH_CUDA) - calculate_stream_timer_ = phi::CalculateStreamTimer(place_); + gpuStream_t calculated_stream = + dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + calculate_stream_timer_.SetStream(calculated_stream); #endif } diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index f8f4b6bf849cd..338920a1ae363 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -108,17 +108,6 @@ class CalculateStreamTimer { end_time_(0), is_started_(false) {} - explicit CalculateStreamTimer(const paddle::platform::Place &place) - : calculated_stream_(nullptr), - start_time_(0), - end_time_(0), - is_started_(false) { - calculated_stream_ = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - } - void Start() { // Note(sonder): Since it is not possible to directly 
obtain the start time // of the event, "gettimeofday" is used here to retrieve it. The callback is From f096253f7bc47a2c62ce343ae85c2755f6c81965 Mon Sep 17 00:00:00 2001 From: andsonder Date: Thu, 16 Nov 2023 00:55:05 +0000 Subject: [PATCH 48/52] fix --- .../new_executor/program_interpreter.cc | 18 ++++++++------- paddle/fluid/pybind/pybind.cc | 2 +- python/paddle/base/executor.py | 23 ++++++++----------- .../auto_parallel/static/engine.py | 7 +++--- 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index bdd9a6b452e11..55763d7f021b7 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -90,14 +90,6 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, }; PrepareForCUDAGraphCapture(); - -#if defined(PADDLE_WITH_CUDA) - gpuStream_t calculated_stream = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - calculate_stream_timer_.SetStream(calculated_stream); -#endif } ProgramInterpreter::~ProgramInterpreter() { @@ -147,6 +139,16 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool enable_job_schedule_profiler) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; +#if defined(PADDLE_WITH_CUDA) + if (enable_job_schedule_profiler_) { + gpuStream_t calculated_stream = + dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + calculate_stream_timer_.SetStream(calculated_stream); + } +#endif + std::vector op_func_nodes; Build(feed_names, &op_func_nodes); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d7c98ec112481..e9877b5325357 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2014,7 +2014,7 @@ All parameter, weight, gradient are variables in Paddle. .def("run", [](StandaloneExecutor &self, std::vector feed_names, - const bool enable_job_schedule_profiler = false) { + bool enable_job_schedule_profiler = false) { paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index ed6a3cb03ab86..be8dda2b0bf08 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -807,9 +807,10 @@ def __init__(self, place, plan, scope): self._plan = plan self._scope = scope self._new_exe = self._create_new_executor() - self._enable_job_schedule_profiler = False - def run(self, feed_names, return_numpy=True): + def run( + self, feed_names, return_numpy=True, enable_job_schedule_profiler=False + ): """ Args: feed_names(list): This parameter represents the input names of the model. @@ -820,7 +821,7 @@ def run(self, feed_names, return_numpy=True): the type of the return value is a list of :code:`LoDTensor`. The default is True. 
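Once the renamed flag reaches the C++ executor, each job's span ends up in the worker log, and profiler_helper_static.py recovers it with the log_pattern regex shown earlier in this series. A minimal parsing sketch; the literal log text below is an assumption reconstructed from that regex, since the VLOG call that prints it is not part of this excerpt:

    import re

    log_pattern = (
        r'.*?Profiler Info: Job \((\d+)\), type = (\w+), micro_batch_id = (\d+), '
        r'job_start_time = (\d+.\d+), job_end_time = (\d+.\d+)'
    )
    sample = (
        "I1117 10:00:00.000000 12345 standalone_executor.cc] "
        "Profiler Info: Job (3), type = backward, micro_batch_id = 1, "
        "job_start_time = 1700000000.123456, job_end_time = 1700000000.234567"
    )
    for job_id, job_type, micro_batch_id, start, end in re.findall(log_pattern, sample):
        print(job_id, job_type, micro_batch_id, float(end) - float(start))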
""" tensors = self._new_exe.run( - feed_names, self._enable_job_schedule_profiler + feed_names, enable_job_schedule_profiler )._move_to_list() if return_numpy: tensors = as_numpy(tensors, copy=True) @@ -836,9 +837,6 @@ def run(self, feed_names, return_numpy=True): ) return tensors - def set_enable_job_schedule_profiler(self, enable_job_schedule_profiler): - self._enable_job_schedule_profiler = enable_job_schedule_profiler - def _create_new_executor(self): new_exe = core.StandaloneExecutor(self._place, self._plan, self._scope) @@ -1207,7 +1205,7 @@ def __init__(self, place=None): self.op_role_key = core.op_proto_and_checker_maker.kOpRoleAttrName() - self.enable_job_schedule_profiler = None + self.enable_job_schedule_profiler = False def _is_optimizer_op(self, op): return self.op_role_key in op.attr_names and int( @@ -1929,12 +1927,11 @@ def _run_impl( else: tensor._copy_from(cpu_tensor, self.place) - if self.enable_job_schedule_profiler is not None: - new_exe.set_enable_job_schedule_profiler( - self.enable_job_schedule_profiler - ) - - ret = new_exe.run(list(feed.keys()), return_numpy) + ret = new_exe.run( + list(feed.keys()), + return_numpy, + self.enable_job_schedule_profiler, + ) set_flags(stored_flag) return ret diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 16bc6b74e432e..93f1d477baf5c 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -1495,10 +1495,9 @@ def run(self, data=None, feed=None, fetch_list=None, mode=None): ): self._prepare_reader() - if self._strategy.pipeline.schedule_profiler: - self._executor.enable_job_schedule_profiler = ( - self.enable_job_schedule_profiler - ) + self._executor.enable_job_schedule_profiler = ( + self.enable_job_schedule_profiler + ) outs = self._executor.run( self.main_program, From 560fb61b077f56c2b498ddec8d46a2bacc871e81 Mon Sep 17 00:00:00 2001 From: andsonder Date: Thu, 16 Nov 2023 02:35:54 +0000 Subject: [PATCH 49/52] fix calculate_stream_timer logic --- .../new_executor/program_interpreter.cc | 26 +++++++------------ .../new_executor/program_interpreter.h | 2 +- paddle/phi/kernels/autotune/gpu_timer.h | 14 ++++++++++ 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 55763d7f021b7..65f865a862d6d 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -90,6 +90,10 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, }; PrepareForCUDAGraphCapture(); + +#if defined(PADDLE_WITH_CUDA) + calculate_stream_timer_ = std::make_unique(place); +#endif } ProgramInterpreter::~ProgramInterpreter() { @@ -139,16 +143,6 @@ FetchList ProgramInterpreter::Run(const std::vector& feed_names, bool enable_job_schedule_profiler) { enable_job_schedule_profiler_ = enable_job_schedule_profiler; -#if defined(PADDLE_WITH_CUDA) - if (enable_job_schedule_profiler_) { - gpuStream_t calculated_stream = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); - calculate_stream_timer_.SetStream(calculated_stream); - } -#endif - std::vector op_func_nodes; Build(feed_names, &op_func_nodes); @@ -650,8 +644,8 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double 
start_time = 0, end_time = 0; #if defined(PADDLE_WITH_CUDA) - start_time = calculate_stream_timer_.StartTime(); - end_time = calculate_stream_timer_.EndTime(); + start_time = calculate_stream_timer_->StartTime(); + end_time = calculate_stream_timer_->EndTime(); #endif return std::make_tuple(start_time, end_time); } @@ -1065,10 +1059,10 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { instr_node.WaitEvent(place_); #if defined(PADDLE_WITH_CUDA) if (enable_job_schedule_profiler_) { - if (!calculate_stream_timer_.IsStarted() && + if (!calculate_stream_timer_->IsStarted() && !interpreter::IsCommunicationOp(instr_node)) { VLOG(3) << "Start calculated stream timer from op: " << op->Type(); - calculate_stream_timer_.Start(); + calculate_stream_timer_->Start(); } } #endif @@ -1251,10 +1245,10 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { #if defined(PADDLE_WITH_CUDA) if (enable_job_schedule_profiler_) { if (instr_id == last_calculate_instr_id_ && - calculate_stream_timer_.IsStarted()) { + calculate_stream_timer_->IsStarted()) { VLOG(3) << "Stop calculated stream timer from op: " << instr_node.OpBase()->Type(); - calculate_stream_timer_.Stop(); + calculate_stream_timer_->Stop(); } } #endif diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index a6fa780dc1bda..5b8b4dbb36a81 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -221,7 +221,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector hookfuncs_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - phi::CalculateStreamTimer calculate_stream_timer_; + std::unique_ptr calculate_stream_timer_; #endif size_t last_calculate_instr_id_; bool enable_job_schedule_profiler_; diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 338920a1ae363..2d06de4f4cac3 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -108,10 +108,23 @@ class CalculateStreamTimer { end_time_(0), is_started_(false) {} + explicit CalculateStreamTimer(const paddle::platform::Place &place) + : calculated_stream_(nullptr), + start_time_(0), + end_time_(0), + is_started_(false), + place_(place) {} + void Start() { // Note(sonder): Since it is not possible to directly obtain the start time // of the event, "gettimeofday" is used here to retrieve it. The callback is // used to record the start time of the event. 
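The Note(sonder) comment above captures the constraint behind this design: a CUDA event only yields an elapsed time, while the pipeline timeline wants absolute, comparable timestamps, so the timer enqueues host callbacks on the calculation stream and reads the wall clock inside them. A deliberately simplified, CPU-only model of that idea (ToyStream and the sleeps are stand-ins, not Paddle or CUDA APIs):

    import queue
    import threading
    import time

    class ToyStream:
        """Runs enqueued callables in order on a worker thread, like a stream."""

        def __init__(self):
            self._work = queue.Queue()
            threading.Thread(target=self._run, daemon=True).start()

        def _run(self):
            while True:
                self._work.get()()

        def launch(self, fn):        # stand-in for launching a kernel
            self._work.put(fn)

        def add_callback(self, fn):  # host callback: runs after prior work drains
            self._work.put(fn)

        def synchronize(self):
            done = threading.Event()
            self._work.put(done.set)
            done.wait()

    stream, times = ToyStream(), {}
    stream.add_callback(lambda: times.setdefault("start", time.time()))  # Start()
    for _ in range(3):
        stream.launch(lambda: time.sleep(0.01))                          # "kernels"
    stream.add_callback(lambda: times.setdefault("end", time.time()))    # Stop()
    stream.synchronize()
    print("job span:", times["end"] - times["start"], "seconds")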
+ if (!is_started_) { + calculated_stream_ = + dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + } if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( @@ -180,6 +193,7 @@ class CalculateStreamTimer { double start_time_; double end_time_; bool is_started_; + const paddle::platform::Place place_; }; } // namespace phi From bbb30715ded07fe80aa12c8a2874ed20e7b38f03 Mon Sep 17 00:00:00 2001 From: andsonder Date: Fri, 17 Nov 2023 02:07:24 +0000 Subject: [PATCH 50/52] remove fluid head --- paddle/phi/kernels/autotune/gpu_timer.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 2d06de4f4cac3..e160b3567e3ac 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -14,11 +14,11 @@ #pragma once -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" @@ -108,7 +108,7 @@ class CalculateStreamTimer { end_time_(0), is_started_(false) {} - explicit CalculateStreamTimer(const paddle::platform::Place &place) + explicit CalculateStreamTimer(const phi::Place &place) : calculated_stream_(nullptr), start_time_(0), end_time_(0), @@ -120,10 +120,9 @@ class CalculateStreamTimer { // of the event, "gettimeofday" is used here to retrieve it. The callback is // used to record the start time of the event. if (!is_started_) { - calculated_stream_ = - dynamic_cast( - paddle::platform::DeviceContextPool::Instance().Get(place_)) - ->stream(); + calculated_stream_ = dynamic_cast( + phi::DeviceContextPool::Instance().Get(place_)) + ->stream(); } if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP @@ -193,7 +192,7 @@ class CalculateStreamTimer { double start_time_; double end_time_; bool is_started_; - const paddle::platform::Place place_; + const phi::Place place_; }; } // namespace phi From e15c19e36616da20d94e1b35f18d8c9481fa82b6 Mon Sep 17 00:00:00 2001 From: andsonder Date: Fri, 17 Nov 2023 03:32:10 +0000 Subject: [PATCH 51/52] fix build error --- paddle/phi/kernels/autotune/gpu_timer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index e160b3567e3ac..c50a571a7fd95 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/port.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" From 989348c33dc9a7912b0a345680558bcec42de497 Mon Sep 17 00:00:00 2001 From: andsonder Date: Fri, 17 Nov 2023 15:02:48 +0000 Subject: [PATCH 52/52] set default value for enable_job_schedule_profiler --- paddle/fluid/framework/new_executor/program_interpreter.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 65f865a862d6d..04d116a53a525 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ 
b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -54,7 +54,8 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place,
       block_(block),
       stream_analyzer_(place),
       execution_config_(execution_config),
-      var_scope_(scope) {
+      var_scope_(scope),
+      enable_job_schedule_profiler_(false) {
   VLOG(4) << "ProgramInterpreter(): " << this << " on " << place_;
   exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
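Taken together, the series leaves profiler_helper_static.py with one event list per device, which only needs the standard chrome-trace envelope to become a timeline viewable in chrome://tracing or Perfetto. A closing sketch of that last step; the {"traceEvents": ...} wrapper is the common Chrome trace format and an assumption here, since the script's own output code is not part of this excerpt:

    import json

    def write_chrome_trace(all_events, path="pipeline_profile.json"):
        # all_events is the concatenation of the per-device "B"/"E" events
        # gathered in main(); the trace viewer groups them by pid/tid.
        with open(path, "w") as f:
            json.dump({"traceEvents": all_events}, f)

    write_chrome_trace([
        {"name": "forward_0", "cat": "forward", "ph": "B", "ts": 100.0, "pid": 0, "tid": "GPU0"},
        {"name": "forward_0", "cat": "forward", "ph": "E", "ts": 150.0, "pid": 0, "tid": "GPU0"},
    ])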