diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ecb8ef807b4ba..e27e8d74fc2d0 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -10,7 +10,6 @@ if(TRACE_PROFILE) link_directories("${SCALOPUS_PATH}/scalopus/so/scalopus_tracing") link_directories("${SCALOPUS_PATH}/scalopus/so/thirdparty/seasocks/src/main/c/") endif() - #windows treat symbolic file as a real file, which is different with unix #We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) diff --git a/paddle/fluid/framework/boxps_worker.cc b/paddle/fluid/framework/boxps_worker.cc index 9b1c7c82d951c..5826bbd5ca8da 100644 --- a/paddle/fluid/framework/boxps_worker.cc +++ b/paddle/fluid/framework/boxps_worker.cc @@ -35,6 +35,7 @@ limitations under the License. */ #include "paddle/fluid/platform/collective_helper.h" #endif +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) // The producer side. #include #include @@ -43,6 +44,7 @@ limitations under the License. 
*/ #include #include #include +#endif DECLARE_bool(enable_sync_dense_moment); DECLARE_bool(check_nan_inf); @@ -742,9 +744,13 @@ void BoxPSWorker::TrainFilesWithProfiler() { main_timer.Resume(); reader_timer.Resume(); +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_START("PackBatchTask", dev_ctx_->Wait()); +#endif batch_size = PackBatchTask(); +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_END("PackBatchTask", dev_ctx_->Wait()); +#endif reader_timer.Pause(); if (batch_size <= 0) { break; @@ -756,19 +762,27 @@ void BoxPSWorker::TrainFilesWithProfiler() { cal_timer.Resume(); int op_id = 0; dev_ctx_->Wait(); - std::vector op_names; + // std::vector op_names; +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_START("ops run",); +#endif for (auto& op : ops_) { +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) RUNTIME_TRACE_SCOPE_START((op->Type()+" run").c_str(),); +#endif timeline.Start(); op->Run(*thread_scope_, place_); dev_ctx_->Wait(); timeline.Pause(); op_total_time[op_id++] += timeline.ElapsedUS(); +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) RUNTIME_TRACE_SCOPE_END((op->Type()+" run").c_str(),); +#endif } dev_ctx_->Wait(); +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_END("ops run",); +#endif cal_timer.Pause(); #if defined(PADDLE_WITH_CUDA) if (FLAGS_check_nan_inf) { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index accfb19c4605d..379374ac90ab4 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -561,22 +561,29 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, } } +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) 
TRACE_SCOPE_START("executor ops run",); +#endif for (int64_t i = start_op_index; i < end_op_index; ++i) { auto& op = ctx->ops_[i]; +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) xpu_wait(); RUNTIME_TRACE_SCOPE_START((op->Type()+" run").c_str(),); +#endif op->Run(*local_scope, place_); if (gc) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get()); } +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) RUNTIME_TRACE_SCOPE_END((op->Type()+" run").c_str(),); xpu_wait(); +#endif } +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_END("executor ops run",); - +#endif auto callback = [scope, local_scope, keep_kids]() { if (local_scope != scope) { VLOG(4) << "Delete scope: " << local_scope; diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 0721beccd86d4..a53ed1e7c9c4e 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) // The producer side. #include #include @@ -36,6 +37,7 @@ limitations under the License. 
*/ #include #include #include +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc index de7f119829d77..84d7711d30591 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cc +++ b/paddle/fluid/framework/fleet/box_wrapper.cc @@ -144,7 +144,7 @@ void BoxWrapper::EndPass(bool need_save_delta) { << "MB, available: " << (available >> 20) << "MB"; } #endif -#ifdef TRACE_PROFILE +#if defined(TRACE_PROFILE) && defined(PADDLE_WITH_XPU_KP) static int trace_pass_count = std::getenv("TRACE_PASS_NUM")!=NULL ? std::stoi(std::string(std::getenv("TRACE_PASS_NUM"))): 1; diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index d212e5b67bbf6..f525d01da6f7e 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -48,6 +48,8 @@ limitations under the License. */ #include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/framework/fleet/metrics.h" #include "paddle/fluid/framework/fleet/box_wrapper_kernel.h" + +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) // The producer side. #include #include @@ -56,6 +58,7 @@ limitations under the License. */ #include #include #include +#endif #define BUF_SIZE 1024 * 1024 DECLARE_int32(fix_dayid); @@ -393,7 +396,7 @@ class BoxWrapper { use_xpu_sparse_map_ = true; } #endif -#ifdef TRACE_PROFILE +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) // Client side to produce the tracepoints. 
factory = std::make_shared(); const auto server = factory->serve(); @@ -909,7 +912,7 @@ class BoxWrapper { std::set slot_eval_set_; std::atomic dataset_id_{0}; std::atomic round_id_{0}; -#ifdef TRACE_PROFILE +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) scalopus::TransportLoopbackFactory::Ptr factory; std::shared_ptr manager; scalopus::CatapultRecorder::Ptr catapult_recorder; diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index 99aea16507db7..23dc9dc546c07 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) // The producer side. #include #include @@ -24,6 +25,7 @@ limitations under the License. */ #include #include #include +#endif DECLARE_bool(enable_pullpush_dedup_keys); diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ef1c7c6be6aa5..7e37a6d00ba78 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -48,18 +48,26 @@ void NaiveExecutor::Run() { platform::RegisterModelLayout(ops_, place_); #endif platform::ScopedFlushDenormal flush; +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_START("naive_executor ops run",); +#endif for (auto &op : ops_) { +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) xpu_wait(); RUNTIME_TRACE_SCOPE_START((op->Type()+" run").c_str(),); +#endif VLOG(4) << std::this_thread::get_id() << " run " << op->DebugStringEx(scope_) << " on scope " << scope_; op->SetIsCalledByExecutor(false); op->Run(*scope_, place_); +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) 
RUNTIME_TRACE_SCOPE_END((op->Type()+" run").c_str(),); xpu_wait(); +#endif } +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_END("naive_executor ops run",); +#endif } void NaiveExecutor::CreateVariables(const ProgramDesc &desc, diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index db09736b0302c..91551c8f1eab9 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -24,6 +24,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) // The producer side. #include #include @@ -32,6 +33,7 @@ #include #include #include +#endif namespace phi { class DenseTensor; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 5086b536a351e..e32483aebd002 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -45,7 +45,9 @@ class DenseTensor; #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#endif +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) // The producer side. #include #include @@ -1669,10 +1671,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope, 1, platform::EventRole::kInnerOp); if (need_prepare_data_) { +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_START("PrepareData",); +#endif transfer_scope = PrepareData( scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx); +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) TRACE_SCOPE_END("PrepareData",);//wait? +#endif } } // exec scope is the scope that kernel actually executed on. 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b9f23c07eba89..d6d0f6a21ade3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1944,7 +1944,7 @@ AnalysisPredictor::~AnalysisPredictor() { } device_contexts_.clear(); - #ifdef TRACE_PROFILE + #if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) // need to guarantee we propagate the tracepoints before we stop the interval. std::this_thread::sleep_for(std::chrono::milliseconds(1000)); catapult_recorder->stopInterval(); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index caa9529558f0f..8365beada2754 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -105,7 +105,7 @@ class AnalysisPredictor : public PaddlePredictor { config_.EnableMemoryOptim(false); } predictor_id_ = inference::GetUniqueId(); -#ifdef TRACE_PROFILE +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) // Client side to produce the tracepoints. factory = std::make_shared(); const auto server = factory->serve(); @@ -556,7 +556,7 @@ class AnalysisPredictor : public PaddlePredictor { #endif friend class paddle_infer::experimental::InternalUtils; -#ifdef TRACE_PROFILE +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) scalopus::TransportLoopbackFactory::Ptr factory; std::shared_ptr manager; scalopus::CatapultRecorder::Ptr catapult_recorder; diff --git a/paddle/fluid/operators/collective/c_mixallgather_op.cc b/paddle/fluid/operators/collective/c_mixallgather_op.cc index 6fbbd4042e06f..5471788761acf 100644 --- a/paddle/fluid/operators/collective/c_mixallgather_op.cc +++ b/paddle/fluid/operators/collective/c_mixallgather_op.cc @@ -29,6 +29,7 @@ limitations under the License.
*/ #endif #include "paddle/fluid/operators/tensor_formatter.h" +#if defined(TRACE_PROFILE) && (defined(PADDLE_WITH_XPU_KP) || defined(PADDLE_WITH_XPU)) // The producer side. #include #include @@ -37,6 +38,7 @@ limitations under the License. */ #include #include #include +#endif namespace paddle { namespace operators {