diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index c8f3dc0d673f1..ff0c28c00716d 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -882,21 +882,22 @@ target_link_libraries(
   conditional_block_op_helper
   pylayer_op_helper)
 
-cc_library(
-  parallel_executor
-  SRCS parallel_executor.cc
-  DEPS ssa_graph_executor graph build_strategy collective_helper
-       variable_helper)
-
 cc_library(
   compiled_program
   SRCS compiled_program.cc
-  DEPS graph build_strategy)
+  DEPS graph
+       build_strategy
+       reference_count_pass
+       eager_deletion_pass
+       buffer_shared_inplace_op_pass
+       buffer_shared_cross_op_memory_reuse_pass
+       inplace_addto_op_pass
+       set_reader_device_info_utils)
 
 cc_library(
   executor_cache
   SRCS executor_cache.cc
-  DEPS parallel_executor pir_transforms pir)
+  DEPS pir_transforms pir)
 cc_library(
   prune
   SRCS prune.cc
@@ -962,7 +963,8 @@ cc_library(
     phi
     common
     imperative_flag
-    layer)
+    layer
+    op_dialect_vjp)
 
 cc_library(
   type_info
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 20c1444f238eb..3a22cb8131c90 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -7,12 +7,6 @@ if(WITH_PSCORE)
   endif()
   set_source_files_properties(
     reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(
-    threaded_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS
-                                              ${DISTRIBUTE_COMPILE_FLAGS})
-  set_source_files_properties(
-    async_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS
-                                           ${DISTRIBUTE_COMPILE_FLAGS})
 endif()
 
 set(op_handle_srcs
@@ -81,36 +75,6 @@ endif()
 
 add_dependencies(detail_op_handle framework_proto auto_parallel_proto xxhash)
 
-set(ssa_graph_executor_srcs
-    ssa_graph_executor.cc
-    threaded_ssa_graph_executor.cc
-    parallel_ssa_graph_executor.cc
-    async_ssa_graph_executor.cc
-    bind_threaded_ssa_graph_executor.cc
-    fast_threaded_ssa_graph_executor.cc
-    scope_buffered_ssa_graph_executor.cc
-    scope_buffered_monitor.cc)
-set(SSA_GRAPH_EXECUTOR_DEPS
-    graph
-    framework_proto
-    detail_op_handle
-    reference_count_pass
-    eager_deletion_pass
-    buffer_shared_inplace_op_pass
-    buffer_shared_cross_op_memory_reuse_pass
-    inplace_addto_op_pass
-    set_reader_device_info_utils
-    scope
-    simple_threadpool
-    device_context
-    profiler
-    selected_rows_utils)
-
-cc_library(
-  ssa_graph_executor
-  SRCS ${ssa_graph_executor_srcs}
-  DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
-
 set(IR_PASS_DEPS
     graph_viz_pass
     multi_devices_graph_pass
diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
deleted file mode 100644
index a93e59b27aebb..0000000000000
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ /dev/null
@@ -1,208 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" - -#include "paddle/fluid/framework/variable_helper.h" - -#if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" -#endif - -namespace paddle { -namespace framework { -namespace details { - -inline void InitVarsInScope(const std::vector &var_infos, - Scope *scope, - Scope *local_scope) { - VLOG(3) << "InitVarsInScope"; - for (auto &info : var_infos) { - if (info.persistable_) { // Persistable - auto *var = scope->FindVar(info.name_); - if (var != nullptr) { - VLOG(2) << info.name_ - << " has been initialized beforehand in global scope, skipped"; - continue; - } - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - InitializeVariable(local_scope->Var(info.name_), info.type_); - } - } -} - -// get CommContext and remote send and recv op -void ProcessGraph(std::vector graphs, Scope *scope) { return; } - -AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - std::vector graphs) - : strategy_(strategy), - local_scopes_(local_scopes), - local_exec_scopes_(local_exec_scopes), - pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), - places_(places), - graphs_(std::move(graphs)), - executors_(), - run_futures_(), - var_infos_() { - VLOG(3) << "build AsyncSSAGraphExecutor"; - PADDLE_ENFORCE_EQ(places_.size(), - local_scopes_.size(), - platform::errors::InvalidArgument( - "The number of places and the number of local scopes " - "should be equal, but got number of places is %d and " - "number of local scopes is %d.", - places_.size(), - local_scopes_.size())); - PADDLE_ENFORCE_EQ( - local_scopes_.size(), - local_exec_scopes_.size(), - platform::errors::InvalidArgument( - "The number of local scopes and the number of local execution scopes " - "should be equal, but got number of local scopes is %d and " - "number of local execution scopes is %d.", - local_scopes_.size(), - local_exec_scopes_.size())); - - // set the correct size of thread pool to each device. - strategy_.num_threads_ = strategy_.num_threads_ < places_.size() - ? 1UL - : strategy_.num_threads_ / places_.size(); - VLOG(1) << "set num_threads: " << strategy_.num_threads_ - << " to run the operators of the graph on each device."; - for (size_t i = 0; i < places.size(); ++i) { - executors_.emplace_back( - new details::ThreadedSSAGraphExecutor(strategy_, - {local_scopes_[i]}, - {local_exec_scopes_[i]}, - {places_[i]}, - graphs_[i])); - } - - for (auto &node : graphs_[0]->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos_.emplace_back(); - var_infos_.back().name_ = node->Var()->Name(); - var_infos_.back().type_ = node->Var()->GetType(); - var_infos_.back().persistable_ = node->Var()->Persistable(); - } - } - - for (size_t i = local_scopes_.size(); i >= 1; --i) { - InitVarsInScope( - var_infos_, local_scopes_[i - 1], local_exec_scopes_[i - 1]); - } - ProcessGraph(graphs_, local_scopes_[0]); -} - -void AsyncSSAGraphExecutor::StartOffPythonTrainLoop(bool return_merged) { - VLOG(3) << "StartOffPythonTrainLoop size = " << places_.size(); - for (size_t i = 1; i < places_.size(); ++i) { - auto call = [this, i, return_merged]() -> void { - VLOG(3) << "start off python thread " << i; - try { - while (true) { - executors_[i]->Run({}, return_merged); - } - } catch (...) 
{ - exception_holder_.Catch(std::current_exception()); - VLOG(3) << "get exception type = " << exception_holder_.Type(); - } - VLOG(3) << "thread " << i << " exited!"; - }; - run_futures_.emplace_back(pool_->enqueue(std::move(call))); - } -} - -void AsyncSSAGraphExecutor::HandleException() { - if (exception_holder_.IsCaught()) { - for (auto &f : run_futures_) { - VLOG(3) << "wait future"; - f.wait(); - } - VLOG(3) << "caught exception " << exception_holder_.Type() - << ", rethrow it"; - run_futures_.clear(); - exception_holder_.ReThrow(); - } -} - -FetchResultType AsyncSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - PADDLE_ENFORCE_EQ(return_merged, - true, - platform::errors::InvalidArgument( - "AsyncSSAGraphExecutor does not support unmerged " - "results to be fetched!")); - // init once - if (run_futures_.empty() && places_.size() > 1) { -#if defined PADDLE_WITH_PSCORE - if (strategy_.thread_barrier_) { - paddle::distributed::Communicator::GetInstance()->BarrierTriggerReset( - places_.size()); - } -#endif - exception_holder_.Clear(); - StartOffPythonTrainLoop(return_merged); - } - - if (places_.size() == 1) { - exception_holder_.Clear(); - } - - FetchResultType fetch_data; - - try { - fetch_data = executors_[0]->Run(fetch_tensors, return_merged); - } catch (...) { - exception_holder_.Catch(std::current_exception()); - } - - HandleException(); - - FetchList ret; - auto &val = PADDLE_GET(FetchList, fetch_data); - for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { - if (data_is_lod_tensor(val.at(fetch_idx))) { - std::vector lodtensor_ptrs; - lodtensor_ptrs.push_back( - &(PADDLE_GET(phi::DenseTensor, val.at(fetch_idx)))); - phi::DenseTensor var; - MergeLoDTensor(&var, lodtensor_ptrs, platform::CPUPlace()); - ret.emplace_back(var); - } else { - auto array = PADDLE_GET(LoDTensorArray, val.at(fetch_idx)); - LoDTensorArray item_array; - item_array.reserve(array.size()); - for (auto &item : array) { - std::vector lodtensor_ptrs; - lodtensor_ptrs.push_back(&item); - item_array.emplace_back(); - MergeLoDTensor( - &(item_array.back()), lodtensor_ptrs, platform::CPUPlace()); - } - ret.emplace_back(item_array); - } - } - return ret; -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h deleted file mode 100644 index bca1f0b460ff4..0000000000000 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
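Note: the AsyncSSAGraphExecutor deleted above funnels exceptions raised on its per-device worker threads into an ExceptionHolder and rethrows them on the calling thread in HandleException(). A minimal standalone sketch of that pattern, using plain std::exception_ptr and std::async instead of Paddle's ExceptionHolder and ThreadPool (all names below are illustrative only):

```cpp
// Capture the first exception thrown by any worker and rethrow it on the
// calling thread, mirroring the catch-into-holder / HandleException() flow.
#include <exception>
#include <future>
#include <mutex>
#include <stdexcept>
#include <vector>

class ExceptionCollector {
 public:
  void Catch(std::exception_ptr e) {
    std::lock_guard<std::mutex> lock(mu_);
    if (!first_) first_ = e;  // keep only the first exception
  }
  void RethrowIfCaught() {
    std::lock_guard<std::mutex> lock(mu_);
    if (first_) std::rethrow_exception(first_);
  }

 private:
  std::mutex mu_;
  std::exception_ptr first_;
};

int main() {
  ExceptionCollector holder;
  std::vector<std::future<void>> futures;
  for (int i = 0; i < 4; ++i) {
    futures.emplace_back(std::async(std::launch::async, [&holder, i] {
      try {
        if (i == 2) throw std::runtime_error("worker failed");
      } catch (...) {
        holder.Catch(std::current_exception());
      }
    }));
  }
  for (auto &f : futures) f.wait();  // join all workers first
  holder.RethrowIfCaught();          // then surface the failure to the caller
}
```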
- -#pragma once - -#include -#include -#include -#include - -#include "ThreadPool.h" -#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" - -namespace paddle { -namespace framework { -namespace details { - -struct VarInfo { - std::string name_; - proto::VarType::Type type_; - bool persistable_; -}; - -class AsyncSSAGraphExecutor final : public SSAGraphExecutor { - public: - AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - std::vector graphs); - ~AsyncSSAGraphExecutor() final = default; - const ir::Graph &Graph() const override { return *graphs_[0]; } - - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged) override; - - private: - void StartOffPythonTrainLoop(bool return_merged); - void HandleException(); - - private: - ExecutionStrategy strategy_; - std::vector local_scopes_; - std::vector local_exec_scopes_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; - std::vector places_; - std::vector graphs_; - - std::vector> executors_; - ExceptionHolder exception_holder_; - std::vector> run_futures_; - std::vector var_infos_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc deleted file mode 100644 index f36f29a5b0217..0000000000000 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.cc +++ /dev/null @@ -1,350 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
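The bind-threaded executor deleted below pins every XPU device to its own single-thread pool (pool_ together with place_to_index_), so operators targeting one device run strictly in submission order while different devices proceed in parallel. A hedged, self-contained sketch of that per-device serialization; SerialWorker is a made-up type, not Paddle API:

```cpp
#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class SerialWorker {
 public:
  SerialWorker() : worker_([this] { Loop(); }) {}
  ~SerialWorker() {
    {
      std::lock_guard<std::mutex> lock(mu_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();  // drain remaining tasks, then stop
  }
  void Enqueue(std::function<void()> task) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      tasks_.push(std::move(task));
    }
    cv_.notify_one();
  }

 private:
  void Loop() {
    for (;;) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mu_);
        cv_.wait(lock, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) return;
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();  // tasks bound to this device run one at a time, in order
    }
  }

  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> tasks_;
  bool stop_ = false;
  std::thread worker_;  // declared last: started after, stopped before, the rest
};

int main() {
  std::vector<SerialWorker> per_device(2);      // e.g. one worker per device
  per_device[0].Enqueue([] { /* run an op on device 0 */ });
  per_device[1].Enqueue([] { /* run an op on device 1 */ });
}
```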
-#include "paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h" - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/fetch_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -#if defined(PADDLE_WITH_XPU) -namespace paddle { -namespace framework { -namespace details { - -BindThreadedSSAGraphExecutor::BindThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph) - : strategy_(strategy), - local_scopes_(local_scopes), - local_exec_scopes_(local_exec_scopes), - places_(places), - graph_(graph), - prepare_pool_(1), - multi_device_op_pool_(1) { - for (uint32_t i = 0; i < places.size(); i++) { - pool_.emplace_back(std::unique_ptr<::ThreadPool>(new ::ThreadPool(1))); - } - int index = 0; - for (uint32_t i = 0; i < places.size(); i++) { - int id = places_[i].device; - if (place_to_index_.find(id) == place_to_index_.end()) { - place_to_index_[id] = index; - index++; - } - } - for (auto &op : ir::FilterByNodeWrapper(*graph_)) { - int dep = static_cast(op->NotReadyInputSize()); - op_deps_.emplace(op, dep); - if (dep == 0) { - bootstrap_ops_.emplace_back(op); - } - } - PADDLE_ENFORCE_GT(op_deps_.size(), - 0, - platform::errors::PreconditionNotMet( - "The graph doesn't have operators.")); - PrepareAtomicOpDeps(); -} - -static std::vector get_children(OpHandleBase *op) { - auto &outputs = op->Outputs(); - std::vector ret; - for (auto &output : outputs) { - ret.insert( - ret.end(), output->PendingOps().begin(), output->PendingOps().end()); - } - return ret; -} - -static std::vector get_parents(OpHandleBase *op) { - auto &inputs = op->Inputs(); - std::vector ret; - for (auto &input : inputs) { - if (input->GeneratedOp() != nullptr) { - ret.push_back(input->GeneratedOp()); - } - } - return ret; -} - -FetchResultType BindThreadedSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - VLOG(3) << "enter BindThreadedSSAGraphExecutor Run"; - return RunMainStream(fetch_tensors, return_merged); -} - -// use 2 streams to run op. 
The first stream is main stream and will run -// most op exclude op depending on multi device(e.g., all_reduce, fetch op) -FetchResultType BindThreadedSSAGraphExecutor::RunMainStream( - const std::vector &fetch_tensors, bool return_merged) { - VLOG(3) << "enter MainStream Run"; - std::unique_ptr> - op_deps = atomic_op_deps_.get(); - PrepareAtomicOpDeps(); - - error_state = 0; - paddle::framework::FetchResultType fetches; - if (return_merged) { - fetches = FetchList(fetch_tensors.size()); - } else { - fetches = FetchUnmergedList(fetch_tensors.size()); - } - std::unordered_map> fetched_vars; - std::vector fetch_ops; - std::vector ready_fetch_ops; - auto ready_ops = std::make_shared>(); - exception_.Clear(); - - InsertFetchOps(fetch_tensors, - &fetches, - &fetched_vars, - op_deps.get(), - &fetch_ops, - &ready_fetch_ops, - return_merged); - for (auto cur_op : bootstrap_ops_) { - ready_ops->Push(cur_op); - } - for (auto cur_op : ready_fetch_ops) { - ready_ops->Push(cur_op); - } - - { - std::lock_guard lock(mutex_); - exec_op_count_ = 0; - } - - platform::XPUPlace cur_place; - std::size_t cur_count = 0; - - while (cur_count < op_deps->size()) { - cur_count++; - auto cur_op = ready_ops->Pop(); - // when exception, get cur_op == nullptr - if (cur_op == nullptr) { - std::lock_guard lock(mutex_); - exec_op_count_ = op_deps->size(); - break; - } - auto dev_ctxes_ = cur_op->DeviceContext(); - if (cur_op->IsMultiDeviceTransfer()) { - RunMultiDeviceOpAsync(cur_op, op_deps.get(), ready_ops); - continue; - } else { - cur_place = dev_ctxes_.begin()->first; - int cur_index = place_to_index_[cur_place.device]; - RunOpAsyncMainStream(cur_op, op_deps.get(), ready_ops, cur_index); - } - } - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return exec_op_count_ >= op_deps->size(); }); - } - - if (exception_.IsCaught()) { - ExecutionFinal(&fetch_ops); - } - - // Wait FetchOps. - ClearFetchOp(graph_, &fetch_ops); - return fetches; -} - -void BindThreadedSSAGraphExecutor::InsertFetchOps( - const std::vector &fetch_tensors, - FetchResultType *fetches, - std::unordered_map> *fetched_vars, - std::unordered_map *op_deps, - std::vector *fetch_ops, - std::vector *ready_fetch_ops, - bool return_merged) { - std::unordered_set fetch_tensor_set(fetch_tensors.begin(), - fetch_tensors.end()); - for (auto &fetch_var_name : fetch_tensor_set) { - for (auto &var_map : graph_->Get(kGraphVars)) { - auto it = var_map.find(fetch_var_name); - if (it != var_map.end()) { - (*fetched_vars)[fetch_var_name].push_back(*it->second.rbegin()); - } - } - } - - for (size_t i = 0; i < fetch_tensors.size(); ++i) { - auto &var_name = fetch_tensors.at(i); - auto fetched_var_it = fetched_vars->find(var_name); - PADDLE_ENFORCE_NE( - fetched_var_it, - fetched_vars->end(), - platform::errors::PreconditionNotMet( - "Cannot find fetched variable(%s) in current computation graph. " - "Possible reasons are:\n" - " 1. The variable to be fetched is not defined in main program.\n" - " 2. The variable to be fetched is not an input or output of any " - "operator.\n" - " 3. Confirm that you have used the fetch `Variable` format " - "instead of the string literal('%s') in `fetch_list` parameter " - "when using `executor.run` method. 
In other words, the format of " - "`executor.run(fetch_list=[fetch_var])`(fetch_var is a Variable) " - "is recommended.", - var_name, - var_name)); - - auto &vars = fetched_var_it->second; - - ir::Node *fetch_node = - graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); - auto *op = new FetchOpHandle(fetch_node, - fetches, - i, - &local_scopes_, - &local_exec_scopes_, - return_merged); - fetch_ops->emplace_back(op); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - for (auto &p : places_) { - op->SetDeviceContext(p, pool.Get(p)); - } - - for (auto *var : vars) { - op->AddInput(var); - } - - int dep = static_cast(op->NotReadyInputSize()); - (*op_deps)[op].dep_num = dep; - (*op_deps)[op].op = op; - if (dep == 0) { - ready_fetch_ops->emplace_back(op); - } - } -} -// RunMultiDeviceOpAsync function is used for Communicated OPs -// like all_reduce\broadcast among multicards. -void BindThreadedSSAGraphExecutor::RunMultiDeviceOpAsync( - OpHandleBase *op, - std::unordered_map *op_deps, - std::shared_ptr> ready_ops) { - multi_device_op_pool_.enqueue([=] { - try { - if (error_state == 0 && LIKELY(!strategy_.dry_run_)) { - auto dev_ctxes = op->DeviceContext(); - auto &inputs = op->Inputs(); - for (auto &input : inputs) { - if (input && input->GeneratedOp() != nullptr) { - auto dev_ctxes = input->GeneratedOp()->DeviceContext(); - for (auto &item : dev_ctxes) { - ((platform::XPUDeviceContext *)(item.second))->Wait(); - } - } else { - VLOG(3) << "No generated op:" << op->Name(); - } - } - op->Run(strategy_.use_device_); - auto &outputs = op->Outputs(); - for (auto &output : outputs) { - for (auto &pending_op : output->PendingOps()) { - std::atomic &deps = op_deps->at(pending_op).dep_num; - if (deps.fetch_sub(1) == 1) { - ready_ops->Push(pending_op); - } - } - } - } else if (error_state) { - ready_ops->Push(nullptr); - } - } catch (...) { - error_state = 1; - exception_.Catch(std::current_exception()); - ready_ops->Push(nullptr); - } - { - std::lock_guard lock(mutex_); - exec_op_count_++; - cv_.notify_all(); - } - }); -} -// RunOpAsyncMainStream function is used for computed OPs -void BindThreadedSSAGraphExecutor::RunOpAsyncMainStream( - OpHandleBase *op, - std::unordered_map *op_deps, - std::shared_ptr> ready_ops, - int index) { - pool_[index]->enqueue([=] { - try { - if (error_state == 0 && LIKELY(!strategy_.dry_run_)) { - op->Run(strategy_.use_device_); - auto &outputs = op->Outputs(); - for (auto &output : outputs) { - for (auto &pending_op : output->PendingOps()) { - std::atomic &deps = op_deps->at(pending_op).dep_num; - if (deps.fetch_sub(1) == 1) { - ready_ops->Push(pending_op); - } - } - } - } else if (error_state) { - ready_ops->Push(nullptr); - } - } catch (...) 
{ - error_state = 1; - exception_.Catch(std::current_exception()); - ready_ops->Push(nullptr); - } - { - std::lock_guard lock(mutex_); - exec_op_count_++; - cv_.notify_all(); - } - }); -} - -void BindThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { - atomic_op_deps_ = prepare_pool_.enqueue([&] { - auto *op_deps = new std::unordered_map; - for (auto &pair : op_deps_) { - (*op_deps)[pair.first].dep_num = pair.second; - (*op_deps)[pair.first].op = pair.first; - } - return std::unique_ptr< - std::unordered_map>(op_deps); - }); -} - -const ir::Graph &BindThreadedSSAGraphExecutor::Graph() const { return *graph_; } - -void BindThreadedSSAGraphExecutor::ExecutionFinal( - std::vector *fetch_ops) { - VLOG(3) << "caught exception " << exception_.Type() << ", rethrow it"; - ClearFetchOp(graph_, fetch_ops); - exception_.ReThrow(); -} - -} // namespace details -} // namespace framework -} // namespace paddle -#endif diff --git a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h deleted file mode 100644 index ac07eb9fa5d1b..0000000000000 --- a/paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include - -#include // NOLINT -#include -#include // NOLINT -#include -#include -#include - -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/details/exception_holder.h" -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_executor.h" - -#if defined(PADDLE_WITH_XPU) -namespace paddle { -namespace framework { -class Scope; -namespace details { - -struct RunningItem { - std::atomic dep_num; - OpHandleBase *op; -}; - -class OpHandleBase; -class BindThreadedSSAGraphExecutor : public SSAGraphExecutor { - public: - BindThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph); - // FeedFetchList Run(const std::vector &fetch_tensors) override; - // Run a SSAGraph by a thread pool - // Use topological sort algorithm - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged) override; - const ir::Graph &Graph() const override; - - private: - FetchResultType RunMainStream(const std::vector &fetch_tensors, - bool return_merged); - - // Note(zcd): the ThreadPool should be placed last so that ThreadPool should - // be destroyed first. 
- ExecutionStrategy strategy_; - std::vector local_scopes_; - std::vector local_exec_scopes_; - std::vector places_; - ir::Graph *graph_; - - std::unordered_map op_deps_; - std::unordered_map place_to_index_; - std::vector bootstrap_ops_; - - std::unique_ptr stream_op_count_; - - std::future< - std::unique_ptr>> - atomic_op_deps_; - ExceptionHolder exception_; - - std::vector> pool_; - ::ThreadPool prepare_pool_; - ::ThreadPool multi_device_op_pool_; - - std::mutex mutex_; - std::condition_variable cv_; - uint32_t exec_op_count_; - std::atomic error_state; - - void RunOpAsyncMainStream( - OpHandleBase *op, - std::unordered_map *op_deps, - std::shared_ptr> ready_ops, - int index); - - void RunMultiDeviceOpAsync( - OpHandleBase *op, - std::unordered_map *op_deps, - std::shared_ptr> ready_ops); - - void PrepareAtomicOpDeps(); - - int get_pool_thread_index(int device_id); - - inline void ExecutionFinal(std::vector *fetch_ops); - - void InsertFetchOps( - const std::vector &fetch_tensors, - FetchResultType *fetches, - std::unordered_map> - *fetched_vars, - std::unordered_map *op_deps, - std::vector *fetch_ops, - std::vector *ready_fetch_ops, - bool return_merged); -}; -} // namespace details -} // namespace framework -} // namespace paddle - -#endif diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc deleted file mode 100644 index f947794ccdd05..0000000000000 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ /dev/null @@ -1,391 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
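The Note(zcd) comment above relies on the C++ guarantee that non-static data members are destroyed in reverse declaration order: declaring the thread pools after the state their tasks touch means the pools (and the worker threads they join) are torn down first, while that state is still alive. A small illustration with hypothetical types, not Paddle classes:

```cpp
#include <iostream>

struct Data {
  ~Data() { std::cout << "Data destroyed\n"; }
};

struct Worker {
  ~Worker() { std::cout << "Worker destroyed\n"; }
};

struct Executor {
  Data data;      // declared first -> destroyed last
  Worker worker;  // declared last  -> destroyed first, while data is still valid
};

int main() {
  Executor e;
  // Prints "Worker destroyed" then "Data destroyed" when e goes out of scope.
}
```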
-#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/fetch_async_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle::framework::details { - -FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph) - : strategy_(strategy), - local_scopes_(local_scopes), - local_exec_scopes_(local_exec_scopes), - places_(places), - graph_(graph), - op_deps_(), - bootstrap_ops_(), - fetch_ctxs_(), - remaining_(0), - atomic_op_deps_(), - pool_(nullptr), - // add one more thread for generate op_deps - prepare_pool_(1), - traced_ops_() { - platform::EmplaceDeviceContexts( - &fetch_ctxs_, - places, - /*disable_setting_default_stream_for_allocator=*/true, - /*stream_priority=*/0); - if (ir::IsTopologySortOperationsUnique(*graph_)) { - VLOG(10) << "Change thread number to 1 because the topology sort order is " - "unique"; - strategy_.num_threads_ = 1; - traced_ops_.clear(); - for (auto *op_node : TopologySortOperations(*graph_)) { - if (op_node->IsWrappedBy()) { - traced_ops_.emplace_back(&(op_node->Wrapper())); - } - } - } - pool_ = std::make_unique<::ThreadPool>(strategy.num_threads_); - for (auto &op : ir::FilterByNodeWrapper(*graph_)) { - int dep = static_cast(op->NotReadyInputSize()); - op_deps_.emplace(op, dep); - if (dep == 0) { - bootstrap_ops_.emplace_back(op); - } - } - PADDLE_ENFORCE_GT(op_deps_.size(), - 0, - platform::errors::PreconditionNotMet( - "The graph doesn't have operators.")); - PrepareAtomicOpDeps(); -} - -FetchResultType FastThreadedSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - VLOG(3) << "enter FastThreadedSSAGraphExecutor Run"; - std::unique_ptr event( - new platform::RecordEvent("FastThreadedSSAGraphExecutorPrepare", - platform::TracerEventType::UserDefined, - 2)); - std::unique_ptr>> - op_deps = atomic_op_deps_.get(); - PrepareAtomicOpDeps(); - size_t num_ops = op_deps->size(); - - FetchResultType fetches; - if (return_merged) { - fetches = FetchList(fetch_tensors.size()); - } else { - fetches = FetchUnmergedList(fetch_tensors.size()); - } - std::unordered_map> fetched_vars; - std::vector fetch_ops; - std::vector ready_fetch_ops; - exception_.Clear(); - InsertFetchOps(fetch_tensors, - &fetches, - &fetched_vars, - op_deps.get(), - &fetch_ops, - &ready_fetch_ops, - return_merged); - event.reset(nullptr); - if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) { - // If the num_threads is 1, we can record the order of operator's - // execution in the first iteration, and in subsequent iterations, - // run the recorded operators directly. This strategy could make the - // execution faster. 
- VLOG(3) << "Run the traced ops."; - bool is_exception_free = - RunTracedOps(traced_ops_) && RunTracedOps(fetch_ops); - if (!is_exception_free) { - ExecutionFinal(&fetch_ops); - } - } else { - traced_ops_.clear(); - remaining_ = 0; - auto complete_q = std::make_shared>(); - VLOG(3) << "number of bootstrap_ops_: " << bootstrap_ops_.size(); - VLOG(3) << "number of ready_fetch_ops: " << ready_fetch_ops.size(); - for (auto op : bootstrap_ops_) { - RunOpAsync(op_deps.get(), op, complete_q); - } - for (auto op : ready_fetch_ops) { - RunOpAsync(op_deps.get(), op, complete_q); - } - - size_t num_complete = 0; - while (num_complete != op_deps->size()) { - size_t num_comp = complete_q->Pop(); - if (num_comp == -1UL) { - int remaining = 0; - while (true) { - remaining = remaining_; - if (remaining == 0) { - break; - } - for (int i = 0; i < remaining; ++i) { - complete_q->Pop(); - } - } - if (exception_.IsCaught()) { - ExecutionFinal(&fetch_ops); - } - } - num_complete += num_comp; - } - } - // Wait FetchOps. - if (!fetch_ops.empty()) { - platform::RecordEvent record_wait( - "FastThreadedSSAGraphExecutor::WaitFetchOps", - platform::TracerEventType::Operator, - 1); - ClearFetchOp(graph_, &fetch_ops); - - for (auto &place : places_) { - fetch_ctxs_[place].get().get()->Wait(); - } - } - - return fetches; -} - -void FastThreadedSSAGraphExecutor::InsertFetchOps( - const std::vector &fetch_tensors, - FetchResultType *fetches, - std::unordered_map> *fetched_vars, - std::unordered_map> *op_deps, - std::vector *fetch_ops, - std::vector *ready_fetch_ops, - bool return_merged) { - std::unordered_set fetch_tensor_set(fetch_tensors.begin(), - fetch_tensors.end()); - for (auto &fetch_var_name : fetch_tensor_set) { - for (auto &var_map : graph_->Get(kGraphVars)) { - auto it = var_map.find(fetch_var_name); - if (it != var_map.end()) { - (*fetched_vars)[fetch_var_name].push_back(*it->second.rbegin()); - } - } - } - - for (size_t i = 0; i < fetch_tensors.size(); ++i) { - auto &var_name = fetch_tensors.at(i); - auto fetched_var_it = fetched_vars->find(var_name); - PADDLE_ENFORCE_NE( - fetched_var_it, - fetched_vars->end(), - platform::errors::PreconditionNotMet( - "Cannot find fetched variable(%s) in current computation graph. " - "Possible reasons are:\n" - " 1. The variable to be fetched is not defined in main program.\n" - " 2. The variable to be fetched is not an input or output of any " - "operator.\n" - " 3. Confirm that you have used the fetch `Variable` format " - "instead of the string literal('%s') in `fetch_list` parameter " - "when using `executor.run` method. 
In other words, the format of " - "`executor.run(fetch_list=[fetch_var])`(fetch_var is a Variable) " - "is recommended.", - var_name, - var_name)); - - auto &vars = fetched_var_it->second; - - ir::Node *fetch_node = - graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); - auto *op = new FetchAsyncOpHandle(fetch_node, - fetches, - i, - &local_scopes_, - &local_exec_scopes_, - return_merged); - fetch_ops->emplace_back(op); - - for (auto &p : places_) { - op->SetDeviceContext(p, fetch_ctxs_[p].get().get()); - } - - for (auto *var : vars) { - op->AddInput(var); - } - - for (auto *var : vars) { - auto *op = var->GeneratedOp(); - auto *compute_op = dynamic_cast(op); - if (compute_op) { - compute_op->SetLockAndRecordEventFree(false); - } - } - - int dep = static_cast(op->NotReadyInputSize()); - (*op_deps)[op] = dep; - if (dep == 0) { - ready_fetch_ops->emplace_back(op); - } - } -} - -bool FastThreadedSSAGraphExecutor::RunOp( - OpHandleBase *op, - const std::shared_ptr> &complete_q, - size_t *complete) { - RunOpSync(op); - if (LIKELY(!exception_.IsCaught())) { - if (LIKELY(!strategy_.dry_run_)) { - RecordOps(op); - } - ++(*complete); - return true; - } else { - --remaining_; - complete_q->Push(-1UL); - return false; - } -} - -void FastThreadedSSAGraphExecutor::RunOpAsync( - std::unordered_map> *op_deps, - OpHandleBase *op, - const std::shared_ptr> &complete_q) { - ++remaining_; - platform::RecordEvent record("WorkQueue::AddTask", - platform::TracerEventType::UserDefined, - 10 /*level*/); - this->pool_->enqueue([=] { - std::deque op_queue; - op_queue.push_front(op); - - size_t complete = 0; - while (!op_queue.empty()) { - OpHandleBase *op_to_run = op_queue.back(); - op_queue.pop_back(); - - // The Op involves data transfer of multiple devices may block other - // computations emit. For example: - // 1 step, queue=[Share, Allreduce], which Share is high priority - // 2 step, Share exec, pending_op=Grad, queue=[Allreduce, Grad] - // 3 step, Allreduce run with sync. Although Allreduce and Grad do not - // have topo dependency, but Grad must wait for Allreduce to complete - // before scheduling. - // In this scenario, calculation and communication may not overlap. - // Therefore, emit the op in the queue before running multi device op. - if (op_to_run->IsMultiDeviceTransfer()) { - while (!op_queue.empty()) { - OpHandleBase *post_op = op_queue.back(); - op_queue.pop_back(); - RunOpAsync(op_deps, post_op, complete_q); - } - } - VLOG(3) << "start to run op: " << op_to_run->Name(); - if (!RunOp(op_to_run, complete_q, &complete)) { - return; - } - auto &outputs = op_to_run->Outputs(); - op_to_run = nullptr; - for (auto &output : outputs) { - for (auto &pending_op : output->PendingOps()) { - std::atomic &deps = op_deps->at(pending_op); - if (deps.fetch_sub(1) != 1) continue; - - // NOTE(zjl): op with highest priority should run - // first without switching to another thread. 
- if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) { - op_queue.push_back(pending_op); - } else if (pending_op->IsMultiDeviceTransfer()) { - // multi device ops should be scheduled prior to computing ops - op_queue.push_front(pending_op); - } else { - if (op_to_run == nullptr) { - op_to_run = pending_op; - } else { - RunOpAsync(op_deps, pending_op, complete_q); - } - } - } - } - - if (op_to_run != nullptr) { - op_queue.push_front(op_to_run); - } - } - --remaining_; - complete_q->Push(complete); - }); -} - -void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { - atomic_op_deps_ = prepare_pool_.enqueue([&] { - auto *op_deps = new std::unordered_map>; - for (auto &pair : op_deps_) { - (*op_deps)[pair.first] = pair.second; - } - return std::unique_ptr< - std::unordered_map>>(op_deps); - }); -} - -const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; } - -void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { - if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) { - traced_ops_.emplace_back(op); - } -} - -void FastThreadedSSAGraphExecutor::ExecutionFinal( - std::vector *fetch_ops) { - VLOG(3) << "caught exception " << exception_.Type() << ", rethrow it"; - // NOTE: If a new exception occurs in this ClearFetchOp operation, it will - // cause the loss of exception triggered firstly not thrown. - // Instead, the cleanup operation should only be performed when an EOF - // exception is caught. If other exceptions are triggered, the ClearFetchOp - // should not be continued. - if (exception_.Type() == "EOF") { - ClearFetchOp(graph_, fetch_ops); - } - exception_.ReThrow(); -} - -bool FastThreadedSSAGraphExecutor::RunTracedOps( - const std::vector &traced_ops) { - for (auto &op : traced_ops) { - if (!RunOpSync(op)) return false; - } - return true; -} - -bool FastThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) { - try { - VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); - if (LIKELY(!strategy_.dry_run_)) { - op->Run(strategy_.use_device_); - } - VLOG(10) << op << " " << op->Name() << " Done "; - return true; - } catch (...) { - exception_.Catch(std::current_exception()); - return false; - } -} - -} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h deleted file mode 100644 index 11e137ba9637d..0000000000000 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
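Throughout these executors, scheduling is driven by a per-op atomic count of not-yet-ready inputs: when an op finishes, it decrements the count of each consumer, and whichever thread drops a count to zero (the fetch_sub(1) == 1 test above) enqueues that consumer. A single-threaded skeleton of that bookkeeping, with hypothetical types; in the real executors the decrement happens concurrently on worker threads:

```cpp
#include <atomic>
#include <queue>
#include <unordered_map>
#include <vector>

struct Node {
  std::vector<Node*> consumers;
};

void RunGraph(const std::vector<Node*>& nodes,
              std::unordered_map<Node*, std::atomic<int>>& deps) {
  std::queue<Node*> ready;
  for (Node* n : nodes) {
    if (deps.at(n).load() == 0) ready.push(n);  // bootstrap ops with no inputs
  }

  while (!ready.empty()) {
    Node* n = ready.front();
    ready.pop();
    // ... run the op here ...
    for (Node* consumer : n->consumers) {
      if (deps.at(consumer).fetch_sub(1) == 1) {
        ready.push(consumer);  // its last missing input just finished
      }
    }
  }
}
```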
- -#pragma once -#include - -#include -#include -#include -#include - -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/details/exception_holder.h" -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_executor.h" - -namespace paddle { -namespace framework { -class Scope; -namespace details { - -class OpHandleBase; -class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { - public: - FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph); - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged) override; - const ir::Graph &Graph() const override; - - private: - // Note(zcd): the ThreadPool should be placed last so that ThreadPool should - // be destroyed first. - ExecutionStrategy strategy_; - std::vector local_scopes_; - std::vector local_exec_scopes_; - std::vector places_; - ir::Graph *graph_; - - std::unordered_map op_deps_; - std::vector bootstrap_ops_; - - std::map>> - fetch_ctxs_; - std::atomic remaining_; - - std::future< - std::unique_ptr>>> - atomic_op_deps_; - ExceptionHolder exception_; - - std::unique_ptr<::ThreadPool> pool_; - ::ThreadPool prepare_pool_; - - std::vector traced_ops_; - - bool RunOp(OpHandleBase *op, - const std::shared_ptr> &complete_q, - size_t *complete); - - void RunOpAsync(std::unordered_map> *op_deps, - OpHandleBase *op, - const std::shared_ptr> &complete_q); - - void PrepareAtomicOpDeps(); - - inline void RecordOps(OpHandleBase *op); - - inline void ExecutionFinal(std::vector *fetch_ops); - - inline bool RunOpSync(OpHandleBase *op); - - bool RunTracedOps(const std::vector &traced_ops); - - void InsertFetchOps( - const std::vector &fetch_tensors, - FetchResultType *fetches, - std::unordered_map> - *fetched_vars, - std::unordered_map> *op_deps, - std::vector *fetch_ops, - std::vector *ready_fetch_ops, - bool return_merged); -}; -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 60c3f35a4f7f7..cca7d203df5da 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -21,8 +21,8 @@ #include #include +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -41,6 +41,12 @@ namespace paddle { namespace framework { namespace details { +struct VariableInfo { + std::string name_; + proto::VarType::Type type_; + bool persistable_; +}; + // all variable in each devices. // The outside vector is the device vector. Each element of this vector is a // map from variable name to variables. 
The variables, who have the same name, @@ -63,7 +69,7 @@ constexpr char kUseHierarchicalAllReduce[] = "use_hierarchical_allreduce"; typedef std::unordered_set GraphDepVars; constexpr char kGraphDepVars[] = "dep_vars"; -typedef std::unordered_map FusedVars; +typedef std::unordered_map FusedVars; constexpr char kFusedVars[] = "fused_vars"; constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@"; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc deleted file mode 100644 index 45660331c1202..0000000000000 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" - -#include -#include -#include - -#include "paddle/fluid/framework/ir/graph_helper.h" - -namespace paddle::framework::details { - -static std::vector> SeparateMultiDevicesGraph( - ir::Graph *graph, size_t place_num) { - std::vector> graphs; - graphs.reserve(place_num); - for (size_t i = 0; i < place_num; ++i) { - ProgramDesc empty; - graphs.emplace_back(std::make_unique(empty)); - auto &g = graphs.back(); - g->Set(kGraphVars, new GraphVars(1UL)); - g->Set(kGraphDepVars, new GraphDepVars); - auto &stale_ops = - graph->Get>(details::kStaleProgramOpDescs); - g->Erase(details::kStaleProgramOpDescs); - g->Set>(details::kStaleProgramOpDescs, - new std::vector(stale_ops)); - } - auto op_handles = ir::FilterByNodeWrapper(*graph); - - for (auto &op : op_handles) { - auto &dev_ctx = op->DeviceContext(); - auto &p = dev_ctx.begin()->first; - int dev_id = p.device; // NOLINT - auto &dev_dummys = graphs[dev_id]->Get(kGraphDepVars); - graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release()); - - for (auto &var : op->Inputs()) { - auto dummy_ptr = dynamic_cast(var); - if (dummy_ptr) { - dev_dummys.insert(var); - if (graph->Nodes().count(var->Node())) - graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release()); - } - } - for (auto &var : op->Outputs()) { - auto dummy_ptr = dynamic_cast(var); - if (dummy_ptr) { - dev_dummys.insert(var); - if (graph->Nodes().count(var->Node())) - graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release()); - } - } - } - - for (size_t dev_id = 0; dev_id < place_num; ++dev_id) { - auto &dev_vars = graphs[dev_id]->Get(kGraphVars)[0]; - auto &origin_vars = graph->Get(kGraphVars)[dev_id]; - for (auto &name_pair : origin_vars) { - dev_vars.emplace(name_pair.first, name_pair.second); - for (auto &version_pair : name_pair.second) { - if (graph->Nodes().count(version_pair->Node())) { - graphs[dev_id]->AddNode( - graph->RemoveNode(version_pair->Node()).release()); - } - } - } - } - - return graphs; -} - -ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector 
&places, - ir::Graph *graph) - // TODO(Yancey1989): Copying graphs is not safely since it deleted the - // attrs. - : ParallelSSAGraphExecutor( - strategy, - local_scopes, - local_exec_scopes, - places, - SeparateMultiDevicesGraph(graph, places.size())) {} - -ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - std::vector> graphs) - : strategy_(strategy), - local_scopes_(local_scopes), - pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), - places_(places), - graphs_(std::move(graphs)), - feed_status_(places.size(), FeedStatus::kNone) { - PADDLE_ENFORCE_EQ(places_.size(), - local_scopes_.size(), - platform::errors::InvalidArgument( - "The number of places and the number of local scopes " - "should be equal, but got number of places is %d and " - "number of local scopes is %d.", - places_.size(), - local_scopes_.size())); - - PADDLE_ENFORCE_EQ(places_.size(), - graphs_.size(), - platform::errors::InvalidArgument( - "Graph number does not match place number")); - - PADDLE_ENFORCE_GT( - places_.size(), - 0, - platform::errors::InvalidArgument("place number must be larger than 0")); - - auto seq_allreduce_pass = - ir::PassRegistry::Instance().Get("all_reduce_deps_pass"); - seq_allreduce_pass->Set(kUseHierarchicalAllReduce, new bool(false)); - for (auto &graph : graphs_) { - graph.reset(seq_allreduce_pass->Apply(graph.release())); - } - - // set the correct size of thread pool to each device. - strategy_.num_threads_ = strategy_.num_threads_ < places_.size() - ? 1UL - : strategy_.num_threads_ / places_.size(); - VLOG(1) << "set num_threads: " << strategy_.num_threads_ - << " to run the operators of the graph on each device."; - for (size_t i = 0; i < places.size(); ++i) { - executors_.emplace_back( - new details::FastThreadedSSAGraphExecutor(strategy_, - local_scopes_, - local_exec_scopes, - {places_[i]}, - graphs_.at(i).get())); - } -} - -std::vector ParallelSSAGraphExecutor::Graphs() { - std::vector result; - result.reserve(graphs_.size()); - for (auto &g : graphs_) { - result.emplace_back(g.get()); - } - return result; -} - -enum ExceptionStatus { kSuccess = 0, kEOF, kOther }; - -FetchResultType ParallelSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - size_t feed_num = std::count( - feed_status_.begin(), feed_status_.end(), FeedStatus::kHasFeed); - bool has_feed = (feed_num > 0); - - VLOG(10) << "Feed num " << feed_num; - - size_t place_num = places_.size(); - - std::vector> run_futures; - std::vector exception_status(place_num, - ExceptionStatus::kSuccess); - - std::vector fetch_data; - fetch_data.reserve(place_num); - exception_holder_.Clear(); - - for (size_t i = 0; i < place_num; ++i) { - auto call = [&, i]() -> FetchResultType { - try { - if (!support_partial_feed_ || !has_feed || - feed_status_[i] == FeedStatus::kHasFeed) { - return executors_[i]->Run(fetch_tensors, return_merged); - } - } catch (platform::EOFException &) { - exception_status[i] = ExceptionStatus::kEOF; - exception_holder_.Catch(std::current_exception()); - } catch (...) 
{ - exception_status[i] = ExceptionStatus::kOther; - exception_holder_.Catch(std::current_exception()); - } - - if (return_merged) { - return FetchList(); - } else { - return FetchUnmergedList(); - } - }; - - if (pool_) { - run_futures.emplace_back(pool_->enqueue(std::move(call))); - } else { - fetch_data.emplace_back(call()); - } - } - - if (pool_) { - for (auto &f : run_futures) { - fetch_data.emplace_back(f.get()); - } - } - - bool has_exception = exception_holder_.IsCaught(); - if (!support_partial_feed_ && has_exception) { - VLOG(10) << "Exception rethrow because partial feed is not supported"; - exception_holder_.ReThrow(); - } - - std::vector is_valid(place_num, true); - - if (support_partial_feed_) { - if (has_feed) { - for (size_t i = 0; i < place_num; ++i) { - if (feed_status_[i] == FeedStatus::kNone) { - is_valid[i] = false; - } else if (exception_status[i] != ExceptionStatus::kSuccess) { - PADDLE_ENFORCE_EQ(has_exception, - true, - platform::errors::InvalidArgument( - "Thread pool raises exception but not caught")); - VLOG(10) << "Exception rethrow because non-EOF exception raises when " - "feed is given"; - exception_holder_.ReThrow(); - } - } - } else { - for (size_t i = 0; i < place_num; ++i) { - if (exception_status[i] == ExceptionStatus::kOther) { - PADDLE_ENFORCE_EQ(has_exception, - true, - platform::errors::InvalidArgument( - "Thread pool raises exception but not caught")); - VLOG(10) << "Exception rethrow because non-EOF exception raises when " - "feed is not given"; - exception_holder_.ReThrow(); - } else if (exception_status[i] != ExceptionStatus::kSuccess) { - is_valid[i] = false; - } - } - } - } - - if (std::count(is_valid.begin(), is_valid.end(), true) == 0) { - PADDLE_ENFORCE_EQ(has_exception, - true, - platform::errors::InvalidArgument( - "Thread pool raises exception but not caught")); - VLOG(10) << "Raise exception because there is no success worker"; - exception_holder_.ReThrow(); - } - - if (return_merged) { - FetchList ret; - ret.reserve(fetch_tensors.size()); - for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { - std::vector lodtensor_ptrs; - lodtensor_ptrs.reserve(place_num); - std::vector lodtensorarray_ptrs; - lodtensorarray_ptrs.reserve(place_num); - for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { - if (!is_valid[scope_idx]) { - continue; - } - const auto &fetch_list = - PADDLE_GET_CONST(FetchList, fetch_data[scope_idx]); - if (data_is_lod_tensor(fetch_list[fetch_idx])) { - lodtensor_ptrs.push_back( - &(PADDLE_GET_CONST(phi::DenseTensor, fetch_list[fetch_idx]))); - } else { - lodtensorarray_ptrs.push_back( - &(PADDLE_GET_CONST(LoDTensorArray, fetch_list[fetch_idx]))); - } - } - if (!lodtensor_ptrs.empty()) { - phi::DenseTensor var; - MergeLoDTensor(&var, lodtensor_ptrs, platform::CPUPlace()); - ret.emplace_back(var); - } else { - LoDTensorArray var_array(lodtensorarray_ptrs[0]->size()); - for (size_t i = 0; i < lodtensorarray_ptrs[0]->size(); ++i) { - phi::DenseTensor var; - std::vector ptrs; - ptrs.reserve(lodtensor_ptrs.size()); - for (auto &lodtensorarray_ptr : lodtensorarray_ptrs) { - ptrs.push_back(&(lodtensorarray_ptr->at(i))); - } - MergeLoDTensor(&var, ptrs, platform::CPUPlace()); - var_array[i] = std::move(var); - } - ret.emplace_back(var_array); - } - } - return ret; - } else { - FetchUnmergedList ret; - ret.reserve(fetch_tensors.size()); - for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { - ret.emplace_back(); - for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); - 
++scope_idx) { - if (!is_valid[scope_idx]) { - continue; - } - const auto &fetch_list = - PADDLE_GET_CONST(FetchUnmergedList, fetch_data[scope_idx]); - PADDLE_ENFORCE_EQ( - fetch_list[fetch_idx].size(), - 1, - platform::errors::Fatal("Each place must have only one fetched " - "phi::DenseTensor/LoDTensorArray!")); - ret.back().emplace_back(fetch_list[fetch_idx][0]); - } - } - return ret; - } -} - -} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h deleted file mode 100644 index 3414c7361e040..0000000000000 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "ThreadPool.h" -#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace details { - -class ParallelSSAGraphExecutor final : public SSAGraphExecutor { - public: - enum FeedStatus { - kNone = 0, // No feed - kHasFeed = 1 // Has feed - }; - - public: - ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph); - - ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - std::vector> graphs); - - ~ParallelSSAGraphExecutor() final = default; - - const ir::Graph &Graph() const override { return *graphs_[0]; } - - std::vector Graphs(); - - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged) override; - - void SetHasFeed(size_t dev_idx, bool has_feed) { - feed_status_[dev_idx] = has_feed ? FeedStatus::kHasFeed : FeedStatus::kNone; - } - - void EnablePartialFeedSupport() { support_partial_feed_ = true; } - - bool SupportPartialFeed() const { return support_partial_feed_; } - - private: - ExecutionStrategy strategy_; - std::vector local_scopes_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; - std::vector places_; - std::vector> graphs_; - - std::vector> - executors_; - ExceptionHolder exception_holder_; - - bool support_partial_feed_{false}; - std::vector feed_status_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc deleted file mode 100644 index e3b3446209584..0000000000000 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/scope_buffered_monitor.h" - -#include "paddle/common/flags.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle::framework { -class Variable; -} // namespace paddle::framework - -COMMON_DECLARE_double(local_exe_sub_scope_limit); - -namespace paddle::framework::details { - -static constexpr double kMB = 1.0 / (1024.0 * 1024.0); - -static void GetTensors(Variable *var, - std::unordered_set *tensor_set) { - if (var->IsType() && - var->Get().IsInitialized()) { - tensor_set->insert(var->GetMutable()); - } else if (var->IsType() && - var->Get().value().IsInitialized()) { - tensor_set->insert(var->GetMutable()->mutable_value()); - } else if (var->IsType()) { - auto *tensor_arr = var->GetMutable(); - for (auto &t : *tensor_arr) { - if (t.IsInitialized()) { - tensor_set->insert(&t); - } - } - } -} - -static void GetTensors(Scope *scope, - std::unordered_set *tensor_set) { - for (auto &var_name : scope->LocalVarNames()) { - GetTensors(scope->FindVar(var_name), tensor_set); - } - - for (auto *kid : scope->kids()) { - GetTensors(kid, tensor_set); - } -} - -static size_t GetTensorMemorySize(Scope *scope, bool clear_cpu_tensor) { - std::unordered_set tensor_set; - GetTensors(scope, &tensor_set); - size_t memory_size = 0; - std::unordered_set allocation_set; - for (auto *tensor : tensor_set) { - if (clear_cpu_tensor && platform::is_cpu_place(tensor->place())) { - tensor->clear(); - } else { - auto allocation = tensor->Holder().get(); - if (!allocation_set.count(allocation)) { - memory_size += allocation->size(); - allocation_set.insert(allocation); - } - } - } - return memory_size; -} - -size_t GetScopeVarMemorySize(Scope *scope) { - return GetTensorMemorySize(scope, false /*clear_cpu_tensor*/); -} - -ScopeBufferedMonitor::ScopeBufferedMonitor( - const std::vector &places, - const std::vector &local_exec_scopes) - : places_(places), - local_exec_scopes_(local_exec_scopes), - pre_local_exec_scopes_(local_exec_scopes.size()), - post_local_exec_scopes_(local_exec_scopes.size()), - history_local_exec_scopes_() {} - -void ScopeBufferedMonitor::Apply(const std::function &callback, - bool has_fetch) { - std::unique_ptr pre_local_exec_scopes_event( - new platform::RecordEvent( - "ScopeBufferedMonitor::pre_local_exec_scopes_process", - platform::TracerEventType::UserDefined, - 2)); - for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { - pre_local_exec_scopes_.at(scope_id).clear(); - auto scopes = local_exec_scopes_.at(scope_id)->kids(); - VLOG(10) << "pre_local_exec_scopes[" << scope_id - << "] sub-scope: " << scopes.size(); - pre_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end()); - } - pre_local_exec_scopes_event.reset(); - - callback(); - - std::unique_ptr post_local_exec_scopes_event( - new platform::RecordEvent( - "ScopeBufferedMonitor::post_local_exec_scopes_process", - platform::TracerEventType::UserDefined, - 2)); - for (size_t 
scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { - post_local_exec_scopes_.at(scope_id).clear(); - auto scopes = local_exec_scopes_.at(scope_id)->kids(); - VLOG(10) << "post_local_exec_scopes[" << scope_id - << "] sub-scope: " << scopes.size(); - post_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end()); - } - - history_local_exec_scopes_.emplace_back(); - auto &incr_local_exec_scopes = history_local_exec_scopes_.back(); - incr_local_exec_scopes.resize(local_exec_scopes_.size()); - for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { - for (auto &scope : post_local_exec_scopes_.at(scope_id)) { - if (!pre_local_exec_scopes_.at(scope_id).count(scope)) { - incr_local_exec_scopes.at(scope_id).insert(scope); - } - } - - if (VLOG_IS_ON(10)) { - if (!incr_local_exec_scopes.at(scope_id).empty() && - FLAGS_local_exe_sub_scope_limit > 0) { - VLOG(10) - << "FLAGS_local_exe_sub_scope_limit is " - << FLAGS_local_exe_sub_scope_limit - << " MBytes now. If you don't need to limit the memory of local " - "execution scope, you should set " - "FLAGS_local_exe_sub_scope_limit=-1."; - } - std::stringstream out; - out << scope_id << " kids: "; - for (auto &scope : incr_local_exec_scopes.at(scope_id)) { - out << scope << ", "; - } - VLOG(10) << out.str(); - } - } - - size_t history_step = history_local_exec_scopes_.size(); - if (has_fetch && history_step >= 2) { - ClearHistoryLocalExecScopes(history_step - 1); - } - - // Delete CPU Memory - std::vector gpu_memory_size_per_gpu(places_.size()); - for (auto &scope_vec : history_local_exec_scopes_) { - for (size_t idx = 0; idx < scope_vec.size(); ++idx) { - for (auto &scope : scope_vec.at(idx)) { - gpu_memory_size_per_gpu.at(idx) += - GetTensorMemorySize(scope, true /*clear_cpu_tensor*/); - } - } - } - if (VLOG_IS_ON(8)) { - for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) { - VLOG(8) << "history local exec scopes contains " - << string::HumanReadableSize( - gpu_memory_size_per_gpu.at(idx)) // NOLINT - << " in " << places_.at(idx); - } - } - - if (FLAGS_local_exe_sub_scope_limit > 0) { - for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) { - if (gpu_memory_size_per_gpu.at(idx) / kMB >= // NOLINT - FLAGS_local_exe_sub_scope_limit) { - platform::DeviceContextPool::Instance().Get(places_.at(idx))->Wait(); - local_exec_scopes_.at(idx)->DropKids(); - } - for (auto &scope_vec : history_local_exec_scopes_) { - scope_vec.at(idx).clear(); - } - } - } -} - -void ScopeBufferedMonitor::ClearHistoryLocalExecScopes(size_t history_step) { - VLOG(10) << "delete pre_incr_local_exec_scopes."; - for (size_t i = 0; i < history_step; ++i) { - auto &pre_incr_local_exec_scopes = history_local_exec_scopes_.front(); - for (size_t scope_idx = 0; scope_idx < pre_incr_local_exec_scopes.size(); - ++scope_idx) { - for (auto scope : pre_incr_local_exec_scopes[scope_idx]) { - local_exec_scopes_.at(scope_idx)->DeleteScope(scope); - } - } - history_local_exec_scopes_.pop_front(); - } -} - -void ScopeBufferedMonitor::ClearHistoryLocalExecScopes() { - history_local_exec_scopes_.clear(); -} - -} // namespace paddle::framework::details diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.h b/paddle/fluid/framework/details/scope_buffered_monitor.h deleted file mode 100644 index 3a94534eff458..0000000000000 --- a/paddle/fluid/framework/details/scope_buffered_monitor.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/scope.h" - -namespace paddle { -namespace framework { -namespace details { - -class ScopeBufferedMonitor { - public: - ScopeBufferedMonitor(const std::vector &places, - const std::vector &local_exec_scopes); - - void Apply(const std::function &callback, bool has_fetch); - - void ClearHistoryLocalExecScopes(); - - void ClearHistoryLocalExecScopes(size_t history_step); - - private: - std::vector places_; - std::vector local_exec_scopes_; - std::vector> pre_local_exec_scopes_; - std::vector> post_local_exec_scopes_; - std::deque>> - history_local_exec_scopes_; -}; - -size_t GetScopeVarMemorySize(Scope *scope); - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc deleted file mode 100644 index 1db2fff2b556d..0000000000000 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ /dev/null @@ -1,234 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" - -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle { -namespace framework { -namespace details { - -ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( - ExecutionStrategy strategy, - std::vector local_scopes, - std::vector local_exec_scopes, - std::vector var_infos, - std::vector places, - std::unique_ptr &&underlying_executor) - : strategy_(strategy), - underlying_executor_(std::move(underlying_executor)), - local_scopes_(std::move(local_scopes)), - local_exec_scopes_(std::move(local_exec_scopes)), - preserve_vars_(), - tmp_var_infos_(), - tensor_array_vars_(), - var_infos_(std::move(var_infos)), - places_(std::move(places)), - scope_monitor_(places_, local_exec_scopes_) { - PADDLE_ENFORCE_EQ( - local_scopes_.size(), - local_exec_scopes_.size(), - platform::errors::InvalidArgument( - "The number of local scopes and the number of local execution scopes " - "should be equal, but got number of local scopes is %d and " - "number of local execution scopes is %d.", - local_scopes_.size(), - local_exec_scopes_.size())); - PrepareLocalExeScopes(); -} - -static void RunProgramDescs(const ProgramDescs &programs, - const std::vector &local_exec_scopes, - const std::vector &places) { - for (auto &program : programs) { - for (auto &op_desc : program.Block(0).AllOps()) { - for (size_t i = 0; i < local_exec_scopes.size(); ++i) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_exec_scopes[i], places[i]); - } - } - } -} - -FetchResultType ScopeBufferedSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::IsCUDAGraphCapturing()) { - strategy_.num_iteration_per_drop_scope_ = - std::numeric_limits::max(); - DropLocalExeScopes(/*need_wait=*/false); - } -#endif - - if (drop_scope_counter_ == 0) { - platform::RecordEvent e( - "InitLocalVars", platform::TracerEventType::UserDefined, 2); - InitVariables(); - } - - FetchResultType fetch_data; - std::exception_ptr eptr = nullptr; - - auto exe_run_func = [&]() { - try { - fetch_data = underlying_executor_->Run(fetch_tensors, return_merged); - } catch (...) 
{ - eptr = std::current_exception(); - } - }; - - if (strategy_.num_iteration_per_drop_scope_ == 1) { - exe_run_func(); - } else { - scope_monitor_.Apply(exe_run_func, !fetch_tensors.empty()); - } - - if (VLOG_IS_ON(5)) { - for (auto *scope : local_exec_scopes_) { - VLOG(5) << "Left " - << string::HumanReadableSize( - GetScopeVarMemorySize(scope)) // NOLINT - << " on scope " << scope << " before deleting"; - } - } - - ++drop_scope_counter_; - if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ || - DropScopeOrNot()) { - DropLocalExeScopes(!platform::IsCUDAGraphCapturing()); - } - - if (VLOG_IS_ON(5)) { - for (auto *scope : local_exec_scopes_) { - VLOG(5) << "Left " - << string::HumanReadableSize( - GetScopeVarMemorySize(scope)) // NOLINT - << " on scope " << scope << " after deleting"; - } - } - - if (eptr) { - std::rethrow_exception(eptr); - } else { - return fetch_data; - } -} - -bool ScopeBufferedSSAGraphExecutor::DropScopeOrNot() const { - for (auto &var : tensor_array_vars_) { - auto tensor_array = var->GetMutable(); - for (phi::DenseTensor &tensor : *tensor_array) { - if (tensor.IsInitialized()) { - return true; - } - } - tensor_array->clear(); - } - return false; -} - -void ScopeBufferedSSAGraphExecutor::InitVariables() { - for (auto &info : tmp_var_infos_) { - for (auto &pair : info) { - InitializeVariable(pair.first, pair.second); - } - } - - const ir::Graph &graph = Graph(); - if (!is_initialized_) { - // startup_program_descs only need to be executed once - if (graph.Has(details::kStartupProgramDescs)) { - auto &program_descs = - graph.Get(details::kStartupProgramDescs); - RunProgramDescs(program_descs, local_exec_scopes_, places_); - } - is_initialized_ = true; - } - - if (graph.Has(details::kProgramDescs)) { - auto &program_descs = - graph.Get(details::kProgramDescs); - RunProgramDescs(program_descs, local_exec_scopes_, places_); - } -} - -void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) { - platform::RecordEvent drop_scope_event( - "DropLocalExeScopes", platform::TracerEventType::UserDefined, 2); - drop_scope_counter_ = 0; - if (need_wait) { - for (auto &p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - } - scope_monitor_.ClearHistoryLocalExecScopes(); - for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { - local_exec_scopes_[i]->EraseVarsExcept(preserve_vars_[i]); - local_exec_scopes_[i]->DropKids(); - for (auto &preserve_var : preserve_vars_[i]) { - preserve_var->Clear(); - } - VLOG(3) << "Drop local execution scope: " << local_scopes_[i]; - } -} - -void ScopeBufferedSSAGraphExecutor::PrepareLocalExeScopes() { - // Create local scopes. 
- preserve_vars_.resize(local_scopes_.size()); - tmp_var_infos_.resize(local_scopes_.size()); - - for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { - size_t idx = local_scopes_.size() - 1 - (it - local_scopes_.rbegin()); - auto *scope = local_scopes_[idx]; - auto *local_scope = local_exec_scopes_[idx]; - - for (auto &info : var_infos_) { - if (info.persistable_) { // Persistable - auto var = scope->FindVar(info.name_); - if (var != nullptr) { - VLOG(2) - << info.name_ - << " has been initialized beforehand in global scope, skipped"; - continue; - } - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - Variable *tmp_var = local_scope->Var(info.name_); - preserve_vars_[idx].emplace(tmp_var); - tmp_var_infos_[idx].emplace_back(tmp_var, info.type_); - if (info.type_ == proto::VarType::LOD_TENSOR_ARRAY) { - tensor_array_vars_.emplace_back(tmp_var); - } - } - } - } -} - -bool ScopeBufferedSSAGraphExecutor::NeedCreateLocalExeScope() { - return drop_scope_counter_ == 0; -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h deleted file mode 100644 index 6e64b486d29ef..0000000000000 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/scope_buffered_monitor.h" -#include "paddle/fluid/framework/details/ssa_graph_executor.h" -#include "paddle/fluid/framework/details/var_handle.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/place.h" -namespace paddle { -namespace framework { -namespace details { - -struct VariableInfo { - std::string name_; - proto::VarType::Type type_; - bool persistable_; -}; - -class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { - public: - ScopeBufferedSSAGraphExecutor( - ExecutionStrategy strategy, - std::vector local_scopes, - std::vector local_exec_scopes, - std::vector var_infos, - std::vector places, - std::unique_ptr&& underlying_executor); - - const ir::Graph& Graph() const override { - return underlying_executor_->Graph(); - } - - FetchResultType Run(const std::vector& fetch_tensors, - bool return_merged) override; - - void DropLocalExeScopes(bool need_wait = true); - - bool NeedCreateLocalExeScope(); - - void PrepareLocalExeScopes(); - - private: - void InitVariables(); - - bool DropScopeOrNot() const; - - bool is_initialized_{false}; - size_t drop_scope_counter_{0}; - ExecutionStrategy strategy_; - std::unique_ptr underlying_executor_; - std::vector local_scopes_; - - std::vector local_exec_scopes_; - std::vector> preserve_vars_; - std::vector>> - tmp_var_infos_; - - std::vector tensor_array_vars_; - - std::vector var_infos_; - std::vector places_; - - ScopeBufferedMonitor scope_monitor_; -}; -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc deleted file mode 100644 index 6ade32097bd82..0000000000000 --- a/paddle/fluid/framework/details/ssa_graph_executor.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/ssa_graph_executor.h" - -#include "paddle/fluid/framework/details/fetch_async_op_handle.h" - -namespace paddle { -namespace framework { -namespace details { -SSAGraphExecutor::~SSAGraphExecutor() = default; - -void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops) { - if (fetch_ops->empty()) return; - - for (auto& op : *fetch_ops) { - PADDLE_ENFORCE_EQ(dynamic_cast(op) != nullptr || - dynamic_cast(op) != nullptr, - true, - platform::errors::PreconditionNotMet( - "The input ops of ClearFetchOp function should be " - "FetchOpHandle or FetchAsyncOpHandle.")); - for (auto& out_var : op->Node()->outputs) { - graph->RemoveNode(out_var); - } - for (auto& in_var : op->Inputs()) { - in_var->RemoveOutput(op, op->Node()); - } - graph->RemoveNode(op->Node()); - } - fetch_ops->clear(); -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h deleted file mode 100644 index 0ac46bbc4da25..0000000000000 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/details/fetch_op_handle.h" -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace details { -class SSAGraphExecutor { - DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); - - public: - SSAGraphExecutor() {} - - virtual ~SSAGraphExecutor(); - - virtual const ir::Graph& Graph() const = 0; - - virtual FetchResultType Run(const std::vector& fetch_tensors, - bool return_merged = true) = 0; -}; - -void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops); -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc deleted file mode 100644 index 6697a33e3e1d6..0000000000000 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ /dev/null @@ -1,411 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" - -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -#if defined PADDLE_WITH_PSCORE -#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" -#endif - -namespace paddle { -namespace framework { -namespace details { -ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( - const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph) - : graph_(graph), - local_scopes_(local_scopes), - local_exec_scopes_(local_exec_scopes), - places_(places), - fetch_ctxs_(), - op_deps_(nullptr), - op_deps_futures_(), - strategy_(strategy), - run_op_futures_(), - prepare_pool_(1), - pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) - : nullptr), - traced_ops_() { - platform::EmplaceDeviceContexts( - &fetch_ctxs_, - places, - /*disable_setting_default_stream_for_allocator=*/true, - /*stream_priority=*/0); - - if (strategy_.num_iteration_per_run_ > 1) { - int read_op_num = 0; - for (auto *node : graph_->Nodes()) { - if (node->IsOp() && node->Name() == "read") { - read_op_num++; - } - } - if (read_op_num == 0) { - LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model " - "should use pyreader to feed data!"; - } - } - PrepareOpDeps(); - CopyOpDeps(); -} - -inline FetchResultType ThreadedSSAGraphExecutor::RunImpl( - const std::vector &fetch_tensors, bool return_merged) { - std::unique_ptr event( - new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", - platform::TracerEventType::UserDefined, - 2)); - std::unique_ptr op_deps = op_deps_futures_.get(); - CopyOpDeps(); - - VLOG(10) << "ThreadedSSAGraphExecutor::Run"; - std::shared_ptr> ready_vars( - new BlockingQueue); - auto &pending_ops = op_deps->pending_ops_; - auto &pending_vars = op_deps->pending_vars_; - auto &ready_ops = op_deps->ready_ops_; - size_t num_ops = op_deps->num_ops_; - - // Step 2. Insert FetchOps - std::vector fetch_ops; - std::unordered_set fetch_dependencies; - FetchResultType fetch_data; - if (return_merged) { - fetch_data = FetchList(fetch_tensors.size()); - } else { - fetch_data = FetchUnmergedList(fetch_tensors.size()); - } - - InsertFetchOps(fetch_tensors, - &fetch_ops, - &fetch_dependencies, - &ready_ops, - &pending_ops, - &pending_vars, - &fetch_data, - return_merged); - - exception_holder_.Clear(); - event.reset(nullptr); - - // Step 3. Execution - if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) { - // If the num_threads is 1, we can record the order of operator's - // execution in the first iteration, and in subsequent iterations, - // run the recorded operators directly. This strategy could make the - // execution faster. - VLOG(3) << "Run the traced ops."; - bool is_exception_free = - RunTracedOps(traced_ops_) && RunTracedOps(fetch_ops); - if (!is_exception_free) { - ExecutionFinal(&fetch_ops); - } - } else { - traced_ops_.clear(); - auto run_all_ops = [&](std::unordered_set &set) { - for (auto *op : set) { - RunOp(ready_vars, op); - } - set.clear(); - }; - // Clean run context - run_op_futures_.clear(); - - while (!pending_vars.empty()) { - // 1. Run All Ready ops - // Keep loop until all vars are ready. - run_all_ops(ready_ops); - - // 2. 
Find ready variable - bool timeout = false; - auto cur_ready_vars = ready_vars->PopAll(1, &timeout); - if (timeout) { - for (auto &run_op_future : run_op_futures_) { - run_op_future.wait(); - } - if (exception_holder_.IsCaught()) { - ExecutionFinal(&fetch_ops); - } else { - continue; - } - } - - // 3. Remove the dependency of ready_var. - // Find the ready_ops after the ready_var. - for (auto ready_var : cur_ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->PendingOps()) { - auto &deps = pending_ops[op]; - --deps; - if (deps == 0) { - ready_ops.insert(op); - } - } - } - } - PADDLE_ENFORCE_EQ( - ready_ops.empty(), - true, - platform::errors::Fatal("After the execution of computation graph, " - "there are unexecuted operators left.")); - } - - // Wait FetchOps. - ClearFetchOp(graph_, &fetch_ops); - - return fetch_data; -} - -FetchResultType ThreadedSSAGraphExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) { - RunImpl({}, return_merged); - } - return RunImpl(fetch_tensors, return_merged); -} - -void ThreadedSSAGraphExecutor::InsertFetchOps( - const std::vector &fetch_tensors, - std::vector *fetch_ops, - std::unordered_set *fetch_dependencies, - std::unordered_set *ready_ops, - std::unordered_map *pending_ops, - std::unordered_set *pending_vars, - FetchResultType *fetch_data, - bool return_merged) { - std::unordered_map> fetched_vars; - std::unordered_set local_ready_vars; - - for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->Get(details::kGraphVars)) { - auto it = var_map.find(fetch_var_name); - if (it != var_map.end()) { - fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin()); - } - } - } - - for (size_t i = 0; i < fetch_tensors.size(); ++i) { - auto &var_name = fetch_tensors[i]; - auto fetched_var_it = fetched_vars.find(var_name); - PADDLE_ENFORCE_NE( - fetched_var_it, - fetched_vars.end(), - platform::errors::PreconditionNotMet( - "Cannot find fetched variable(%s) in current computation graph. " - "Possible reasons are:\n" - " 1. The variable to be fetched is not defined in main program.\n" - " 2. The variable to be fetched is not an input or output of any " - "operator.\n" - " 3. Confirm that you have used the fetch `Variable` format " - "instead of the string literal('%s') in `fetch_list` parameter " - "when using `executor.run` method. 
In other words, the format of " - "`executor.run(fetch_list=[fetch_var])`(fetch_var is a Variable) " - "is recommended.", - var_name, - var_name)); - - auto &vars = fetched_var_it->second; - - ir::Node *fetch_node = - graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); - auto *op = new FetchOpHandle(fetch_node, - fetch_data, - i, - &local_scopes_, - &local_exec_scopes_, - return_merged); - fetch_ops->emplace_back(op); - - for (auto &p : places_) { - op->SetDeviceContext(p, fetch_ctxs_[p].get().get()); - } - - for (auto *var : vars) { - op->AddInput(var); - } - - ir::Node *fetch_var = - graph_->CreateEmptyNode("fetch", ir::Node::Type::kVariable); - auto *fetch_dummy = new DummyVarHandle(fetch_var); - op->AddOutput(fetch_dummy); - fetch_dependencies->emplace(fetch_dummy); - - this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy); - - size_t wait_input_num = 0; - std::unordered_set input_set(vars.begin(), vars.end()); - for (auto *var : input_set) { - if (pending_vars->count(var)) { - ++wait_input_num; - } - } - if (wait_input_num) { - pending_ops->insert({op, wait_input_num}); - } else { - ready_ops->insert(static_cast(op)); - } - } - PADDLE_ENFORCE_EQ( - local_ready_vars.size(), - 0, - platform::errors::Fatal( - "The number of ready variables should be 0, but got %d.", - local_ready_vars.size())); -} - -void ThreadedSSAGraphExecutor::InsertPendingOp( - std::unordered_map *pending_ops, - OpHandleBase *op_instance) const { - pending_ops->insert({op_instance, op_instance->NoDupInputSize()}); -} - -void ThreadedSSAGraphExecutor::InsertPendingVar( - std::unordered_set *pending_vars, - std::unordered_set *ready_vars, - VarHandleBase *var) const { - pending_vars->insert(var); - if (var->GeneratedOp() == nullptr) { - ready_vars->insert(var); - } -} - -void ThreadedSSAGraphExecutor::PrepareOpDeps() { - op_deps_ = std::make_unique(); - std::unordered_map &pending_ops = - op_deps_->pending_ops_; - std::unordered_set &pending_vars = op_deps_->pending_vars_; - std::unordered_set &ready_ops = op_deps_->ready_ops_; - std::unordered_set ready_vars; - - // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->Get(details::kGraphVars)) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - InsertPendingVar(&pending_vars, &ready_vars, version_pair); - } - } - } - for (auto &var : graph_->Get(details::kGraphDepVars)) { - InsertPendingVar(&pending_vars, &ready_vars, var); - } - - for (auto &op : ir::FilterByNodeWrapper(*graph_)) { - if (op->Inputs().empty()) { // Special case, Op has no input. 
- ready_ops.insert(op); - } else { - InsertPendingOp(&pending_ops, op); - } - } - op_deps_->num_ops_ = ready_ops.size() + pending_ops.size(); - PADDLE_ENFORCE_GT( - op_deps_->num_ops_, - 0, - platform::errors::InvalidArgument("The graph doesn't have operators.")); - - for (auto ready_var : ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->PendingOps()) { - auto &deps = pending_ops[op]; - --deps; - if (deps == 0) { - ready_ops.insert(op); - } - } - } -} - -void ThreadedSSAGraphExecutor::CopyOpDeps() { - op_deps_futures_ = prepare_pool_.enqueue([&] { - auto *op_deps = new OpDependentData(); - op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(), - op_deps_->pending_ops_.end()); - op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(), - op_deps_->pending_vars_.end()); - op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(), - op_deps_->ready_ops_.end()); - op_deps->num_ops_ = op_deps_->num_ops_; - return std::unique_ptr(op_deps); - }); -} - -void ThreadedSSAGraphExecutor::RunOp( - const std::shared_ptr> &ready_var_q, - details::OpHandleBase *op) { - auto op_run = [ready_var_q, op, this] { - RunOpSync(op); - try { - ready_var_q->Extend(op->Outputs()); - VLOG(10) << op << " " << op->Name() << " Signal posted"; - } catch (...) { - exception_holder_.Catch(std::current_exception()); - } - }; - - if (pool_) { - run_op_futures_.emplace_back(pool_->enqueue(op_run)); - } else { - op_run(); - } - - RecordOps(op); -} - -bool ThreadedSSAGraphExecutor::RunTracedOps( - const std::vector &traced_ops) { - for (auto &op : traced_ops) { - if (!RunOpSync(op)) return false; - } - return true; -} - -bool ThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) { - try { - VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); - if (LIKELY(!strategy_.dry_run_)) { - op->Run(strategy_.use_device_); - } - VLOG(10) << op << " " << op->Name() << " Done "; - return true; - } catch (...) { - exception_holder_.Catch(std::current_exception()); - return false; - } -} - -void ThreadedSSAGraphExecutor::ExecutionFinal( - std::vector *fetch_ops) { -#if defined PADDLE_WITH_PSCORE - if (strategy_.thread_barrier_) { - paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement(); - } -#endif - VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it"; - ClearFetchOp(graph_, fetch_ops); - exception_holder_.ReThrow(); -} - -void ThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { - if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) { - traced_ops_.emplace_back(op); - } -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h deleted file mode 100644 index 0633bffd5bdfb..0000000000000 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include // ThreadPool in thrird party - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/details/exception_holder.h" -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/fetch_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/details/ssa_graph_executor.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -class Scope; - -namespace details { - -struct OpDependentData { - std::unordered_map pending_ops_; - std::unordered_set pending_vars_; - std::unordered_set ready_ops_; - size_t num_ops_{0}; -}; - -class ThreadedSSAGraphExecutor final : public SSAGraphExecutor { - public: - ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, - const std::vector &local_scopes, - const std::vector &local_exec_scopes, - const std::vector &places, - ir::Graph *graph); - - const ir::Graph &Graph() const override { return *graph_; } - // Run a SSAGraph by a thread pool - // Use topological sort algorithm - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged) override; - - ~ThreadedSSAGraphExecutor() final = default; - - private: - inline FetchResultType RunImpl(const std::vector &fetch_tensors, - bool return_merged); - void RunOp(const std::shared_ptr> &ready_var_q, - details::OpHandleBase *op); - - private: - // Note(zcd): the ThreadPool should be placed last so that ThreadPool should - // be destroyed first. - ir::Graph *graph_; - std::vector local_scopes_; - std::vector local_exec_scopes_; - - std::vector places_; - std::map>> - fetch_ctxs_; - - ExceptionHolder exception_holder_; - std::unique_ptr op_deps_; - std::future> op_deps_futures_; - ExecutionStrategy strategy_; - // use std::list because clear(), push_back, and for_each are O(1) - std::list> run_op_futures_; - ::ThreadPool prepare_pool_; - std::unique_ptr<::ThreadPool> pool_; - std::vector traced_ops_; - - void InsertPendingOp(std::unordered_map *pending_ops, - OpHandleBase *op_instance) const; - - void InsertPendingVar(std::unordered_set *pending_vars, - std::unordered_set *ready_vars, - VarHandleBase *var) const; - - void InsertFetchOps(const std::vector &fetch_tensors, - std::vector *fetch_ops, - std::unordered_set *fetch_dependencies, - std::unordered_set *ready_ops, - std::unordered_map *pending_ops, - std::unordered_set *pending_vars, - FetchResultType *fetch_data, - bool return_merged); - - void PrepareOpDeps(); - - void CopyOpDeps(); - - inline void RecordOps(OpHandleBase *op); - - inline void ExecutionFinal(std::vector *fetch_ops); - - inline bool RunOpSync(OpHandleBase *op); - - bool RunTracedOps(const std::vector &traced_ops); -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 97bcf41845039..1f62e57faf119 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -38,116 +38,6 @@ class ProgramDesc; namespace paddle::framework::details { -static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) { - framework::ExecutionStrategy execution_strategy; - - auto device_type = platform::Place2DeviceType(place); - switch 
(device_type) { - case platform::DeviceType::CPU: { - execution_strategy.num_threads_ = 2; - break; - } - case platform::DeviceType::CUDA: { // NOLINT - // NOTE: According experiments, one thread is faster in - // most model training. - execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::XPU: { - execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::IPU: { - execution_strategy.num_threads_ = 1; - break; - } - case platform::DeviceType::CUSTOM_DEVICE: { - execution_strategy.num_threads_ = 1; - break; - } - default: - PADDLE_THROW(platform::errors::Unavailable("Unsupported Device type %d.", - device_type)); - } - execution_strategy.use_device_ = device_type; - - return execution_strategy; -} - -void AppendSkipDeletionVars(const std::vector &append_vars, - std::vector *all_vars) { - for (auto &var : append_vars) { - all_vars->emplace_back(var); - } -} - -/* - * NOTE(Aurelius84): In ParallelExecutor, memory optimized pass will be applied. - * To avoid eagerly deleting last alive variables which are necessary in - * backward program, we firstly parse these variable names as - * skip_eager_vars. While executing pe.run skip_eager_vars are used to - * skip memory optimization. - * - * Variables satisfying the following rules are considered as skip_eager_var: - * - * 1. it is an output var in run_program_op - * 2. it is an input var used in backward_op - */ -void ParseSafeEagerDeletionSkipVars( - const ProgramDesc &program, - int64_t forward_op_nums, - const std::vector &output_var_names, - std::vector *skip_eager_delete_vars) { - auto all_ops = program.Block(0).AllOps(); - auto &op_info_map = OpInfoMap::Instance(); - // NOTE: skip `shape` and `fill_constant` op created by - // fluid.backward.gradients, one forward output will generate one `shape` - // and `fill_constant`. - size_t backward_op_start_index = - forward_op_nums + (output_var_names.size() * 2); - - // step 2: parse the necessary variable of backward op - std::unordered_set op_outputs; - std::unordered_set op_inputs; - std::unordered_set no_need_buffer_ins; - - for (auto i = backward_op_start_index; i < all_ops.size(); ++i) { - framework::OpDesc *op = all_ops[i]; - // NOTE: skip NoNeedBufferVars of grad_op and GC its memory in advance. - auto &op_info = op_info_map.Get(op->Type()); - auto &inferer = op_info.NoNeedBufferVarsInferer(); - no_need_buffer_ins.clear(); - if (inferer != nullptr) { - no_need_buffer_ins = - inferer(op->Inputs(), op->Outputs(), op->GetAttrMap()); - } - for (auto &in_names : op->Inputs()) { - if (no_need_buffer_ins.count(in_names.first) == 0) { - for (auto &in_name : in_names.second) { - op_inputs.emplace(in_name); - } - } else { - VLOG(2) << op->Type() << " has no_need_buffer_in: " << in_names.first - << " , skip it."; - } - } - - for (const std::string &out_arg_name : op->OutputArgumentNames()) { - op_outputs.emplace(out_arg_name); - } - } - // For the grad op input variables, if it is not output of grad_op, it may - // be output of forward op and we should set the variables as skip_var to - // prevent it being deleted when grad op is called multiple times. 
- for (const std::string &var_name : op_inputs) { - if (op_outputs.find(var_name) == op_outputs.end()) { - VLOG(2) << "skip eager var: " << var_name; - skip_eager_delete_vars->emplace_back(var_name); - } - } - VLOG(3) << "Found skip_eager_delete_vars: " << skip_eager_delete_vars->size(); -} - void AppendSkipDeletionVars(const std::vector &append_vars, std::set *all_vars) { for (auto &var : append_vars) { @@ -214,89 +104,6 @@ int64_t hash_with_seed(int64_t value, int64_t seed) { return value + 0x9e3779b9 + (value << 6) + (seed >> 2); } -ExecutorInfoCache &ExecutorInfoCache::Instance() { - static ExecutorInfoCache g_exe_cache_info_map; - return g_exe_cache_info_map; -} - -static PEAndGraphPair CreateExecutorInfo( - const ProgramDesc &program_desc, - const platform::Place &place, - int64_t start_op_index, - int64_t end_op_index, - framework::Scope *scope, - const details::BuildStrategy &build_strategy) { - auto execution_strategy = details::GetExecutionStrategy(place); - auto graph = std::make_shared( - program_desc, start_op_index, end_op_index); - auto parallel_executor = std::make_shared( - place, scope, execution_strategy, build_strategy, graph.get()); - parallel_executor->PrepareVariables(scope); - return std::make_pair(parallel_executor, graph); -} - -PEAndGraphPair CreateFixOrderExecutorInfo(const ProgramDesc &program_desc, - const platform::Place &place, - int64_t start_op_index, - int64_t end_op_index, - framework::Scope *scope) { - details::BuildStrategy build_strategy; - build_strategy.fix_op_run_order_ = true; - auto pe_and_graph = CreateExecutorInfo( - program_desc, place, start_op_index, end_op_index, scope, build_strategy); - return pe_and_graph; -} - -CacheInfo GetExecutorInfoFromCache(const ProgramDesc &program_desc, - const platform::Place &place, - int64_t start_op_index, - int64_t end_op_index, - bool is_grad, - int64_t program_id, - framework::Scope *scope) { - auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); - - if (!cached_exe_info.Has(program_id, is_grad)) { - // TODO(Aurelius84): Consider to use LRU algorithm to replace this. - if (cached_exe_info.Size() > 4u /* max_cached_size*/) { - VLOG(2) << "The cached info size has exceeded max_cached_size: 4, clear " - "all cache!"; - cached_exe_info.Finalize(); - } - - VLOG(1) << "create exe_info for " << program_id << " is_grad: " << is_grad; - auto &build_strategy = cached_exe_info.GetBuildStrategy(program_id); - - // 2. Construct Graph and ParallelExecutor. - auto pe_and_graph = CreateExecutorInfo(program_desc, - place, - start_op_index, - end_op_index, - scope, - build_strategy); - - // 3. Insert value into cached map. 
- auto &cached_value = cached_exe_info.GetMutable(program_id, is_grad); - cached_value.executor_ = pe_and_graph.first; - cached_value.graph_ = pe_and_graph.second; - return std::make_pair(pe_and_graph.first, true); - } else { - VLOG(1) << "get exe_info from cache by: " << program_id - << " is_grad: " << is_grad; - auto &cached_value = cached_exe_info.GetMutable(program_id, is_grad); - - auto ¶llel_executor = cached_value.executor_; - // update op_handle scope_map in pe->executor_->Graph - std::unordered_map scope_map = { - {parallel_executor->GetLocalScopes().front(), scope}}; - parallel_executor->ResetOpHandleScopeMapOfGraphs(scope_map); - // need to recreate tmp variables in new scope - parallel_executor->PrepareVariables(scope); - - return std::make_pair(parallel_executor, false); - } -} - InterpreterCoreInfoCache &InterpreterCoreInfoCache::Instance() { static InterpreterCoreInfoCache g_info_cache; return g_info_cache; diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index 1e5136892d13f..5643fe3b75198 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -24,8 +24,8 @@ #include #include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/macros.h" #include "paddle/utils/string/string_helper.h" @@ -46,15 +46,6 @@ class Graph; class InterpreterCore; namespace details { -void AppendSkipDeletionVars(const std::vector& append_vars, - std::vector* all_vars); - -void ParseSafeEagerDeletionSkipVars( - const ProgramDesc& program, - int64_t forward_op_nums, - const std::vector& output_var_names, - std::vector* skip_eager_delete_vars); - void AppendSkipDeletionVars(const std::vector& append_vars, std::set* all_vars); @@ -65,106 +56,6 @@ std::set ParseSafeEagerDeletionSkipVarsSet( } // namespace details -class ExecutorInfo { - public: - struct CacheValue { - std::shared_ptr executor_{nullptr}; - std::shared_ptr graph_{nullptr}; - - std::vector skip_eager_delete_vars_; - }; - - bool IsAvailable(bool is_grad) { - const auto& executor = - is_grad ? backward_info_.executor_ : forward_info_.executor_; - return executor != nullptr; - } - - CacheValue& GetMutable(bool is_grad) { - return is_grad ? backward_info_ : forward_info_; - } - - private: - CacheValue forward_info_; - CacheValue backward_info_; -}; - -class ExecutorInfoCache { - public: - static ExecutorInfoCache& Instance(); - - const BuildStrategy& GetBuildStrategy(int64_t program_id) { - // If not found, insert build_strategy with default value. 
- return strategy_map_[program_id]; - } - - void SetBuildStrategy(int64_t program_id, - const BuildStrategy& build_strategy) { - PADDLE_ENFORCE_EQ( - strategy_map_.count(program_id), - 0, - platform::errors::PreconditionNotMet( - "program_id: %s already exist in ExecutorInfoCache", program_id)); - strategy_map_[program_id] = build_strategy; - } - - bool Has(int64_t program_id, bool is_grad) { - return info_map_.find(program_id) != info_map_.end() && - info_map_[program_id].IsAvailable(is_grad); - } - - ExecutorInfo::CacheValue& GetMutable(int64_t program_id, bool is_grad) { - return info_map_[program_id].GetMutable(is_grad); - } - - void UpdateSkipEagerDeleteVars(int64_t program_id, - bool is_grad, - const std::vector& skip_vars) { - auto& cached_value = GetMutable(program_id, is_grad); - cached_value.skip_eager_delete_vars_ = std::move(skip_vars); - } - - std::vector& SkipEagerDeleteVars(int64_t program_id, - bool is_grad) { - auto& cached_value = GetMutable(program_id, is_grad); - return cached_value.skip_eager_delete_vars_; - } - - size_t Size() const { return info_map_.size(); } - - void Finalize() { - // NOTE(Aurelius84): DO NOT perform finalize in destructor - // to avoid problems caused by destructor order of static - // object. - info_map_.clear(); - strategy_map_.clear(); - } - - private: - std::unordered_map info_map_; - std::unordered_map strategy_map_; -}; - -using CacheInfo = - std::pair, bool /*is_new_created*/>; - -using PEAndGraphPair = - std::pair, std::shared_ptr>; - -CacheInfo GetExecutorInfoFromCache(const ProgramDesc& program_desc, - const platform::Place& place, - int64_t start_op_index, - int64_t end_op_index, - bool is_grad, - int64_t program_id, - framework::Scope* scope); - -PEAndGraphPair CreateFixOrderExecutorInfo(const ProgramDesc& program_desc, - const platform::Place& place, - int64_t start_op_index, - int64_t end_op_index, - framework::Scope* scope); - int64_t hash_with_seed(int64_t value, int64_t seed); class InterpreterCoreInfo { diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index 4b830e7b05e55..f97a78fd156ac 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -67,7 +67,7 @@ class StatisticsEngine { : evt_idx(idx), start_ns(start), end_ns(end) {} }; - enum class ExecutorType { EXECUTOR, PARALLEL_EXECUTOR, INTERPRETER_CORE }; + enum class ExecutorType { EXECUTOR, INTERPRETER_CORE }; using Filter = std::function; @@ -83,8 +83,6 @@ class StatisticsEngine { int InitFiltersForExecutor(); - int InitFiltersForParallelExecutor(); - int InitFiltersForInterpreterCore(); int RegisterEventFilter(const std::string& std_event, Filter filter) { @@ -154,10 +152,6 @@ int StatisticsEngine::Init(const platform::NodeTrees& trees) { VLOG(10) << "type: Executor"; executor_type_ = ExecutorType::EXECUTOR; return InitFiltersForExecutor(); - } else if (name.find("ParallelExecutor::") == 0) { - VLOG(10) << "type: ParallelExecutor"; - executor_type_ = ExecutorType::PARALLEL_EXECUTOR; - return InitFiltersForParallelExecutor(); } else if (name.find("StandaloneExecutor::") == 0) { VLOG(10) << "type: InterpreterCore"; executor_type_ = ExecutorType::INTERPRETER_CORE; @@ -295,57 +289,6 @@ int StatisticsEngine::InitFiltersForExecutor() { }); } -int StatisticsEngine::InitFiltersForParallelExecutor() { - return RegisterEventFilter("Total", - [](const platform::HostTraceEventNode& evt) { - return 
evt.Name().find("ProfileStep") == 0; - }) || - RegisterEventFilter("CplusplusEnd", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "ParallelExecutor::Run"; - }) || - RegisterEventFilter("RunOp", - [](const platform::HostTraceEventNode& evt) { - return evt.Type() == - platform::TracerEventType::Operator; - }) || - RegisterEventFilter( - "OpCompute", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "compute" && - evt.Type() == platform::TracerEventType::OperatorInner; - }) || - RegisterEventFilter( - "OpInfershape", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "infer_shape" && - evt.Type() == platform::TracerEventType::OperatorInner; - }) || - RegisterEventFilter("GarbageCollect", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "eager_deletion" || - evt.Name() == "CheckGC"; - }) || - RegisterEventFilter("AllocateDeviceMem", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == alloc_device_mem; - }) || - RegisterEventFilter("FreeDeviceMem", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == free_device_mem; - }) || - RegisterEventFilter( - "DataTransform", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "prepare_data" && - evt.Type() == platform::TracerEventType::OperatorInner; - }) || - RegisterEventFilter("ThreadpoolAddTask", - [](const platform::HostTraceEventNode& evt) { - return evt.Name() == "WorkQueue::AddTask"; - }); -} - int StatisticsEngine::InitFiltersForInterpreterCore() { return RegisterEventFilter("Total", [](const platform::HostTraceEventNode& evt) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc deleted file mode 100644 index 3b6206284e9c6..0000000000000 --- a/paddle/fluid/framework/parallel_executor.cc +++ /dev/null @@ -1,1929 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/parallel_executor.h" - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/bind_threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" -#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" -#include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/event.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_device_guard.h" -#endif -#include "paddle/fluid/platform/flags.h" - -COMMON_DECLARE_double(eager_delete_tensor_gb); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -COMMON_DECLARE_bool(sync_nccl_allreduce); -#endif - -#ifdef WITH_GPERFTOOLS -#include "gperftools/profiler.h" -#endif -PADDLE_DEFINE_EXPORTED_string( - pe_profile_fname, - "", - "Profiler filename for PE, which generated by gperftools." - "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable."); - -namespace paddle { -namespace framework { - -static std::once_flag gProfileOnce; -#ifdef WITH_GPERFTOOLS -static bool gProfileStarted = false; -#endif - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -std::once_flag p2p_init_flag_pe; -#endif - -class ParallelExecutorPrivate { - public: - ParallelExecutorPrivate(const std::vector &places, - Scope *global_scope) - : places_(places), - local_scopes_(), - local_exec_scopes_(), - global_scope_(global_scope), - executor_(nullptr), - is_persistable_(), - own_local_scope_(false), - use_device_(DeviceType::CPU), - use_all_reduce_(false), - nranks_(0), - mem_opt_var_infos_(), - gcs_() { - if (!FLAGS_pe_profile_fname.empty()) { - std::call_once(gProfileOnce, [] { -#ifdef WITH_GPERFTOOLS - ProfilerStart(FLAGS_pe_profile_fname.c_str()); - gProfileStarted = true; -#else - LOG(WARNING) << "Paddle is not compiled with gperftools. " - "FLAGS_pe_profile_fname will be ignored"; -#endif - }); - } - } - - ~ParallelExecutorPrivate() { - if (own_local_scope_) { - for (size_t i = 1; i < local_scopes_.size(); ++i) { - // Skip the first scope, since it is the global scope. 
- Scope *local_scope = local_scopes_[i]; - if (global_scope_->HasKid(local_scope)) { - global_scope_->DeleteScope(local_scope); - } - } - } - } - - bool IsUseCUDA(DeviceType use_device); - - void SetHasFeed(size_t dev_idx, bool has_feed = true); - - bool AllowPartialFeed() const; - - ir::Graph *ApplyMemoryOptimizePass(ir::Graph *graph); - - inline bool HasGarbageCollectors() const { return !gcs_.empty(); } - - void ApplyFixOpRunOrderPass(ir::Graph *graph) { - if (build_strategy_.fix_op_run_order_) { - auto pass = ir::PassRegistry::Instance().Get("fix_op_run_order_pass"); - pass->Apply(graph); - } - } - - /** - * NOTE(zengjinle): the fed variables of users should not be reused, - * because users may feed them into another network. Changing the fed - * variables that users can visit may cause calculation wrong, which is - * a very subtle bug when training networks. However, these variables - * can be garbage collected. - * - * ParallelExecutor provides 2 methods to feed variables: - * - * - FeedTensorsIntoLocalScopes: this method would share memory of fed - * variables, so we have to skip these. - * - * - FeedAndSplitTensorIntoLocalScopes: this method would copy data of fed - * variables, so we do not need to skip - * them. - */ - inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) { - if (mem_opt_var_infos_.empty()) { - VLOG(4) << "The mem_opt_var_infos_ is empty, maybe no memory " - "optimization strategy is enabled"; - return; - } - auto iter = mem_opt_var_infos_[scope_idx].find(name); - if (iter != mem_opt_var_infos_[scope_idx].end()) { - iter->second->SetSkipMemoryReuse(true); - } - } - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { - VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_ - << ", num_trainers:" << bst.num_trainers_ - << ", trainer_id:" << bst.trainer_id_; - - if (bst.use_hierarchical_allreduce_) { - VLOG(1) << ", use_hierarchical_allreduce:" - << bst.use_hierarchical_allreduce_ << ", inter_trainers_num:" - << bst.hierarchical_allreduce_inter_nranks_ - << ", exter_trainers_num:" - << bst.hierarchical_allreduce_exter_nranks_; - } - - std::vector flat_nccl_ids; - if (nranks_ == 1) { - // FIXME(gongwb): need not to create ncclid when nranks==1 - nccl_ctxs_->InitFlatCtxs( - places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); - return; - } - - if (bst.enable_parallel_graph_) { - VLOG(1) << "use only one ncclid in pg model"; - - ncclUniqueId *nccl_id = nullptr; - - std::string var_name = platform::GetFlatNCCLVarName(0); - auto nccl_id_var = scope->FindVar(var_name); - if (nccl_id_var) { - nccl_id = nccl_id_var->GetMutable(); - VLOG(10) << "find nccl_id_var:" << var_name << ", nccl_id:" << nccl_id; - } else { - nccl_id = new ncclUniqueId(); - PADDLE_ENFORCE_EQ( - platform::dynload::ncclGetUniqueId(nccl_id), - ncclSuccess, - platform::errors::PreconditionNotMet( - "PaddlePaddle failed to get NCCL unique ID. 
It may due to your " - "system settings or NCCL library error, please debug on NCCL")); - VLOG(10) << "can't find nccl_id_var:" << var_name - << ", nccl_id:" << nccl_id; - } - - flat_nccl_ids.push_back(nccl_id); - - nccl_ctxs_->InitFlatCtxs( - places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); - VLOG(1) << "init bst nccl context complete!"; - return; - } - - // num_trainers ==1 && places > 1 - if (bst.num_trainers_ == 1) { - nccl_ctxs_->InitFlatCtxs( - places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); - return; - } - - for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { - std::string var_name = platform::GetFlatNCCLVarName(i); - auto nccl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - nccl_id_var, - platform::errors::NotFound("Can't find nccl_id_var '%s'.", var_name)); - auto nccl_id = nccl_id_var->GetMutable(); - flat_nccl_ids.push_back(nccl_id); - } - - nccl_ctxs_->InitFlatCtxs( - places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); - - if (bst.use_hierarchical_allreduce_) { - std::vector inter_nccl_ids; - for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { - std::string var_name = platform::GetHierarchicalInterNCCLVarName(i); - auto nccl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(nccl_id_var, - platform::errors::NotFound( - "Can't find nccl_id_var '%s'.", var_name)); - auto inter_nccl_id = nccl_id_var->GetMutable(); - inter_nccl_ids.push_back(inter_nccl_id); - } - - std::vector exter_nccl_ids; - for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { - std::string var_name = platform::GetHierarchicalExterNCCLVarName(i); - auto nccl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(nccl_id_var, - platform::errors::NotFound( - "Can't find nccl_id_var '%s'.", var_name)); - auto nccl_id = nccl_id_var->GetMutable(); - exter_nccl_ids.push_back(nccl_id); - } - - nccl_ctxs_->InitHierarchicalCtxs( - places_, - inter_nccl_ids, - exter_nccl_ids, - bst.num_trainers_, - bst.trainer_id_, - bst.hierarchical_allreduce_inter_nranks_, - bst.hierarchical_allreduce_exter_nranks_); - } - } - - void InitOrGetNCCLCommunicator(framework::Scope *scope, BuildStrategy *bst) { - const std::string var_name = "NCCLCommunicator"; - auto var = scope->FindVar(var_name); - if (var != nullptr) { - PADDLE_ENFORCE_EQ(var->IsInitialized(), - true, - platform::errors::PreconditionNotMet( - "if %s exists, it must be initialized", var_name)); - VLOG(1) << "find " << var_name - << " in scope, so use it and does not recreate!"; - nccl_ctxs_ = var->GetMutable(); - return; - } - - if (bst->use_hierarchical_allreduce_) { - PADDLE_ENFORCE_GT( - bst->num_trainers_, - 1, - platform::errors::PreconditionNotMet( - "The num_trainers should be greater than 1, but received %llu.", - bst->num_trainers_)); - PADDLE_ENFORCE_GT( - bst->hierarchical_allreduce_inter_nranks_, - 1, - platform::errors::PreconditionNotMet( - "The inter_nranks should be greater than 1, but received %d.", - bst->hierarchical_allreduce_inter_nranks_)); - PADDLE_ENFORCE_EQ( - bst->num_trainers_ % bst->hierarchical_allreduce_inter_nranks_, - 0, - platform::errors::PreconditionNotMet( - "num_trainers:%llu mod inter_nranks:%d != 0", - bst->num_trainers_, - bst->hierarchical_allreduce_inter_nranks_)); - - bst->hierarchical_allreduce_exter_nranks_ = - bst->num_trainers_ / bst->hierarchical_allreduce_inter_nranks_; - } - - VLOG(1) << "not find " << var_name << " in scope, so recreate it!"; - nccl_ctxs_ = scope->Var(var_name)->GetMutable(); - InitNCCLCtxs(scope, *bst); - } -#endif - 
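The hierarchical allreduce setup deleted above derives the number of outer (inter-node) groups from the trainer count and the inner group size: num_trainers must be greater than 1, inter_nranks must be greater than 1, num_trainers must be divisible by inter_nranks, and exter_nranks = num_trainers / inter_nranks. A minimal, self-contained sketch of that arithmetic and its validation follows; the helper name and the use of exceptions are illustrative assumptions, not Paddle APIs.

#include <cstdint>
#include <stdexcept>

// Sketch of the nranks arithmetic enforced by the removed
// InitOrGetNCCLCommunicator: the outer (exterior) group size is the
// trainer count divided by the inner (interior) group size.
// (Hypothetical helper for illustration only.)
inline std::uint64_t ComputeExterNranks(std::uint64_t num_trainers,
                                        int inter_nranks) {
  if (num_trainers <= 1) {
    throw std::invalid_argument("num_trainers must be greater than 1");
  }
  if (inter_nranks <= 1) {
    throw std::invalid_argument("inter_nranks must be greater than 1");
  }
  if (num_trainers % static_cast<std::uint64_t>(inter_nranks) != 0) {
    throw std::invalid_argument(
        "num_trainers must be divisible by inter_nranks");
  }
  return num_trainers / static_cast<std::uint64_t>(inter_nranks);
}

// Example: 16 trainers with 4 ranks per inner group yields 4 outer groups,
// i.e. ComputeExterNranks(16, 4) == 4.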
-#if defined(PADDLE_WITH_XPU_BKCL) - void InitBKCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { - VLOG(1) << "bkcl comm num:" << bst.bkcl_comm_num_ << ", nranks:" << nranks_ - << ", num_trainers:" << bst.num_trainers_ - << ", trainer_id:" << bst.trainer_id_; - - PADDLE_ENFORCE_EQ(bst.use_hierarchical_allreduce_, - false, - platform::errors::Unimplemented( - "xpu doesn't support use_hierarchical_allreduce")); - - std::vector flat_bkcl_ids; - if (nranks_ == 1) { - // FIXME(gongwb): need not to create bkclid when nranks==1 - bkcl_ctxs_->InitFlatCtxs( - places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); - return; - } - - if (bst.enable_parallel_graph_) { - VLOG(1) << "use only one bkclid in pg model"; - - BKCLUniqueId *bkcl_id = nullptr; - - std::string var_name = platform::GetFlatBKCLVarName(0); - auto bkcl_id_var = scope->FindVar(var_name); - std::unique_ptr id(new BKCLUniqueId()); - if (bkcl_id_var) { - bkcl_id = bkcl_id_var->GetMutable(); - } else { - PADDLE_ENFORCE_EQ( - bkcl_get_unique_id(id.get()), - BKCL_SUCCESS, - platform::errors::Unavailable("bkcl get unique id failed")); - bkcl_id = id.get(); - } - - flat_bkcl_ids.push_back(bkcl_id); - - bkcl_ctxs_->InitFlatCtxs( - places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); - VLOG(1) << "init bst bkcl context complete!"; - return; - } - - // num_trainers ==1 && places > 1 - if (bst.num_trainers_ == 1) { - bkcl_ctxs_->InitFlatCtxs( - places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); - return; - } - - for (int i = 0; i < static_cast(bst.bkcl_comm_num_); i++) { - std::string var_name = platform::GetFlatBKCLVarName(i); - auto bkcl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - bkcl_id_var, - platform::errors::NotFound("can't find %s bkcl_id_var", var_name)); - auto bkcl_id = bkcl_id_var->GetMutable(); - flat_bkcl_ids.push_back(bkcl_id); - } - - bkcl_ctxs_->InitFlatCtxs( - places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); - } - - void InitOrGetBKCLCommunicator(framework::Scope *scope, - const BuildStrategy &bst) { - const std::string var_name = "BKCLCommunicator"; - auto var = scope->FindVar(var_name); - if (var != nullptr) { - PADDLE_ENFORCE_EQ(var->IsInitialized(), - true, - platform::errors::PreconditionNotMet( - "if %s exists, it must be initialized", var_name)); - VLOG(1) << "find " << var_name - << " in scope, so use it and does not recreate!"; - bkcl_ctxs_ = var->GetMutable(); - return; - } - - VLOG(1) << "not find " << var_name << " in scope, so recreate it!"; - bkcl_ctxs_ = scope->Var(var_name)->GetMutable(); - InitBKCLCtxs(scope, bst); - } -#endif - - inline bool IsPersistable(const std::string &name) const { - auto iter = is_persistable_.find(name); - return iter != is_persistable_.end() && iter->second; - } - - BuildStrategy build_strategy_; - std::vector places_; - std::vector local_scopes_; - std::vector local_exec_scopes_; - Scope *global_scope_; // not owned - std::unique_ptr executor_; - - std::unordered_map is_persistable_; - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - platform::NCCLCommunicator *nccl_ctxs_{nullptr}; -#elif defined(PADDLE_WITH_XPU_BKCL) - platform::BKCLCommunicator *bkcl_ctxs_{nullptr}; -#endif - bool own_local_scope_; - DeviceType use_device_; - bool use_all_reduce_; - size_t nranks_; - - ir::MemOptVarInfoMapList mem_opt_var_infos_; - ir::GarbageCollectorMap gcs_; - - details::ParallelSSAGraphExecutor *inference_executor_{nullptr}; -}; - -bool ParallelExecutorPrivate::IsUseCUDA(DeviceType use_device) { - return use_device 
== p::kCUDA; -} - -void ParallelExecutorPrivate::SetHasFeed(size_t dev_idx, bool has_feed) { - if (inference_executor_) { - inference_executor_->SetHasFeed(dev_idx, has_feed); - } -} - -bool ParallelExecutorPrivate::AllowPartialFeed() const { - return inference_executor_ && inference_executor_->SupportPartialFeed(); -} - -ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { - /** - * NOTE(zengjinle): If BuildStrategy.memory_optimize = None in Python, - * set BuildStrategy.memory_optimize according to whether gc is enabled. - * If gc is enabled, BuildStrategy.memory_optimize = False. - * If gc is disabled, BuildStrategy.memory_optimize = True. - * This is because gc+memory_optimize is worse than gc only. - * - * As an option, users can enable BuildStrategy.memory_optimize forcely - * by setting True, and disable it forcely by setting False. - */ - bool is_gc_enabled = (GetEagerDeletionThreshold() >= 0); - if (!build_strategy_.memory_optimize_) { - build_strategy_.memory_optimize_ = !is_gc_enabled; - } - - bool need_mem_opt = build_strategy_.enable_inplace_ || - build_strategy_.enable_addto_ || - build_strategy_.memory_optimize_.get() || is_gc_enabled; - - if (!need_mem_opt) return graph; - - std::vector last_live_ops_of_vars; - - auto ref_cnt_pass = ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); - ref_cnt_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); - graph = ref_cnt_pass->Apply(graph); - VLOG(10) << "ReferenceCountPass Applied"; - - if (build_strategy_.enable_addto_) { - auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass"); - addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); - addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); - addto_pass->Set(ir::kUseCuda, new bool(use_device_ == p::kCUDA)); - VLOG(10) << "Start to apply inplace_addto_op_pass"; - graph = addto_pass->Apply(graph); - VLOG(10) << "inplace_addto_op_pass Applied"; - } - - if (build_strategy_.enable_inplace_) { - auto inplace_pass = - ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass"); - inplace_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); - inplace_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); - inplace_pass->Set(ir::kUseCuda, new bool(use_device_ == p::kCUDA)); - VLOG(10) << "Start to apply buffer_shared_inplace_pass"; - graph = inplace_pass->Apply(graph); - VLOG(10) << "buffer_shared_inplace_pass Applied"; - VLOG(1) << "Inplace strategy is enabled, when " - "build_strategy.enable_inplace = True"; - } - - if (build_strategy_.memory_optimize_.get()) { - auto cross_op_memory_reuse_pass = ir::PassRegistry::Instance().Get( - "buffer_shared_cross_op_memory_reuse_pass"); - cross_op_memory_reuse_pass->SetNotOwned(ir::kMemOptVarInfoMapList, - &mem_opt_var_infos_); - cross_op_memory_reuse_pass->SetNotOwned(ir::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - cross_op_memory_reuse_pass->Set(ir::kUseCuda, - new bool(use_device_ == p::kCUDA)); - VLOG(10) << "Start to apply buffer_shared_cross_op_memory_reuse_pass"; - graph = cross_op_memory_reuse_pass->Apply(graph); - VLOG(10) << "buffer_shared_cross_op_memory_reuse_pass Applied"; - LOG(INFO) << "Cross op memory reuse strategy is enabled, when " - "build_strategy.memory_optimize = True or garbage collection " - "strategy is disabled, which is not recommended"; - } - - if (!is_gc_enabled) { - return graph; - } - size_t 
max_memory_size = static_cast(GetEagerDeletionThreshold()); - - for (size_t i = 0; i < places_.size(); ++i) { - auto &place = places_[i]; - if (gcs_.count(place) > 0) { - continue; - } - std::unique_ptr gc; - if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (IsFastEagerDeletionModeEnabled()) { - gc = std::make_unique(place, - max_memory_size); - } else { - gc = std::make_unique(place, max_memory_size); - } - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use CUDA device since it's not compiled with CUDA," - "Please recompile or reinstall Paddle with GPU support.")); -#endif - } else if (platform::is_xpu_place(place)) { -#if defined(PADDLE_WITH_XPU) - gc = std::make_unique(place, max_memory_size); - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else if (platform::is_ipu_place(place)) { -#if defined(PADDLE_WITH_IPU) - gc = std::make_unique(place, max_memory_size); - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use IPU device since it's not compiled with IPU," - "Please recompile or reinstall Paddle with IPU support.")); -#endif - } else if (platform::is_custom_place(place)) { -#if defined(PADDLE_WITH_CUSTOM_DEVICE) - if (IsFastEagerDeletionModeEnabled()) { - gc = std::make_unique( - place, max_memory_size); - } else { - gc = std::make_unique(place, - max_memory_size); - } - VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use custom device since it's not compiled with " - "CustomDevice," - "Please recompile or reinstall Paddle with CustomDevice support.")); -#endif - } else if (platform::is_cpu_place(place)) { - gc = std::make_unique(place, max_memory_size); - VLOG(10) << "Created GarbageCollector at " << place; - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Unsupported place for garbage collection")); - } - gcs_.emplace(place, std::move(gc)); - } - - if (!gcs_.empty()) { - auto eager_deletion_pass = - ir::PassRegistry::Instance().Get("eager_deletion_pass"); - eager_deletion_pass->SetNotOwned(ir::kMemOptVarInfoMapList, - &mem_opt_var_infos_); - eager_deletion_pass->SetNotOwned(ir::kGarbageCollector, &gcs_); - eager_deletion_pass->SetNotOwned(ir::kLastLiveOpsOfVars, - &last_live_ops_of_vars); - eager_deletion_pass->SetNotOwned(ir::kAllPlaces, &places_); - graph = eager_deletion_pass->Apply(graph); - VLOG(10) << "EagerDeletionPass Applied"; - VLOG(1) << "Garbage collection strategy is enabled, when " - << "FLAGS_eager_delete_tensor_gb = " - << FLAGS_eager_delete_tensor_gb; - } - return graph; -} - -class ResetHasFeedGuard { - public: - explicit ResetHasFeedGuard(ParallelExecutorPrivate *pe_member) - : pe_member_(pe_member) {} - - ~ResetHasFeedGuard() { - for (size_t i = 0; i < pe_member_->places_.size(); ++i) { - pe_member_->SetHasFeed(i, false); - } - } - - private: - ParallelExecutorPrivate *pe_member_; -}; - -size_t ParallelExecutor::DeviceCount() const { return member_->places_.size(); } - -std::vector &ParallelExecutor::GetLocalScopes() { - return member_->local_scopes_; -} - -void ParallelExecutor::DropLocalExeScopes() { 
- auto executor = dynamic_cast( - member_->executor_.get()); - if (executor) { - executor->DropLocalExeScopes(); - } -} - -bool ParallelExecutor::NeedCreateLocalExeScope() { - auto executor = dynamic_cast( - member_->executor_.get()); - return executor && executor->NeedCreateLocalExeScope(); -} - -void InitP2PInPE(const std::vector &places) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::call_once(p2p_init_flag_pe, [&]() { - int count = places.size(); - if (count <= 1) return; - - std::vector devices; - for (int i = 0; i < count; i++) { - if (!platform::is_gpu_place(places[i])) return; - - platform::CUDAPlace device = places[i]; - devices.push_back(device.GetDeviceId()); - } - - for (int i = 0; i < count; ++i) { - for (int j = 0; j < count; ++j) { - if (devices[i] == devices[j]) continue; - int can_access = -1; -#ifdef PADDLE_WITH_HIP - hipError_t ret = - hipDeviceCanAccessPeer(&can_access, devices[i], devices[j]); - if (ret != hipSuccess || can_access != 1) { -#else - cudaError_t ret = - cudaDeviceCanAccessPeer(&can_access, devices[i], devices[j]); - if (ret != cudaSuccess || can_access != 1) { -#endif - LOG(WARNING) << "Cannot enable P2P access from " << devices[i] - << " to " << devices[j]; - } else { - platform::CUDADeviceGuard guard(devices[i]); -#ifdef PADDLE_WITH_HIP - hipDeviceEnablePeerAccess(devices[j], 0); -#else - cudaDeviceEnablePeerAccess(devices[j], 0); -#endif - } - } - } - VLOG(1) << "init p2p"; - }); -#endif -} - -ParallelExecutor::ParallelExecutor(const std::vector &places, - const std::vector &bcast_vars, - const std::string &loss_var_name, - Scope *scope, - const std::vector &local_scopes, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - ir::Graph *graph) - : member_(new ParallelExecutorPrivate(places, scope)), - async_graphs_(), - var_infos_() { - PADDLE_ENFORCE_EQ(!places.empty(), - true, - platform::errors::Unavailable( - "NPU is not supported in ParallelExecutor.")); - InitP2PInPE(places); - ir::InitReaderQueueDeviceCount( - graph, *(member_->global_scope_), member_->places_.size()); - // Initialize necessary info of member_ with strategy. - InitExecutorPrivateMemberInfo( - exec_strategy, build_strategy, places.size(), *graph); - - // Step 1. Create local scopes and Clone graph into multi device - CreateLocalScopes(scope, local_scopes, /*create_new*/ true); - std::vector graphs = CloneGraphToMultiDevices(graph); - PrepareNCCLCommunicator(scope); - - // broadcast parameters from the 0th device to others: - auto need_broadcast = [&]() -> bool { - if (member_->build_strategy_.num_trainers_ > 1) { // NOLINT - // 1. num_tariners would be grater than 1 for nccl distributed training. - return true; - } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { - // 2. Only one trainer process, but ParallelExecutor hold multiple - // devices. - return true; - } - return false; - }; - if (need_broadcast()) { - BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_); - } - - // Step 2. Convert main_program to SSA form and dependency graph. Also, insert - // ncclOp - std::vector async_graphs = - CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name); - PrepareForCUDAGraphCapture(graph); - graph = member_->ApplyMemoryOptimizePass(graph); - async_graphs[0] = graph; - - // Step 3. Create vars in each scope. Passes may also create new vars. 
- // skip control vars and empty vars - // std::vector var_infos; - // CreateVariableInfos(&var_infos, graph); - // std::unordered_map scope_map = - // CreateLocalExecScopes(member_->local_scopes_, /*create_new*/ true); - - // Step 4. Create SSAGraph executor - /* std::vector final_graphs = - CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - - VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - if (!member_->build_strategy_.async_mode_) { - member_->executor_ = - std::make_unique( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - std::move(var_infos), - member_->places_, - std::move(member_->executor_)); - } - - ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); - SetReaderOpDeviceInfoOfGraphs(final_graphs); */ -} - -ParallelExecutor::ParallelExecutor(const platform::Place &place, - Scope *scope, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - ir::Graph *graph) - : member_(new ParallelExecutorPrivate({place}, scope)), - async_graphs_(), - var_infos_() { - // Initialize necessary info of member_ with strategy. - InitExecutorPrivateMemberInfo(exec_strategy, - build_strategy, - /*device_count=*/1, - *graph); - - CreateLocalScopes(scope, /*local_scopes=*/{scope}, /*create_new=*/false); - - // Apply BuildStrategy to compile graph. - std::vector graphs = {graph}; - std::vector async_graphs = - CompileGraphWithBuildStrategy(graph, &graphs, /*loss_var_name=*/""); - - graph = member_->ApplyMemoryOptimizePass(graph); - - // Create vars in each scope. Passes may also create new vars. - // skip control vars and empty vars - CreateVariableInfos(&var_infos_, graph); - - // Create local execution scopes - std::unordered_map scope_map = - CreateLocalExecScopes(member_->local_scopes_, /*create_new=*/false); - - std::vector final_graphs = - CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - - // Set scope_map of op from each graph - ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); -} - -void ParallelExecutor::PrepareVariables(Scope *scope) { - for (auto &info : var_infos_) { - auto var = scope->FindVar(info.name_); - if (var != nullptr) { - VLOG(2) << info.name_ - << " has been initialized beforehand in global scope, skipped."; - continue; - } - framework::InitializeVariable(scope->Var(info.name_), info.type_); - } -} - -void ParallelExecutor::BCastParamsToDevices( - const std::vector &vars, int trainer_id) const { - VLOG(3) << "BCastParamsToDevices"; - // the initializing bcast, all vars would be bcast from device(0). 
- for (auto &var : vars) { - framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); - if (main_var == nullptr || !main_var->IsType()) { - continue; - } - - auto &main_tensor = main_var->Get(); - if (!main_tensor.IsInitialized()) { - VLOG(3) << "one in var not inited, return!"; - continue; - } - auto &dims = main_tensor.dims(); - if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - auto dtype = framework::TransToProtoVarType(main_tensor.dtype()); - ncclDataType_t data_type = platform::ToNCCLDataType(dtype); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.dtype()); - } - buffers.push_back(buffer); - } - - PADDLE_ENFORCE_EQ(member_->places_.size(), - buffers.size(), - platform::errors::PreconditionNotMet( - "variables' buffer size to bcast is %d, which is " - "NOT equal to places size %d", - buffers.size(), - member_->places_.size())); - if (member_->nccl_ctxs_ != nullptr) { - auto *nccl_ctxs = member_->nccl_ctxs_->DefaultFlatCtx(); - platform::NCCLGroupGuard guard; - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]); - platform::dynload::ncclBcast(buffers[i], - numel, - data_type, - 0, - nccl_ctx.comm_, - nccl_ctx.stream()); - } - nccl_ctxs->WaitAll(); - } else { - auto src_place = member_->places_[0]; - auto src_dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(src_place)); - auto sizeof_dtype = framework::SizeOfType(dtype) * numel; - for (size_t i = 1; i < member_->places_.size(); ++i) { - auto dst_place = member_->places_[i]; - auto dst_dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(dst_place)); - src_dev_ctx->Wait(); - dst_dev_ctx->Wait(); - memory::Copy(dst_place, - buffers[i], - src_place, - buffers[0], - sizeof_dtype, - src_dev_ctx->stream()); - src_dev_ctx->Wait(); - dst_dev_ctx->Wait(); - } - } -#endif - } else if (paddle::platform::is_xpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_XPU_BKCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - auto dtype = framework::TransToProtoVarType(main_tensor.dtype()); - BKCLDataType data_type = platform::ToBKCLDataType(dtype); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.dtype()); - } - buffers.push_back(buffer); - } - - PADDLE_ENFORCE_EQ(member_->places_.size(), - buffers.size(), - platform::errors::PreconditionNotMet( - "variables' buffer size to bcast is %d, which is " - "NOT equal to places size %d", - buffers.size(), - member_->places_.size())); - { - auto *bkcl_ctxs = member_->bkcl_ctxs_->DefaultFlatCtx(); - platform::BKCLGroupGuard guard; - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[i]); - 
PADDLE_ENFORCE_EQ( - bkcl_broadcast(bkcl_ctx.comm(), - buffers[i], - buffers[i], - numel, - data_type, - 0, - NULL), - BKCL_SUCCESS, - platform::errors::Unavailable("bkcl_broadcast failed")); - } - bkcl_ctxs->WaitAll(); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with BKCL.")); -#endif - } else { - platform::CPUPlace cpu; - for (size_t i = 1; i < member_->places_.size(); ++i) { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - - auto copy_memory = [&] { - t->Resize(dims); - t->mutable_data(cpu, main_tensor.dtype()); - paddle::framework::TensorCopy(main_tensor, cpu, t); - }; - - auto share_memory = [&] { t->ShareDataWith(main_tensor); }; - - // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. - if (member_->use_all_reduce_ || - member_->IsUseCUDA(member_->use_device_) || - var == "@LR_DECAY_COUNTER@") { - copy_memory(); - } else { - share_memory(); - } - } - } - } -} - -FetchUnmergedList ParallelExecutor::Run( - const std::vector &fetch_tensors) { - LOG_FIRST_N(INFO, 1) << "ParallelExecutor is Running (Run)."; - PreludeToRun(fetch_tensors); - platform::RecordBlock b(0); - - ResetHasFeedGuard reset_has_feed_guard(member_); - - ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), - fetch_tensors, - member_->HasGarbageCollectors()); - - VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; - auto fetch_data = - member_->executor_->Run(fetch_tensors, /*return_merged=*/false); - return PADDLE_GET(FetchUnmergedList, fetch_data); -} - -FetchList ParallelExecutor::RunAndMerge( - const std::vector &fetch_tensors) { - LOG_FIRST_N(INFO, 1) << "ParallelExecutor is Running (RunAndMerge)."; - PreludeToRun(fetch_tensors); - platform::RecordBlock b(0); - - ResetHasFeedGuard reset_has_feed_guard(member_); - - ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), - fetch_tensors, - member_->HasGarbageCollectors()); - - VLOG(3) << "ParallelExecutor begin to run member_->executor_->RunAndMerge"; - auto fetch_data = - member_->executor_->Run(fetch_tensors, /*return_merged=*/true); - return PADDLE_GET(FetchList, fetch_data); -} - -void ParallelExecutor::RunWithoutFetch( - const std::vector &skip_eager_vars) { - VLOG(3) << "enter ParallelExecutor RunWithoutFetch"; -#ifdef WITH_GPERFTOOLS - if (gProfileStarted) { - ProfilerFlush(); - } -#endif - platform::RecordBlock b(0); - - ResetHasFeedGuard reset_has_feed_guard(member_); - - ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), - skip_eager_vars, - member_->HasGarbageCollectors()); - - VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; - member_->executor_->Run(/*fetch_tensors*/ {}, /*return_merged*/ false); -} - -void ParallelExecutor::SkipMemoryReuse( - size_t scope_idx, const std::vector &skip_vars) { - for (auto &var_name : skip_vars) { - bool is_persistable = member_->IsPersistable(var_name); - if (!is_persistable) { - VLOG(3) << "SkipMemoryReuse for var: " << var_name; - member_->SetSkipMemoryReuse(scope_idx, var_name); - } - } -} - -void ParallelExecutor::FeedTensorsIntoLocalScopes( - const std::vector> - &tensors) { - if (platform::IsCUDAGraphCapturing()) { - for (auto &tensor : tensors) { - PADDLE_ENFORCE_EQ( - tensor.empty(), - true, - platform::errors::PermissionDenied( - "Feeding data is not permitted when capturing CUDA Graph.")); - } - return; - } - - if (!member_->AllowPartialFeed()) { - PADDLE_ENFORCE_EQ(tensors.size(), - member_->local_scopes_.size(), - 
platform::errors::Unimplemented( - "The feed data number %d does not match the device " - "number %d. If you are using DataLoader to feed " - "data, this may be because you set drop_last=False " - "in training network. Currently, drop_last=False for " - "DataLoader is not supported for training network. " - "Please set drop_last=True when defining DataLoader.", - tensors.size(), - member_->local_scopes_.size())); - } else { - PADDLE_ENFORCE_GE(member_->local_scopes_.size(), - tensors.size(), - platform::errors::InvalidArgument( - "The feed tensor number exceeds the device number")); - } - - size_t feed_num = 0; - for (size_t i = 0; i < tensors.size(); ++i) { - auto &map = tensors[i]; - if (map.empty()) { - continue; - } - - member_->SetHasFeed(i); - ++feed_num; - for (auto &pair : map) { - bool is_persistable = member_->IsPersistable(pair.first); - if (!is_persistable) { - member_->SetSkipMemoryReuse(i, pair.first); - } - auto *feed_scope = is_persistable ? member_->local_scopes_[i] - : member_->local_exec_scopes_[i]; - auto *feed_var = feed_scope->Var(pair.first); - - auto *trg = feed_var->GetMutable(); - trg->ShareDataWith(pair.second); - trg->set_lod(pair.second.lod()); - } - } - - if (!member_->AllowPartialFeed()) { - PADDLE_ENFORCE_EQ(feed_num, - member_->local_scopes_.size(), - platform::errors::Unimplemented( - "The feed data number %d does not match the device " - "number %d. If you are using DataLoader to feed " - "data, this may be because you set drop_last=False " - "in training network. Currently, drop_last=False for " - "DataLoader is not supported for training network. " - "Please set drop_last=True when defining DataLoader.", - feed_num, - member_->local_scopes_.size())); - } -} - -void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( - const std::unordered_map &tensors) { - if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ( - tensors.empty(), - true, - platform::errors::PermissionDenied( - "Feeding data is not permitted when capturing CUDA Graph.")); - return; - } - - size_t num_places = member_->places_.size(); - bool allow_partial_feed = member_->AllowPartialFeed(); - - size_t persistable_feed_len = -1UL; - size_t non_persistable_feed_len = -1UL; - - for (auto &pair : tensors) { - bool is_persistable = member_->IsPersistable(pair.first); - VLOG(3) << "Split " << (is_persistable ? "persistable" : "no persistable") - << " data (" << pair.first << "), dim:" << pair.second.dims() - << ", place: " << pair.second.place(); - auto lod_tensors = SplitLoDTensor(pair.second, member_->places_); - bool is_cpu_place = platform::is_cpu_place(member_->places_.front()); - if (!is_persistable && num_places != lod_tensors.size() && - !allow_partial_feed) { - auto error_info = string::Sprintf( - "The number(%d) of samples[%s] of current batch is less than the " - "count(%d) of devices(%s), currently, it is not allowed. ", - lod_tensors.size(), - pair.first, - num_places, - (is_cpu_place ? 
"CPU" : "GPU")); - if (is_cpu_place) { - error_info += - "You should set the environment variable CPU_NUM in the system " - "to determine the number of devices you need."; - } - PADDLE_THROW(platform::errors::PreconditionNotMet(error_info)); - } else if (is_persistable) { - if (lod_tensors.size() == 1) { - lod_tensors.reserve(num_places); - auto &tensor = lod_tensors.front(); - PADDLE_ENFORCE_EQ( - tensor.dims(), - pair.second.dims(), - platform::errors::PreconditionNotMet("The dim doesn't match.")); - PADDLE_ENFORCE_EQ( - tensor.place(), - member_->places_.at(0), - platform::errors::PreconditionNotMet("The place doesn't match.")); - for (size_t i = 1; i < num_places; ++i) { - lod_tensors.emplace_back(); - auto &tmp = lod_tensors.back(); - framework::TensorCopy(pair.second, member_->places_.at(i), &tmp); - } - } - if (lod_tensors.size() != num_places && !allow_partial_feed) { - auto error_info = string::Sprintf( - "The number(%d) of samples[%s] of the current batch does not match " - "the count(%d) of devices(%s). Because that %s is a persistable " - "variable, you can feed just one sample, in that case, the input " - "sample will be copied in %d copies and be sent to different " - "places separately. If you need that different place has different " - "value, you should feed %d samples.", - lod_tensors.size(), - pair.first, - num_places, - (is_cpu_place ? "CPU" : "GPU"), - pair.first, - num_places, - num_places); - PADDLE_THROW(platform::errors::PreconditionNotMet(error_info)); - } - } - - if (allow_partial_feed) { - if (is_persistable) { - if (persistable_feed_len == -1UL) { - persistable_feed_len = lod_tensors.size(); - } else { - PADDLE_ENFORCE_EQ( - persistable_feed_len, - lod_tensors.size(), - platform::errors::InvalidArgument( - "The feeded number of different persistable variables " - "should be the same")); - } - } else { - if (non_persistable_feed_len == -1UL) { - non_persistable_feed_len = lod_tensors.size(); - } else { - PADDLE_ENFORCE_EQ( - non_persistable_feed_len, - lod_tensors.size(), - platform::errors::InvalidArgument( - "The feeded number of different non-persistable variables " - "should be the same")); - } - } - } - - for (size_t j = 0; j < lod_tensors.size(); ++j) { - auto *feed_scope = is_persistable ? 
member_->local_scopes_[j] - : member_->local_exec_scopes_[j]; - auto *feed_var = feed_scope->Var(pair.first); - - auto t = feed_var->GetMutable(); - t->ShareDataWith(lod_tensors[j]); - t->set_lod(lod_tensors[j].lod()); - } - } - - if (allow_partial_feed && persistable_feed_len != -1UL && - non_persistable_feed_len != -1UL) { - VLOG(10) << "Persistable len " << persistable_feed_len; - VLOG(10) << "Non persistable len " << non_persistable_feed_len; - PADDLE_ENFORCE_GE(persistable_feed_len, - non_persistable_feed_len, - platform::errors::InvalidArgument( - "The feeded number of persistable variables should " - "not be less than non-persistable variables")); - } - - if (non_persistable_feed_len != -1UL) { - for (size_t i = 0; i < non_persistable_feed_len; ++i) { - member_->SetHasFeed(i); - } - } -} - -ParallelExecutor::~ParallelExecutor() { - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - delete member_; -} - -bool ParallelExecutor::EnableParallelGraphExecution( - const ir::Graph &graph, const BuildStrategy &build_strategy) const { - return false; - - bool enable_parallel_graph = true; - - for (ir::Node *node : graph.Nodes()) { - if (node->IsVar() && node->Var()) { - // TODO(Yancey1989): support sparse update in ParallelGraph mode. - if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) { - enable_parallel_graph = false; - break; - } - } else if (node->IsOp() && node->Op()) { - // TODO(Yancey1989): support pserver mode - if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") { - enable_parallel_graph = false; - break; - } - } - } - - if (!member_->use_all_reduce_ || !member_->IsUseCUDA(member_->use_device_)) { - if (build_strategy.enable_sequential_execution_) { - enable_parallel_graph = false; - } - } - -#ifdef WIN32 - VLOG(1) << "Windows has no support to parallel graph, enable_parallel_graph " - "would be forced to false."; - enable_parallel_graph = false; -#endif - - return enable_parallel_graph; -} - -void ParallelExecutor::InitExecutorPrivateMemberInfo( - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - size_t device_count, - const ir::Graph &graph) { - member_->use_device_ = exec_strategy.use_device_; - member_->build_strategy_ = build_strategy; - member_->use_all_reduce_ = member_->build_strategy_.reduce_ == - BuildStrategy::ReduceStrategy::kAllReduce; - member_->nranks_ = build_strategy.num_trainers_ * device_count; - if (!member_->use_all_reduce_ && member_->nranks_ == 1) { - LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," - "the number of places should be greater than 1."; - member_->build_strategy_.reduce_ = - BuildStrategy::ReduceStrategy::kAllReduce; - member_->use_all_reduce_ = true; - } - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - device_count, - 1, - platform::errors::Unavailable("Windows can support Single GPU only.")); - } -#endif - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - device_count, - 1, - platform::errors::PermissionDenied( - "Your machine has multiple cards, " - "but the WITH_NCCL option is not turned on during compilation, " - "and you cannot use multi-card training or prediction. 
" - "Please recompile and turn on the WITH_NCCL option.")); - } -#endif - - std::string device_name; - if (member_->use_device_ == p::kCPU) { - device_name = "CPU"; - } else if (member_->use_device_ == p::kCUDA) { - device_name = "CUDA"; - } else if (member_->use_device_ == p::kXPU) { - device_name = "XPU"; - } else { - PADDLE_THROW( - platform::errors::Unavailable("Only CPU/CUDA/XPU is supported. " - "please use CPU/CUDA/XPU backend.")); - } - - VLOG(1) << string::Sprintf( - "The Program will be executed on %s using ParallelExecutor, %lu " - "cards are used, so %lu programs are executed in parallel.", - device_name, - device_count, - device_count); - // FIXME(Yancey1989): parallel graph mode get better performance - // in GPU allreduce distributed training. Need an elegant way to - // choice the execution strategy. - member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(graph, member_->build_strategy_); - if (member_->build_strategy_.enable_parallel_graph_) { - LOG(INFO) << "The Executor would execute the graph by ParallelGraph " - "Execution which can get better performance," - << "you can force it off by env FLAGS_enable_parallel_graph=0"; - } -} - -void ParallelExecutor::CreateLocalScopes( - Scope *global_scope, - const std::vector &local_scopes, - bool create_new) { - if (local_scopes.empty()) { - member_->own_local_scope_ = true; - member_->local_scopes_.emplace_back(global_scope); - for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&global_scope->NewScope()); - } - } else { - member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), - local_scopes.size(), - platform::errors::PreconditionNotMet( - "member_->places_.size() = %d is not equal to " - "local_scopes.size() = %d", - member_->places_.size(), - local_scopes.size())); - for (size_t i = 0; i < member_->places_.size(); ++i) { - if (create_new) { - member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); - } else { - // Use local scopes directly - member_->local_scopes_.emplace_back(local_scopes[i]); - } - } - } -} - -std::unordered_map ParallelExecutor::CreateLocalExecScopes( - const std::vector &local_scopes, bool create_new) { - std::unordered_map scope_map; - - for (auto *scope : local_scopes) { - Scope *local_exec_scope = scope; - if (create_new) { - local_exec_scope = &scope->NewScope(); - } - member_->local_exec_scopes_.emplace_back(local_exec_scope); - scope_map.emplace(scope, local_exec_scope); - } - - PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), - member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), - member_->local_exec_scopes_.size())); - - return scope_map; -} - -std::vector ParallelExecutor::CloneGraphToMultiDevices( - ir::Graph *graph) { - std::vector graphs; - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), - false, - platform::errors::Unavailable( - "gpu mode does not support async_mode_ now!")); - graphs.push_back(graph); - for (size_t i = 1; i < member_->places_.size(); ++i) { - auto *tmp_graph = new ir::Graph(graph->OriginProgram()); - async_graphs_.emplace_back(tmp_graph); - graphs.push_back(tmp_graph); - } - } - - return graphs; -} - -void ParallelExecutor::PreludeToRun( - const std::vector &fetch_tensors) { - platform::RecordEvent record_run( - "ParallelExecutor::Run", 
platform::TracerEventType::UserDefined, 1); - VLOG(3) << "enter ParallelExecutor Run"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ(fetch_tensors.empty(), - true, - platform::errors::InvalidArgument( - "Cannot fetch data when using CUDA Graph.")); - PADDLE_ENFORCE_EQ( - member_->build_strategy_.allow_cuda_graph_capture_, - true, - platform::errors::InvalidArgument( - "You must turn on build_strategy.allow_cuda_graph_capture = True " - "to enable CUDA Graph capturing.")); - PADDLE_ENFORCE_EQ( - member_->places_[0], - platform::CUDAGraphCapturingPlace(), - platform::errors::InvalidArgument("The place to capture CUDAGraph is " - "not the same as the place to run.")); - } -#endif - -#ifdef WITH_GPERFTOOLS - if (gProfileStarted) { - ProfilerFlush(); - } -#endif -} - -void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { - if (member_->build_strategy_.reduce_ == - BuildStrategy::ReduceStrategy::kNoReduce) { - return; - } - - if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); - - // Initialize device context's nccl comm, will be used by normal - // Operators like sync_batch_norm, and collective ops. - // NOTE: more than one ParallelExecutor with same place, the nccl comm will - // be rewrite and there will be some problem. - // NOTE: NCCL group-calls and non-group-calls can not use the same - // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use - // same communicators. - auto *nccl_ctxs = member_->nccl_ctxs_->GetSyncBatchNormCtx( - global_scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (auto &place : member_->places_) { - auto *dev_ctx = static_cast(pool.Get(place)); - auto &nccl_ctx = nccl_ctxs->at(place); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with CUDA.")); -#endif - } - if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_XPU_BKCL) - member_->InitOrGetBKCLCommunicator(global_scope, member_->build_strategy_); - - auto *bkcl_ctxs = member_->bkcl_ctxs_->GetSyncBatchNormCtx( - global_scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->SetBkclContext(bkcl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with XPU.")); -#endif - } -} - -std::vector ParallelExecutor::CompileGraphWithBuildStrategy( - ir::Graph *graph, - std::vector *device_graphs, - const std::string &loss_var_name) { - auto device_count = member_->places_.size(); - std::vector async_graphs(device_count); - - auto &graphs = *device_graphs; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(graphs.size(), - device_count, - platform::errors::PreconditionNotMet( - "graphs.size() should be %d, but received %d", - device_count, - graphs.size())); - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply(graph, // NOLINT - {member_->places_[0]}, - loss_var_name, - {member_->local_scopes_[0]}, - 1, - member_->use_device_, - 
member_->nccl_ctxs_); - for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply(graphs[i], - {member_->places_[i]}, - loss_var_name, - {member_->local_scopes_[i]}, - 1, - member_->use_device_, - member_->nccl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply(graph, // NOLINT - member_->places_, - loss_var_name, - member_->local_scopes_, - member_->nranks_, - member_->use_device_, - member_->nccl_ctxs_); - } -#elif defined(PADDLE_WITH_XPU_BKCL) - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(graphs.size(), - device_count, - platform::errors::PreconditionNotMet( - "graphs.size() should be %d, but received %d", - device_count, - graphs.size())); - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply(graph, - {member_->places_[0]}, - loss_var_name, - {member_->local_scopes_[0]}, - 1, - member_->use_device_, - member_->bkcl_ctxs_); - for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply(graphs[i], - {member_->places_[i]}, - loss_var_name, - {member_->local_scopes_[i]}, - 1, - member_->use_device_, - member_->bkcl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply(graph, - member_->places_, - loss_var_name, - member_->local_scopes_, - member_->nranks_, - member_->use_device_, - member_->bkcl_ctxs_); - } -#else - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply(graph, - {member_->places_[0]}, - loss_var_name, - {member_->local_scopes_[0]}, - 1, - member_->use_device_); - for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply(graphs[i], - {member_->places_[i]}, - loss_var_name, - {member_->local_scopes_[i]}, - 1, - member_->use_device_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply(graph, - member_->places_, - loss_var_name, - member_->local_scopes_, - member_->nranks_, - member_->use_device_); - } -#endif - - return async_graphs; -} - -void ParallelExecutor::CreateVariableInfos( - std::vector *var_infos, ir::Graph *graph) { - PADDLE_ENFORCE_EQ( - var_infos->size(), - 0, - platform::errors::PreconditionNotMet( - "var_infos->size() should be 0, but received %d", var_infos->size())); - PADDLE_ENFORCE_EQ( - member_->is_persistable_.size(), - 0, - platform::errors::PreconditionNotMet( - "member_->is_persistable_.size() should be 0, but received %d", - member_->is_persistable_.size())); - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos->emplace_back(); - var_infos->back().name_ = node->Var()->Name(); - var_infos->back().type_ = node->Var()->GetType(); - var_infos->back().persistable_ = node->Var()->Persistable(); - - member_->is_persistable_.emplace(node->Var()->Name(), - node->Var()->Persistable()); - } - } - - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - var_infos->emplace_back(); - var_infos->back() = fused_var.second; - - member_->is_persistable_.emplace(fused_var.first, - fused_var.second.persistable_); - } - } -} - -std::vector ParallelExecutor::CreateSSAGraphExecutor( - const ExecutionStrategy &exec_strategy, - std::vector *async_graphs, - ir::Graph *graph) { - std::vector final_graphs; - - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_ = 
std::make_unique( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - *async_graphs); - final_graphs = *async_graphs; - } else if (member_->build_strategy_.enable_parallel_graph_) { - VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Yancey1989): Remove passing in the main_program when - // allreduce_seq_pass doesn't need it as the attr. - bool is_inference = details::IsDataParallelInferenceGraph(*graph); - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - - auto *pg_exe = - new details::ParallelSSAGraphExecutor(exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - graph); - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - - if (is_inference && member_->places_.size() > 1) { - member_->inference_executor_ = pg_exe; - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle should be compiled with CUDA for ParallelGraph Execution.")); -#endif - } else { - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto possible_inference_graphs = - details::TrySeparateToMultipleSingleDeviceGraphs(graph); - if (!possible_inference_graphs.empty()) { - for (auto &g : possible_inference_graphs) { - member_->ApplyFixOpRunOrderPass(g.get()); - } - - VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - std::move(possible_inference_graphs)); - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - member_->inference_executor_ = pg_exe; - } else { - if (member_->places_.size() == 1) { - member_->ApplyFixOpRunOrderPass(graph); - } - LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) - << "drop_last=False for DataLoader is not supported in training " - "network. 
It is automatically turned to drop_last=True."; - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_ = - std::make_unique( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - graph); - } else { - if (member_->use_device_ == p::kXPU) { -#if defined(PADDLE_WITH_XPU) - VLOG(3) << "use BindThreadedSSAGraphExecutor"; - member_->executor_ = - std::make_unique( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - graph); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_ = - std::make_unique( - exec_strategy, - member_->local_scopes_, - member_->local_exec_scopes_, - member_->places_, - graph); - } - } - final_graphs.emplace_back(graph); - } - } - return final_graphs; -} - -void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( - const std::vector &final_graphs, - const std::unordered_map &scope_map) { - PADDLE_ENFORCE_GE( - final_graphs.size(), - 1, - platform::errors::PreconditionNotMet( - "final_graphs shoule contain at least one graph, but received %d", - final_graphs.size())); - - PADDLE_ENFORCE_GT(scope_map.size(), - 0, - platform::errors::PreconditionNotMet( - "scope_map shoule contain at least one " - "element, but received %d", - scope_map.size())); - for (auto *g : final_graphs) { - auto ops = ir::FilterByNodeWrapper(*g); - for (auto *op : ops) { - op->SetLocalExecScopes(scope_map); - op->SetIsVariantScope(true); - } - } -} - -void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( - const std::unordered_map &scope_map) { - auto inner_graph = const_cast(&Graph()); - std::vector graphs = {inner_graph}; - ResetOpHandleScopeMapOfGraphs(graphs, scope_map); -} - -void ParallelExecutor::SetReaderOpDeviceInfoOfGraphs( - const std::vector &final_graphs) { - if (final_graphs.size() == 1) { - ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); - } else { - for (size_t i = 0; i < final_graphs.size(); ++i) { - ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); - } - } -} - -const ir::Graph &ParallelExecutor::Graph() const { - return member_->executor_->Graph(); -} - -void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) { - const auto &build_strategy = member_->build_strategy_; - if (!build_strategy.allow_cuda_graph_capture_) return; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - build_strategy.async_mode_, - false, - platform::errors::InvalidArgument( - "Async Executor does not support CUDA Graph capturing.")); - PADDLE_ENFORCE_EQ( - platform::IsCUDAGraphCapturing(), - false, - platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " - "when running the first batch.")); - PADDLE_ENFORCE_EQ( - member_->places_.size(), - 1, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when one GPU device is running.")); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), - true, - platform::errors::InvalidArgument( - "CUDA Graph is only supported on NVIDIA GPU device.")); - PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, - false, - platform::errors::InvalidArgument( - "FLAGS_sync_nccl_allreduce must be False to support " - "CUDA Graph capturing.")); - - std::unordered_map> all_vars; - for (auto &node : 
graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - auto *var_desc = node->Var(); - all_vars[var_desc->Name()].emplace_back(var_desc); - } - } - - auto mark_var_as_persistable = [&all_vars](const std::string &name) { - auto iter = all_vars.find(name); - if (iter != all_vars.end()) { - for (auto *var_desc : iter->second) { - var_desc->SetPersistable(true); - } - } - }; - - // Step 1: All fused vars must be persistable. - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - fused_var.second.persistable_ = true; - mark_var_as_persistable(fused_var.first); - } - } - - // Step 2: All pinned vars must be persistable. - if (graph->Has(details::kPinnedVars)) { - auto &pinned_vars = graph->Get(details::kPinnedVars); - for (auto &pinned_var : pinned_vars) { - mark_var_as_persistable(pinned_var); - } - } - - // Step 3: Move all main programs to startup programs to make sure that - // the main programs would only be run once. - if (graph->Has(details::kProgramDescs)) { - auto &startup_programs = - graph->GetOrInit(details::kStartupProgramDescs); - auto &main_programs = - graph->Get(details::kProgramDescs); - for (auto &main_program : main_programs) { - startup_programs.emplace_back(main_program); - } - graph->Erase(details::kProgramDescs); - } - - // Step 4: Mark all vars in startup programs to be persistable. - if (graph->Has(details::kStartupProgramDescs)) { - auto &startup_programs = - graph->GetOrInit(details::kStartupProgramDescs); - for (auto &startup_program : startup_programs) { - for (auto &op_desc : startup_program.Block(0).AllOps()) { - for (auto &output : op_desc->OutputArgumentNames()) { - mark_var_as_persistable(output); - } - } - } - } - - // Step 5: ScaleLossGrad must be run beforehand to avoid H2D copy. - auto ops = ir::FilterByNodeWrapper(*graph); - auto *scope = member_->local_scopes_[0]; - for (auto *op : ops) { - auto *loss_grad_op = dynamic_cast(op); - if (loss_grad_op == nullptr) continue; - auto loss_grad_name = loss_grad_op->LossGradName(); - mark_var_as_persistable(loss_grad_name); - loss_grad_op->RunOnVar(scope->Var(loss_grad_name)); - loss_grad_op->SetSkipRunning(true); - } -#else - PADDLE_THROW(platform::errors::Unimplemented( - "CUDA Graph is only supported on NVIDIA GPU device.")); -#endif -} - -} // namespace framework -} // namespace paddle - -USE_PASS(reference_count_pass); -USE_PASS(eager_deletion_pass); -USE_PASS(buffer_shared_inplace_pass); -USE_PASS(buffer_shared_cross_op_memory_reuse_pass); -USE_PASS(inplace_addto_op_pass); -USE_PASS(fix_op_run_order_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h deleted file mode 100644 index 43e27e81cd135..0000000000000 --- a/paddle/fluid/framework/parallel_executor.h +++ /dev/null @@ -1,160 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#endif - -namespace paddle { -namespace framework { - -class ParallelExecutorPrivate; - -using details::BuildStrategy; -using details::ExecutionStrategy; -using details::VariableInfo; -namespace p = paddle::platform; -using DeviceType = paddle::platform::DeviceType; - -class ParallelExecutor { - DISABLE_COPY_AND_ASSIGN(ParallelExecutor); - - public: - TEST_API explicit ParallelExecutor(const std::vector &places, - const std::vector &bcast_vars, - const std::string &loss_var_name, - Scope *scope, - const std::vector &local_scopes, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - ir::Graph *graph); - - // NOTE(Aurelius84): Construct a PE running on single device for @to_static - explicit ParallelExecutor(const platform::Place &place, - Scope *scope, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - ir::Graph *graph); - - TEST_API ~ParallelExecutor(); - - size_t DeviceCount() const; - - std::vector &GetLocalScopes(); - - void DropLocalExeScopes(); - - // This API is used to check whether DropLocalExeScopes work. - bool NeedCreateLocalExeScope(); - - /** - * Feed tensors to local scopes. The size of tensors should be equal to the - * size of local scopes. - */ - void FeedTensorsIntoLocalScopes( - const std::vector> - &tensors); - - void FeedAndSplitTensorIntoLocalScopes( - const std::unordered_map &tensors); - - FetchUnmergedList Run(const std::vector &fetch_tensors); - FetchList RunAndMerge(const std::vector &fetch_tensors); - - void RunWithoutFetch(const std::vector &skip_eager_vars); - - void ResetOpHandleScopeMapOfGraphs( - const std::unordered_map &scope_map); - - TEST_API const ir::Graph &Graph() const; - void PrepareVariables(Scope *scope); - - void SkipMemoryReuse(size_t scope_idx, - const std::vector &skip_vars); - - private: - // broadcast the parameters from the 0th device. - // trainer_id the trainer index in nccl distributed training. 
- void BCastParamsToDevices(const std::vector &vars, - int trainer_id = 0) const; - bool EnableParallelGraphExecution(const ir::Graph &graph, - const BuildStrategy &build_strategy) const; - - void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - size_t device_count, - const ir::Graph &graph); - - void CreateLocalScopes(Scope *global_scope, - const std::vector &local_scopes, - bool create_new); - - std::unordered_map CreateLocalExecScopes( - const std::vector &local_scopes, bool create_new); - - std::vector CloneGraphToMultiDevices(ir::Graph *graph); - - void PreludeToRun(const std::vector &fetch_tensors); - - void PrepareNCCLCommunicator(Scope *global_scope); - - std::vector CompileGraphWithBuildStrategy( - ir::Graph *graph, - std::vector *graphs, - const std::string &loss_var_name); - - void CreateVariableInfos(std::vector *var_infos, - ir::Graph *graph); - - std::vector CreateSSAGraphExecutor( - const ExecutionStrategy &exec_strategy, - std::vector *async_graphs, - ir::Graph *graph); - - void ResetOpHandleScopeMapOfGraphs( - const std::vector &final_graphs, - const std::unordered_map &scope_map); - - void SetReaderOpDeviceInfoOfGraphs( - const std::vector &final_graphs); - - void PrepareForCUDAGraphCapture(ir::Graph *graph); - - ParallelExecutorPrivate *member_; - std::vector> async_graphs_; - std::vector var_infos_; -}; -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt index 643145f78ddbf..2ed94b777c066 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -2,7 +2,7 @@ proto_library(paddle_jit_property_proto SRCS property.proto) file(GLOB_RECURSE fluid_jit_srcs "*.cc") set(fluid_jit_deps paddle_jit_property_proto proto_desc executor - parallel_executor compiled_program) + compiled_program) cc_library( fluid_jit diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 494f533a4d4bc..b609e89806d9b 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -23,7 +23,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/executor_cache.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -218,15 +217,14 @@ template class RunProgramOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_THROW(phi::errors::InvalidArgument("Not supported yet!")); const auto &capture_mode = ctx.Attr("cuda_graph_capture_mode"); - auto is_test = ctx.Attr("is_test"); if (capture_mode.empty()) { - ComputeImpl(ctx, is_test, false); return; } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto mode = details::StringToCUDAGraphCaptureMode(capture_mode); + auto is_test = ctx.Attr("is_test"); PADDLE_ENFORCE_EQ( ctx.GetPlace().GetType() == phi::AllocationType::GPU, true, @@ -242,20 +240,6 @@ class RunProgramOpKernel : public framework::OpKernel { inner_graphs.resize(std::max(3, inner_graphs.size())); size_t graph_idx = is_test ? 
0 : 1; if (inner_graphs[graph_idx].get() == nullptr) { - int64_t pool_id; - if (inner_graphs[1 - graph_idx].get() != nullptr) { - pool_id = inner_graphs[1 - graph_idx]->PoolID(); - } else { - pool_id = ctx.Attr("cuda_graph_pool_id"); - } - - framework::PEAndGraphPair pe_and_graph; - auto callable = [this, is_test, &pe_and_graph]( - const framework::ExecutionContext &exe_ctx) { - pe_and_graph = ComputeImpl(exe_ctx, is_test, true); - }; - inner_graphs[graph_idx] = CaptureCUDAGraph( - callable, ctx, {"X"}, {"Out", "DOut"}, mode, pool_id); VLOG(10) << "Capture Forward CUDA Graph"; } else { VLOG(10) << "Run Forward CUDA Graph directly"; @@ -268,162 +252,19 @@ class RunProgramOpKernel : public framework::OpKernel { "valid when using NVIDIA GPU.")); #endif } - - private: - framework::PEAndGraphPair ComputeImpl(const framework::ExecutionContext &ctx, - bool is_test, - bool use_cuda_graph) const { - VLOG(2) << "RunProgramOpKernel Compute"; - framework::PEAndGraphPair pe_and_graph; - // Step 1. prepare inputs, outputs, attrs - auto &input_vars = ctx.MultiInputVar("X"); - auto ¶m_vars = ctx.MultiInputVar("Params"); - auto output_vars = ctx.MultiOutputVar("Out"); - auto dout_vars = ctx.MultiOutputVar("DOut"); - - auto input_var_names = ctx.InputNames("X"); - auto output_var_names = ctx.OutputNames("Out"); - std::vector dout_var_names; - if (!dout_vars.empty()) { - // DOut is a dispensable out, only get the names when it exists. - // Otherwise, it will throw a NotFound error. - dout_var_names = ctx.OutputNames("DOut"); - } - - // current program may not hold parameters - std::vector param_names; - if (!param_vars.empty()) { - param_names = ctx.InputNames("Params"); - } - - auto start_op_index = ctx.Attr("start_op_index"); - auto end_op_index = ctx.Attr("end_op_index"); - auto program_id = ctx.Attr("program_id"); - - // NOTE(chenweihang): In order not to add new variable type, use vector - // here. Originally, here can use scope directly. - auto *out_scope_vec = ctx.Output("OutScope"); - std::unique_ptr inner_scope{nullptr}; - if (out_scope_vec->size() == 0) { - // For cuda graph under static graph mode usage. - // For static graph mode, we cannot set value of a tensor before any run, - // the OutScope variable passed to the op actually contains nothing. - // Just create a tmp scope to run the program. - PADDLE_ENFORCE_EQ( - use_cuda_graph, - true, - phi::errors::InvalidArgument( - "If not provide OutScope then must run under cuda graph mode.")); - inner_scope = std::make_unique(); - } else { - PADDLE_ENFORCE_EQ( - out_scope_vec->size(), - 1, - phi::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should only hold one scope.")); - } - - // Step 2. prepare executor and init persistable variables - - // NOTE(Aurelius84): While training some models, forward can be called many - // times and then apply backpropagation all at once, such as Reinforcement - // Learning. Tensor data in multi-step training should be saved into single - // scope separately. Otherwise, the gradients can be miscalculated because - // always using the Tensor data of the last step in forward. - framework::Scope *global_inner_scope = - out_scope_vec->size() == 0 ? 
inner_scope.get() : out_scope_vec->front(); - VLOG(2) << "The number of sub scopes before forward: " - << global_inner_scope->kids().size(); - framework::Scope &scope = global_inner_scope->NewScope(); - - // share input_vars & parameters into scope - details::ShareVarsIntoScope(input_vars, input_var_names, &scope); - details::ShareVarsIntoScope(param_vars, param_names, &scope); - - auto *global_block = ctx.Attr("global_block"); - - if (end_op_index > start_op_index) { - auto *program = global_block->Program(); - bool is_new_created; - if (use_cuda_graph) { - pe_and_graph = framework::CreateFixOrderExecutorInfo( - *program, ctx.GetPlace(), start_op_index, end_op_index, &scope); - is_new_created = true; - } else { - auto cache_info = framework::GetExecutorInfoFromCache(*program, - ctx.GetPlace(), - start_op_index, - end_op_index, - /*is_grad=*/false, - program_id, - &scope); - pe_and_graph.first = cache_info.first; - is_new_created = cache_info.second; - } - - auto ¶llel_executor = pe_and_graph.first; - - // all out_vars are skip_eager_var - std::vector tmp_vars; - auto &skip_eager_delete_vars = - use_cuda_graph - ? tmp_vars - : framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - program_id, false); - if (is_new_created) { - parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_var_names); - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - output_var_names.begin(), - output_var_names.end()); - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - dout_var_names.begin(), - dout_var_names.end()); - framework::details::ParseSafeEagerDeletionSkipVars( - *program, end_op_index, output_var_names, &skip_eager_delete_vars); - } - - // Step 3. run ops - parallel_executor->RunWithoutFetch(skip_eager_delete_vars); - } - // Step 4. Get Output - details::ShareVarsFromScope( - output_vars, output_var_names, *global_block, &scope); - details::ShareVarsFromScope( - dout_vars, dout_var_names, *global_block, &scope); - - // Debug info: scope info when run end - framework::Scope *target_scope{nullptr}; - if (out_scope_vec->size() == 0) { - target_scope = inner_scope.get(); - } else { - target_scope = out_scope_vec->front(); - } - VLOG(3) << framework::GenScopeTreeDebugInfo(target_scope); - // Step 5. Drop all children scopes while testing. - if (is_test) { - target_scope->DropKids(); - } - VLOG(2) << "The number of sub scopes after forward: " - << target_scope->kids().size(); -#ifdef PADDLE_WITH_DNNL - if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace()); -#endif - return pe_and_graph; - } }; template class RunProgramGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_THROW(phi::errors::InvalidArgument("Not supported yet!")); const auto &capture_mode = ctx.Attr("cuda_graph_capture_mode"); if (capture_mode.empty()) { - ComputeImpl(ctx, false); return; } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto mode = details::StringToCUDAGraphCaptureMode(capture_mode); PADDLE_ENFORCE_EQ( ctx.GetPlace().GetType() == phi::AllocationType::GPU, true, @@ -440,21 +281,6 @@ class RunProgramGradOpKernel : public framework::OpKernel { ->GetMutable>>()); const size_t graph_idx = 2; if (inner_graphs[graph_idx].get() == nullptr) { - framework::PEAndGraphPair pe_and_graph; - auto callable = - [this, &pe_and_graph](const framework::ExecutionContext &exe_ctx) { - pe_and_graph = ComputeImpl(exe_ctx, true); - }; - int64_t pool_id = inner_graphs[0].get() != nullptr - ? 
inner_graphs[0]->PoolID() - : inner_graphs[1]->PoolID(); - inner_graphs[graph_idx] = - CaptureCUDAGraph(callable, - ctx, - {framework::GradVarName("Out")}, - {framework::GradVarName("X")}, - mode, - pool_id); VLOG(10) << "Capture Backward CUDA Graph"; } else { ExecuteCUDAGraph(ctx, @@ -469,123 +295,6 @@ class RunProgramGradOpKernel : public framework::OpKernel { "valid when using NVIDIA GPU.")); #endif } - - private: - framework::PEAndGraphPair ComputeImpl(const framework::ExecutionContext &ctx, - bool use_cuda_graph) const { - VLOG(2) << "RunProgramGradOpKernel Compute"; - framework::PEAndGraphPair pe_and_graph; - // Step 1. prepare inputs and outputs - auto &output_grad_vars = ctx.MultiInputVar(framework::GradVarName("Out")); - auto input_grad_vars = ctx.MultiOutputVar(framework::GradVarName("X")); - auto param_grad_vars = ctx.MultiOutputVar(framework::GradVarName("Params")); - - // if all output vars are set to stop_gradient, grad op no need to executed - if (input_grad_vars.empty() && param_grad_vars.empty()) { - return pe_and_graph; - } - - auto output_grad_var_names = ctx.InputNames(framework::GradVarName("Out")); - // NOTE: after PR22939 [Add double grad] merged, the grad op maker's - // SetOutput will set to None if the input var stop_gradient=True, - // it will cause an NotFound error when ctx.OutputNames() is called - std::vector input_grad_var_names; - std::vector param_grad_names; - if (!input_grad_vars.empty()) { - input_grad_var_names = ctx.OutputNames(framework::GradVarName("X")); - } - if (!param_grad_vars.empty()) { - param_grad_names = ctx.OutputNames(framework::GradVarName("Params")); - } - - auto *block = ctx.Attr("global_block"); - auto orig_end_op_index = ctx.Attr("end_op_index"); - auto program_id = ctx.Attr("program_id"); - // NOTE: skip `shape` and `fill_constant` op created by - // fluid.backward.gradients, one forward output will generate one `shape` - // and `fill_constant` - int64_t start_op_index = orig_end_op_index + (output_grad_vars.size() * 2); - int64_t end_op_index = block->OpSize(); - - auto *out_scope_vec = ctx.Input("OutScope"); - PADDLE_ENFORCE_EQ( - out_scope_vec->size(), - 1, - phi::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should only hold one scope.")); - - framework::Scope *global_inner_scope = out_scope_vec->front(); - auto sub_scope_num = global_inner_scope->kids().size(); - VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; - PADDLE_ENFORCE_GT(sub_scope_num, - 0, - phi::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should hold at " - "least one sub scope.")); - - auto &scope = *(global_inner_scope->kids().front()); - auto *global_block = ctx.Attr("global_block"); - - if (end_op_index > start_op_index) { - // Step 2. prepare executor and scope - auto *program = global_block->Program(); - bool is_new_created; - if (use_cuda_graph) { - pe_and_graph = framework::CreateFixOrderExecutorInfo( - *program, ctx.GetPlace(), start_op_index, end_op_index, &scope); - is_new_created = true; - } else { - auto cache_info = framework::GetExecutorInfoFromCache(*program, - ctx.GetPlace(), - start_op_index, - end_op_index, - /*is_grad*/ true, - program_id, - &scope); - pe_and_graph.first = cache_info.first; - is_new_created = cache_info.second; - } - - auto ¶llel_executor = pe_and_graph.first; - std::vector tmp_vars; - auto &skip_eager_delete_vars = - use_cuda_graph - ? 
tmp_vars - : framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( - program_id, true); - if (is_new_created) { - parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, - output_grad_var_names); - - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - input_grad_var_names.begin(), - input_grad_var_names.end()); - framework::details::AppendSkipDeletionVars(param_grad_names, - &skip_eager_delete_vars); - } - - details::ShareVarsIntoScope( - output_grad_vars, output_grad_var_names, &scope); - // Debug info: scope info when run end - VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); - - // Step 3. run ops - parallel_executor->RunWithoutFetch( - /*skip_eager_delete_vars=*/skip_eager_delete_vars); - } - - // Step 4. get outputs - details::ShareVarsFromScope( - input_grad_vars, input_grad_var_names, *global_block, &scope); - details::ShareVarsFromScope( - param_grad_vars, param_grad_names, *global_block, &scope); - - // Step5. drop current scope - global_inner_scope->DeleteScope(&scope); - VLOG(2) << "The number of sub scopes after backward: " - << global_inner_scope->kids().size(); - return pe_and_graph; - } }; } // namespace operators diff --git a/paddle/fluid/pybind/compiled_program.cc b/paddle/fluid/pybind/compiled_program.cc index 7b3ac4a2467f0..fdc9b1c468a00 100644 --- a/paddle/fluid/pybind/compiled_program.cc +++ b/paddle/fluid/pybind/compiled_program.cc @@ -996,12 +996,6 @@ void BindCompiledProgram(pybind11::module &m) { // NOLINT optimization passes should be defined in this way. BuildStrategy cannot be updated after being finalized.)DOC"); - m.def("_set_cached_executor_build_strategy", - [](int64_t program_id, const BuildStrategy &build_strategy) { - auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); - cached_exe_info.SetBuildStrategy(program_id, build_strategy); - }); - cp.def(py::init &, const std::vector &, const std::string &, diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index adf5852aabb64..71c0699c7ca44 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -55,7 +55,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ae49f2594ce0a..64a7e212d8b1e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2955,7 +2955,6 @@ All parameter, weight, gradient are variables in Paddle. #endif // PADDLE_WITH_CUDA m.def("clear_executor_cache", []() { pybind11::gil_scoped_release release; - framework::ExecutorInfoCache::Instance().Finalize(); framework::InterpreterCoreInfoCache::Instance().Finalize(); }); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 098d7182f5a02..39cb9c645e537 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -55,7 +55,6 @@ limitations under the License. 
*/
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py
index f412a954c0bb0..fdf5e97e3452d 100644
--- a/python/paddle/base/core.py
+++ b/python/paddle/base/core.py
@@ -308,7 +308,6 @@ def to_list(s):
     _RecordEvent,
     _Scope,
     _set_amp_op_list,
-    _set_cached_executor_build_strategy,
     _set_current_stream,
     _set_eager_deletion_mode,
     _set_fuse_parameter_group_size,
diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py
index f4fc6ea387f97..1525071934805 100644
--- a/python/paddle/jit/dy2static/partial_program.py
+++ b/python/paddle/jit/dy2static/partial_program.py
@@ -442,9 +442,6 @@ def _train_pure_fp16_forward_backward_program(self):
     @LazyInitialized
     def _train_program_id(self):
         program_id = paddle.utils._hash_with_id(self._train_program, self)
-        core._set_cached_executor_build_strategy(
-            program_id, self._build_strategy
-        )
         return program_id

     @LazyInitialized
@@ -454,9 +451,6 @@ def _infer_program_id(self):
     @LazyInitialized
     def _train_amp_program_id(self):
         program_id = paddle.utils._hash_with_id(self._train_amp_program, self)
-        core._set_cached_executor_build_strategy(
-            program_id, self._build_strategy
-        )
         return program_id

     @LazyInitialized
@@ -468,9 +462,6 @@ def _train_pure_fp16_program_id(self):
         program_id = paddle.utils._hash_with_id(
             self._train_pure_fp16_program, self
         )
-        core._set_cached_executor_build_strategy(
-            program_id, self._build_strategy
-        )
         return program_id

     @LazyInitialized
diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py
index ff6ee46c8a1f9..cb05d818d3a86 100644
--- a/python/paddle/jit/dy2static/pir_partial_program.py
+++ b/python/paddle/jit/dy2static/pir_partial_program.py
@@ -789,9 +789,6 @@ def pass_fn(forward_program, backward_program):
     @cached_property
     def _train_program_id(self):
         program_id = paddle.utils._hash_with_id(self.train_program, self)
-        core._set_cached_executor_build_strategy(
-            program_id, self._build_strategy
-        )
         return program_id

     @cached_property
diff --git a/test/deprecated/legacy_test/CMakeLists.txt b/test/deprecated/legacy_test/CMakeLists.txt
index 61046057f7c7a..8eab1efdf26a3 100644
--- a/test/deprecated/legacy_test/CMakeLists.txt
+++ b/test/deprecated/legacy_test/CMakeLists.txt
@@ -405,7 +405,6 @@ list(REMOVE_ITEM TEST_OPS test_imperative_mnist_sorted_gradient)
 list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
 list(REMOVE_ITEM TEST_OPS test_layers_deprecated)
 list(REMOVE_ITEM TEST_OPS test_imperative_ocr_attention_model)
-list(REMOVE_ITEM TEST_OPS test_async_ssa_graph_executor_mnist)
 list(REMOVE_ITEM TEST_OPS test_install_check)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt
index 98b741742bad8..9e1b92ab17bcf 100644
--- a/test/legacy_test/CMakeLists.txt
+++ b/test/legacy_test/CMakeLists.txt
@@ -427,7 +427,6 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet_sorted_gradient)
 list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext)
-list(REMOVE_ITEM TEST_OPS test_async_ssa_graph_executor_mnist)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_api)
 list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh
index 5d259e101b56d..7e66016dfec1b 100644
--- a/tools/windows/run_unittests.sh
+++ b/tools/windows/run_unittests.sh
@@ -291,8 +291,8 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\
 ^test_trt_convert_unsqueeze2$|\
 ^test_simplify_with_basic_ops_pass_autoscan$|\
 ^test_trt_convert_nearest_interp$|\
-^test_trt_pool_op$|\
-^test_trt_convert_clip$|\
+^test_trt_pool_op$|\
+^test_trt_convert_clip$|\
 ^test_trt_convert_grid_sampler$|\
 ^test_trt_convert_p_norm$|\
 ^disable_wingpu_cuda12_test$"
@@ -427,6 +427,7 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\
 ^disable_win_inference_test$|\
 ^test_imperative_double_grad$|\
 ^test_comp_eager_matmul_double_grad$|\
+^test_cuda_graph_partial_graph_static_run$|\
 ^test_imperative_triple_grad$"
@@ -544,9 +545,9 @@ if [ ${WITH_GPU:-OFF} == "ON" ];then
     if [ ${PRECISION_TEST:-OFF} == "ON" ]; then
         python ${PADDLE_ROOT}/tools/get_pr_ut.py || echo "Failed to obtain ut_list !"
     fi
-
+
     python ${PADDLE_ROOT}/tools/group_case_for_parallel.py ${PADDLE_ROOT}
-
+
 fi
 failed_test_lists=''
@@ -715,7 +716,7 @@ if [ "${WITH_GPU:-OFF}" == "ON" ];then
     done < $PADDLE_ROOT/tools/single_card_tests_new
     single_ut_endTime_s=`date +%s`
     single_ut_Time_s=`expr $single_ut_endTime_s - $single_ut_startTime_s`
-    echo "ipipe_log_param_1_TestCases_Total_Time: $single_ut_Time_s s"
+    echo "ipipe_log_param_1_TestCases_Total_Time: $single_ut_Time_s s"
     multiple_ut_mem_0_startTime_s=`date +%s`
     while read line
@@ -724,8 +725,8 @@ if [ "${WITH_GPU:-OFF}" == "ON" ];then
     done < $PADDLE_ROOT/tools/multiple_card_tests_mem0_new
     multiple_ut_mem_0_endTime_s=`date +%s`
     multiple_ut_mem_0_Time_s=`expr $multiple_ut_mem_0_endTime_s - $multiple_ut_mem_0_startTime_s`
-    echo "ipipe_log_param_2_mem0_TestCases_Total_Time: $multiple_ut_mem_0_Time_s s"
-
+    echo "ipipe_log_param_2_mem0_TestCases_Total_Time: $multiple_ut_mem_0_Time_s s"
+
     multiple_ut_startTime_s=`date +%s`
     while read line
     do
@@ -749,7 +750,7 @@ if [ "${WITH_GPU:-OFF}" == "ON" ];then
     done < $PADDLE_ROOT/tools/exclusive_card_tests_mem0_new
     exclusive_ut_mem_0_endTime_s=`date +%s`
     exclusive_ut_mem_0_Time_s=`expr $exclusive_ut_mem_0_endTime_s - $exclusive_ut_mem_0_startTime_s`
-    echo "ipipe_log_param_-1_mem0_TestCases_Total_Time: $exclusive_ut_mem_0_Time_s s"
+    echo "ipipe_log_param_-1_mem0_TestCases_Total_Time: $exclusive_ut_mem_0_Time_s s"
     exclusive_ut_startTime_s=`date +%s`
     while read line