Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Fix bug] Make the program interpreter compatible with RunWithExternalStream #63350

Merged
merged 4 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions paddle/fluid/framework/new_executor/interpreter_base_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,9 @@ class InterpreterBaseImpl {

virtual bool IsSharedResultsBuild() const = 0;

virtual void Build(
const std::vector<std::string>& feed_names,
std::vector<paddle::framework::OpFuncNode>* op_func_nodes) = 0;
virtual void Build(const std::vector<std::string>& feed_names,
std::vector<paddle::framework::OpFuncNode>* op_func_nodes,
bool switch_stream = false) = 0;

virtual bool IsStaticBuild() const = 0;

Expand Down
3 changes: 2 additions & 1 deletion paddle/fluid/framework/new_executor/pir_interpreter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1927,7 +1927,8 @@ Variable* PirInterpreter::DebugVar(const std::string& name) const {

void PirInterpreter::Build(
const std::vector<std::string>& feed_names,
std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
std::vector<paddle::framework::OpFuncNode>* op_func_nodes,
bool switch_stream) {
PADDLE_THROW(platform::errors::Unimplemented(
"Build is not implemented in PirInterpreter."));
}
Expand Down
6 changes: 3 additions & 3 deletions paddle/fluid/framework/new_executor/pir_interpreter.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,9 @@ class PirInterpreter : public InterpreterBaseImpl {
void CheckCUDAGraphBeforeRun(const std::vector<std::string>& feed_names);
void PrepareForCUDAGraphCapture();

void Build(
const std::vector<std::string>& feed_names,
std::vector<paddle::framework::OpFuncNode>* op_func_nodes) override;
void Build(const std::vector<std::string>& feed_names,
std::vector<paddle::framework::OpFuncNode>* op_func_nodes,
bool switch_stream = false) override;

bool IsStaticBuild() const override { return static_build_; }

Expand Down
87 changes: 41 additions & 46 deletions paddle/fluid/framework/new_executor/program_interpreter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,
is_in_op_profiling_mode_ = enable_op_profiling;

std::vector<paddle::framework::OpFuncNode> op_func_nodes;
Build(feed_names, &op_func_nodes);
Build(feed_names, &op_func_nodes, switch_stream);

if (!is_build_) {
SetFeedVarsInplaceSkip(feed_names);
Expand All @@ -166,7 +166,7 @@ FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (switch_stream) {
BuildOpFuncNode(&op_func_nodes);
Convert(&op_func_nodes);
}
#endif
RunImpl();
Expand Down Expand Up @@ -208,15 +208,16 @@ FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,

void ProgramInterpreter::Build(
const std::vector<std::string>& feed_names,
std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
std::vector<paddle::framework::OpFuncNode>* op_func_nodes,
bool switch_stream) {
SetDeviceId(place_);
CheckCUDAGraphBeforeRun(feed_names);

#ifdef PADDLE_WITH_DNNL
platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif

if (!is_build_) {
if (!is_build_ || switch_stream) {
LOG_FIRST_N(INFO, 1) << "New Executor is Running.";
paddle::framework::interpreter::BuildVariableScope(
block_, execution_config_, &var_scope_);
Expand Down Expand Up @@ -678,7 +679,42 @@ std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
void ProgramInterpreter::Convert(
std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
auto& vec_meta_info = var_scope_.MutableVecMetaInfo();
BuildOpFuncNode(op_func_nodes);
auto nodes = *op_func_nodes;
auto op_nums = nodes.size();
vec_instruction_.clear();
vec_instruction_.reserve(op_nums);
for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
auto& op_func_node = nodes[op_idx];
stream_analyzer_.SetForceEventsToWaitInfo(force_events_to_wait_);
auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_new_executor_use_cuda_graph) {
auto& op = op_func_node.operator_base_;
auto& op_type = op->Type();
if (op_type == interpreter::kMemcpyD2H ||
op_type == interpreter::kMemcpyH2D) {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Cuda memory copy d2h/h2d is not allowed while using cuda graph."));
}
PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext),
true,
platform::errors::InvalidArgument(
"Device context of op %s must be [%s] while using "
"cuda graph, but got [%s].",
op_type,
typeid(phi::GPUContext).name(),
typeid(*dev_ctx_).name()));
// cuda graph needs to record all stream
phi::backends::gpu::CUDAGraphContextManager::Instance()
.RecordCapturingDeviceContext(dev_ctx_);
}
#endif
vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
vec_instruction_.back().UpdateRecordStreamForGcInfo();
#endif
}

BuildOperatorDependences();

Expand Down Expand Up @@ -715,7 +751,6 @@ void ProgramInterpreter::Convert(
}

// calculate last_live_ops_
auto op_nums = (*op_func_nodes).size();
for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
Instruction& instr = vec_instruction_[op_idx];
OpInOutInfo info;
Expand Down Expand Up @@ -852,46 +887,6 @@ void ProgramInterpreter::Convert(
AnalyseExecuteOrderForTrace();
}

// Rebuilds vec_instruction_ from the given op-func nodes: each node is
// assigned a device context by stream_analyzer_ and wrapped into an
// Instruction. Under CUDA-graph capture it additionally validates that no
// H2D/D2H memcpy op is present and that every op runs on a phi::GPUContext.
//
// @param op_func_nodes  Source nodes. NOTE(review): `auto nodes =
//                       *op_func_nodes;` below takes a full copy and the
//                       instructions move from that copy, leaving the
//                       caller's vector untouched — confirm `auto&` was not
//                       intended.
void ProgramInterpreter::BuildOpFuncNode(
    std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
  // NOTE(review): whole-vector copy; std::move below drains the copy, not
  // the caller's elements. Verify the copy is intentional.
  auto nodes = *op_func_nodes;
  auto op_nums = nodes.size();
  // Discard any previously built instruction list and rebuild from scratch.
  vec_instruction_.clear();
  vec_instruction_.reserve(op_nums);
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& op_func_node = nodes[op_idx];
    stream_analyzer_.SetForceEventsToWaitInfo(force_events_to_wait_);
    // Choose the device context (and thus the stream) this op executes on.
    auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (FLAGS_new_executor_use_cuda_graph) {
      auto& op = op_func_node.operator_base_;
      auto& op_type = op->Type();
      // Host<->device memcpy ops cannot appear inside a CUDA graph capture.
      if (op_type == interpreter::kMemcpyD2H ||
          op_type == interpreter::kMemcpyH2D) {
        PADDLE_THROW(paddle::platform::errors::Fatal(
            "Cuda memory copy d2h/h2d is not allowed while using cuda graph."));
      }
      // Every captured op must run on a GPU context.
      PADDLE_ENFORCE_EQ(typeid(*dev_ctx_) == typeid(phi::GPUContext),
                        true,
                        platform::errors::InvalidArgument(
                            "Device context of op %s must be [%s] while using "
                            "cuda graph, but got [%s].",
                            op_type,
                            typeid(phi::GPUContext).name(),
                            typeid(*dev_ctx_).name()));
      // cuda graph needs to record all stream
      phi::backends::gpu::CUDAGraphContextManager::Instance()
          .RecordCapturingDeviceContext(dev_ctx_);
    }
#endif
    // Instruction takes ownership of the node (moved from the local copy).
    vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    vec_instruction_.back().UpdateRecordStreamForGcInfo();
#endif
  }
}

void ProgramInterpreter::BuildSkipShareLoDInfo() {
for (size_t i = 0; i < vec_instruction_.size(); ++i) {
bool can_skip_lod = true;
Expand Down
8 changes: 3 additions & 5 deletions paddle/fluid/framework/new_executor/program_interpreter.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ class ProgramInterpreter : public InterpreterBaseImpl {

std::shared_ptr<ProgramDesc> GetMutableCopyProgram() override;

void Build(
const std::vector<std::string>& feed_names,
std::vector<paddle::framework::OpFuncNode>* op_func_nodes) override;
void Build(const std::vector<std::string>& feed_names,
std::vector<paddle::framework::OpFuncNode>* op_func_nodes,
bool switch_stream = false) override;

void ShareWorkQueueFrom(InterpreterBaseImpl* src) override;

Expand Down Expand Up @@ -131,8 +131,6 @@ class ProgramInterpreter : public InterpreterBaseImpl {
void BuildSkipShareLoDInfo();
void UpdateSyncOpNum();
void AnalyseExecuteOrderForTrace();
void BuildOpFuncNode(
std::vector<paddle::framework::OpFuncNode>* op_func_nodes);

// inplace
void BuildInplace();
Expand Down