MLPerf Optimization for Release/2.2 (#37109)
* add mlperf optimization PRs

* update
sneaxiy authored Nov 15, 2021
1 parent 70cb0a5 commit 287ca7d
Showing 68 changed files with 6,480 additions and 561 deletions.
2 changes: 1 addition & 1 deletion cmake/operators.cmake
@@ -218,7 +218,7 @@ function(op_library TARGET)
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "sparse_attention_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
"fused_bn_add_activation_op" "fused_attention_op" "fused_feedforward_op")
"fused_bn_add_activation_op" "fused_attention_op" "fused_feedforward_op" "resnet_unit_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
2 changes: 2 additions & 0 deletions paddle/fluid/framework/details/build_strategy.h
@@ -143,6 +143,8 @@ struct BuildStrategy {
// Turn off inplace addto by default.
bool enable_addto_{false};

bool allow_cuda_graph_capture_{false};

// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model.
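As a hedged illustration (not part of the diff): `allow_cuda_graph_capture_` is a plain public field of `BuildStrategy`, defaulting to false, so a caller would presumably opt in roughly like this before handing the strategy to an executor.

```cpp
#include "paddle/fluid/framework/details/build_strategy.h"

// Hypothetical sketch only: enable CUDA graph capture on a BuildStrategy
// instance. Both fields shown here are public members of the struct above;
// allow_cuda_graph_capture_ is the one added by this commit.
paddle::framework::details::BuildStrategy MakeCaptureStrategy() {
  paddle::framework::details::BuildStrategy strategy;
  strategy.enable_addto_ = true;              // existing inplace-addto switch
  strategy.allow_cuda_graph_capture_ = true;  // new flag, defaults to false
  return strategy;
}
```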
@@ -130,10 +130,12 @@ FetchResultType FastThreadedSSAGraphExecutor::Run(
}
}
// Wait FetchOps.
ClearFetchOp(graph_, &fetch_ops);
if (!fetch_ops.empty()) {
ClearFetchOp(graph_, &fetch_ops);

for (auto &place : places_) {
fetch_ctxs_.Get(place)->Wait();
for (auto &place : places_) {
fetch_ctxs_.Get(place)->Wait();
}
}

return fetches;
19 changes: 14 additions & 5 deletions paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -86,19 +86,28 @@ struct ScaleLossGradFunctor {
}
};

std::string ScaleLossGradOpHandle::LossGradName() const {
return static_cast<VarHandle *>(this->outputs_[0])->name();
}

void ScaleLossGradOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
// Doesn't wait any event
std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name();
RunOnVar(local_exec_scopes_[0]->FindVar(LossGradName()), true);
}

auto *tensor =
local_exec_scopes_[0]->FindVar(var_name)->GetMutable<LoDTensor>();
void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) {
auto *tensor = var->GetMutable<LoDTensor>();
tensor->Resize(make_ddim({1}));

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
this->dev_ctxes_.at(place_));
this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
if (record_event) {
this->RunAndRecordEvent(
[&] { framework::VisitDataType(out_dtype_, func); });
} else {
framework::VisitDataType(out_dtype_, func);
}
#else
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr);
framework::VisitDataType(out_dtype_, func);
6 changes: 6 additions & 0 deletions paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -46,6 +46,12 @@ struct ScaleLossGradOpHandle : public OpHandleBase {

std::string Name() const override;

platform::Place GetPlace() const { return place_; }

void RunOnVar(Variable *var, bool record_event = false);

std::string LossGradName() const;

protected:
void RunImpl() override;

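A minimal sketch of how the new accessors might be driven from outside `RunImpl()` (the helper function below is hypothetical; only `RunOnVar`, `LossGradName`, and `Scope::FindVar` are assumed from the headers):

```cpp
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/scope.h"

// Hypothetical helper: scale the loss-gradient variable held in `scope`
// directly. Omitting the second argument uses the new default
// record_event = false, so no CUDA event is recorded.
void ScaleLossGradInScope(
    paddle::framework::details::ScaleLossGradOpHandle *handle,
    paddle::framework::Scope *scope) {
  auto *var = scope->FindVar(handle->LossGradName());
  handle->RunOnVar(var);
}
```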
53 changes: 31 additions & 22 deletions paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -22,7 +22,9 @@
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace framework {
namespace details {
@@ -49,8 +51,29 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
PrepareLocalExeScopes();
}

static void RunProgramDescs(const ProgramDescs &programs,
const std::vector<Scope *> &local_exec_scopes,
const std::vector<platform::Place> &places) {
for (auto &program : programs) {
for (auto &op_desc : program.Block(0).AllOps()) {
for (size_t i = 0; i < local_exec_scopes.size(); ++i) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_exec_scopes[i], places[i]);
}
}
}
}

FetchResultType ScopeBufferedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors, bool return_merged) {
#ifdef PADDLE_WITH_CUDA
if (platform::IsCUDAGraphCapturing()) {
strategy_.num_iteration_per_drop_scope_ =
std::numeric_limits<size_t>::max();
DropLocalExeScopes(/*need_wait=*/false);
}
#endif

if (drop_scope_counter_ == 0) {
platform::RecordEvent e("InitLocalVars");
InitVariables();
@@ -84,7 +107,7 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run(
++drop_scope_counter_;
if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ ||
DropScopeOrNot()) {
DropLocalExeScopes();
DropLocalExeScopes(!platform::IsCUDAGraphCapturing());
}

if (VLOG_IS_ON(5)) {
@@ -128,39 +151,25 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() {
if (graph.Has(details::kStartupProgramDescs)) {
auto &program_descs =
graph.Get<details::ProgramDescs>(details::kStartupProgramDescs);

for (auto &program_desc : program_descs) {
for (auto &op_desc : program_desc.Block(0).AllOps()) {
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_exec_scopes_[i], places_[i]);
}
}
}
RunProgramDescs(program_descs, local_exec_scopes_, places_);
}
is_initialized_ = true;
}

if (graph.Has(details::kProgramDescs)) {
auto &program_descs =
graph.Get<details::ProgramDescs>(details::kProgramDescs);

for (auto &program_desc : program_descs) {
for (auto &op_desc : program_desc.Block(0).AllOps()) {
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_exec_scopes_[i], places_[i]);
}
}
}
RunProgramDescs(program_descs, local_exec_scopes_, places_);
}
}

void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) {
platform::RecordEvent drop_scope_event("DropLocalExeScopes");
drop_scope_counter_ = 0;
for (auto &p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
if (need_wait) {
for (auto &p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
}
}
scope_monitor_.ClearHistoryLocalExecScopes();
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
@@ -53,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
FetchResultType Run(const std::vector<std::string>& fetch_tensors,
bool return_merged) override;

void DropLocalExeScopes();
void DropLocalExeScopes(bool need_wait = true);

bool NeedCreateLocalExeScope();

1 change: 1 addition & 0 deletions paddle/fluid/framework/distributed_strategy.proto
@@ -115,6 +115,7 @@ message BuildStrategy {
optional bool enable_auto_fusion = 11 [ default = false ];
optional bool enable_addto = 12 [ default = false ];
optional bool fix_op_run_order = 13 [ default = false ];
optional bool allow_cuda_graph_capture = 14 [ default = false ];
}

message ExecutionStrategy {
@@ -179,7 +179,8 @@ void InplaceAddToOpPass::Run(Graph *graph) const {
out_var_ptr->GeneratedOp());

// NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy
if (right_generated_op->Name() != "conv2d_grad") {
if (right_generated_op->Name() != "conv2d_grad" &&
right_generated_op->Name() != "resnet_unit_grad") {
continue;
}

@@ -224,11 +225,13 @@ static bool IsValidConv2DGradDataGradNode(const Node &node) {
if (node.inputs.empty()) return false;
auto *generated_op = node.inputs[0];
auto *op_desc = generated_op->Op();
if (op_desc == nullptr || op_desc->Type() != "conv2d_grad") {
if (op_desc == nullptr || (op_desc->Type() != "conv2d_grad" &&
op_desc->Type() != "resnet_unit_grad")) {
return false;
}
const auto &outputs = op_desc->Outputs();
auto iter = outputs.find(GradVarName("Input"));
std::string grad_var_name = op_desc->Type() == "conv2d_grad" ? "Input" : "X";
auto iter = outputs.find(GradVarName(grad_var_name));
return iter != outputs.end() && !iter->second.empty() &&
iter->second[0] == node.Name() &&
!op_desc->GetAttrIfExists<bool>("use_addto");
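For context, a small self-contained sketch of the gradient-output key the updated check looks up (assuming, as elsewhere in Paddle, that `GradVarName()` simply appends the "@GRAD" suffix):

```cpp
#include <string>

// Hypothetical illustration: conv2d_grad names its input gradient after the
// "Input" slot, while resnet_unit_grad uses "X". GradVarName(slot) would
// yield the keys shown in the comments below.
std::string InputGradOutputKey(const std::string &op_type) {
  const std::string slot = (op_type == "conv2d_grad") ? "Input" : "X";
  return slot + "@GRAD";
}
// InputGradOutputKey("conv2d_grad")      == "Input@GRAD"
// InputGradOutputKey("resnet_unit_grad") == "X@GRAD"
```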
@@ -1,4 +1,4 @@
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view multi_devices_helper)

cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper)
cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper)
@@ -14,21 +14,31 @@

#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h"

namespace paddle {
namespace framework {
namespace ir {

template <typename T>
static bool IsMatchedPlaceSingleDeviceOp(details::OpHandleBase *op_base,
const platform::Place &place) {
auto *op = dynamic_cast<T *>(op_base);
return op && op->GetPlace() == place;
}

static bool IsLockAndRecordEventFreeComputationOpHandle(
details::ComputationOpHandle *op, const OpGraphView &graph_view) {
if (!platform::is_gpu_place(op->GetPlace()) &&
!platform::is_xpu_place(op->GetPlace()))
return false;
for (auto &pending_op : graph_view.PendingOps(op)) {
auto *tmp = dynamic_cast<details::ComputationOpHandle *>(pending_op);
if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) {
if (!IsMatchedPlaceSingleDeviceOp<details::ComputationOpHandle>(
pending_op, op->GetPlace()) &&
!IsMatchedPlaceSingleDeviceOp<details::ScaleLossGradOpHandle>(
pending_op, op->GetPlace())) {
return false;
}
}
2 changes: 2 additions & 0 deletions paddle/fluid/framework/operator_kernel_configs.h
@@ -15,8 +15,10 @@ limitations under the License. */
#pragma once

#include <algorithm>
#include <mutex>
#include <unordered_map>
#include <vector>
#include "glog/logging.h"

namespace paddle {
namespace framework {