
onednn: remove fc_elementwise_add fusion #55504

Merged: 3 commits, Jul 24, 2023
1 change: 0 additions & 1 deletion paddle/fluid/framework/ir/CMakeLists.txt
@@ -173,7 +173,6 @@ if(WITH_MKLDNN)
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn)
pass_library(params_quantization_mkldnn_pass inference DIR mkldnn)
pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
Contributor:
If this pass is removed, will the performance of some models be affected?

Contributor Author:
Some will be impacted. A simple case is linear -> add: the add won't be fused into the linear, which causes extra computation. However, the current post-sum fusion requires in-place memory, which conflicts with the memory-reuse optimization and produces wrong accuracy. We'll re-enable the fusion later using post-binary fusion, which doesn't require in-place memory, but that needs more validation work.
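
For illustration, here is a minimal standalone sketch (not Paddle's kernel code; it assumes the oneDNN v3.x C++ API and made-up tensor shapes) contrasting the two approaches: `append_sum` accumulates into the destination buffer, so the output tensor must alias the residual (the `ShareDataWith` calls removed below), while `append_binary` takes the residual as a separate execution argument and needs no aliasing:

```cpp
// Minimal sketch contrasting post-sum and post-binary fusion for
// FC + elementwise_add. Not Paddle's kernel code; assumes oneDNN v3.x
// and arbitrary example shapes.
#include <unordered_map>

#include "dnnl.hpp"

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  const memory::dim M = 2, K = 3, N = 4;
  memory::desc src_md({M, K}, memory::data_type::f32, memory::format_tag::ab);
  memory::desc wei_md({N, K}, memory::data_type::f32, memory::format_tag::ab);
  memory::desc dst_md({M, N}, memory::data_type::f32, memory::format_tag::ab);

  // (a) Post-sum: dst = fc(src) + 1.0f * dst. The residual must already
  // live in the destination buffer, i.e. the output has to alias the
  // residual tensor -- the in-place requirement that conflicts with
  // memory reuse. Shown only for contrast.
  primitive_attr sum_attr;
  {
    post_ops ops;
    ops.append_sum(1.0f);
    sum_attr.set_post_ops(ops);
  }
  (void)sum_attr;  // not used below

  // (b) Post-binary: dst = fc(src) + residual, residual in its own buffer.
  primitive_attr bin_attr;
  {
    post_ops ops;
    ops.append_binary(algorithm::binary_add, dst_md);
    bin_attr.set_post_ops(ops);
  }

  // Build the FC (inner product) primitive with the post-binary attribute.
  auto pd = inner_product_forward::primitive_desc(
      eng, prop_kind::forward_inference, src_md, wei_md, dst_md, bin_attr);
  auto fc = inner_product_forward(pd);

  memory src_mem(src_md, eng), wei_mem(wei_md, eng), dst_mem(dst_md, eng);
  memory residual_mem(dst_md, eng);  // independent buffer, no aliasing

  // The residual is bound as an explicit post-op argument at execution.
  fc.execute(strm, {{DNNL_ARG_SRC, src_mem},
                    {DNNL_ARG_WEIGHTS, wei_mem},
                    {DNNL_ARG_DST, dst_mem},
                    {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1,
                     residual_mem}});
  strm.wait();
  return 0;
}
```

The extra `DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1` argument is what lets the residual keep its own buffer, which is why the binary variant should compose with memory reuse.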

pass_library(scale_matmul_fuse_pass inference DIR mkldnn)
pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn)
pass_library(cpu_bfloat16_pass inference DIR mkldnn)

This file was deleted.

This file was deleted.

7 changes: 1 addition & 6 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -365,7 +365,6 @@ void CpuPassStrategy::EnableMKLDNN() {
// Disabled due to topology-dependent speed-up
"fc_mkldnn_pass",
"fc_act_mkldnn_fuse_pass",
"fc_elementwise_add_mkldnn_fuse_pass", //
"self_attention_fuse_pass", //
"batch_norm_act_fuse_pass", //
"softplus_activation_onednn_fuse_pass", //
@@ -400,7 +399,6 @@ void CpuPassStrategy::EnableMkldnnBfloat16() {
if (!use_mkldnn_bfloat16_) {
passes_.push_back("fc_mkldnn_pass");
passes_.push_back("fc_act_mkldnn_fuse_pass");
passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass");

passes_.push_back("cpu_bfloat16_placement_pass");
passes_.push_back("cpu_bfloat16_pass");
@@ -456,7 +454,6 @@ void CpuPassStrategy::EnableMkldnnInt8() {
passes_.push_back("repeated_fc_relu_fuse_pass");
passes_.push_back("fc_mkldnn_pass");
passes_.push_back("fc_act_mkldnn_fuse_pass");
passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass");
passes_.push_back("matmul_transpose_reshape_mkldnn_fuse_pass");
passes_.push_back("batch_norm_act_fuse_pass");
passes_.push_back("softplus_activation_onednn_fuse_pass");
@@ -491,9 +488,7 @@ void CpuPassStrategy::DisableMkldnnFcPasses() {

void CpuPassStrategy::EraseFcMkldnnPasses() {
std::vector<std::string> fc_passes_to_erase(
{"fc_mkldnn_pass",
"fc_act_mkldnn_fuse_pass",
"fc_elementwise_add_mkldnn_fuse_pass"});
{"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"});
for (const auto &pass : fc_passes_to_erase) {
int idx = GetPassIndex(pass);
if (idx != -1) {
35 changes: 2 additions & 33 deletions paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -46,9 +46,6 @@ GetDNNLScales(const ExecutionContext& ctx) {
auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_out = ctx.Attr<float>("Scale_out");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
auto scale_in_eltwise_data = ctx.HasAttr("Scale_in_eltwise")
? ctx.Attr<float>("Scale_in_eltwise")
: 1.0f;

std::vector<float> dnnl_src_scales = {1.f / scale_in_data};
size_t count = scale_weights_data.size();
@@ -57,7 +54,7 @@ GetDNNLScales(const ExecutionContext& ctx) {
for (size_t i = 0; i < count; i++) {
dnnl_wei_scales[i] = 1.f / scale_weights_data[i];
}
std::vector<float> dnnl_psum_scales = {1.f / scale_in_eltwise_data};
std::vector<float> dnnl_psum_scales = {1.f};
std::vector<float> dnnl_dst_scales = {1.f / scale_out};

return std::make_tuple(
@@ -127,7 +124,6 @@ class FCMKLDNNHandler
dnnl::primitive_attr attributes;
dnnl::post_ops post_operations;

float sum_scale = 1.0f;
float activation_scale = 1.0f;
if (phi::funcs::is_int8<T_w>()) {
std::vector<float> src_scales, wei_scales, psum_scales, dst_scales;
@@ -168,13 +164,6 @@ class FCMKLDNNHandler
dst_scales.data(),
dst_scales.size() * sizeof(float));
}

sum_scale = psum_scales[0];
}

if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
post_operations.append_sum(sum_scale);
}

// ReLU from "fc_fuse_pass"
@@ -332,22 +321,6 @@ class FCMKLDNNHandler

std::shared_ptr<dnnl::memory> AcquireCustomDstMemory(
const ExecutionContext& ctx, phi::DenseTensor* out) {
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
auto* residual_param = ctx.Input<phi::DenseTensor>("ResidualData");

PADDLE_ENFORCE_EQ(
out->dims(),
residual_param->dims(),
phi::errors::InvalidArgument(
"Output and elementwise parameter need to have the "
"same dimension sizes, but got output's dimension = %d"
" and residual param's dimension =%d .",
out->dims().size(),
residual_param->dims().size()));

out->ShareDataWith(*residual_param);
}
return this->template AcquireDstMemory<T_out>(out);
} // namespace operators

@@ -458,11 +431,7 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {

dst_memory_p =
std::make_shared<dnnl::memory>(inner_product_cache->dst_mem);
if (ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection")) {
auto* residual_param = ctx.Input<phi::DenseTensor>("ResidualData");
out->ShareDataWith(*residual_param);
}

auto out_ptr = out->mutable_data<T_out>(
ctx.GetPlace(), dst_memory_p->get_desc().get_size());
dst_memory_p->set_data_handle(out_ptr);
4 changes: 1 addition & 3 deletions test/cpp/inference/api/analysis_predictor_tester.cc
@@ -370,9 +370,7 @@ TEST(AnalysisPredictor, mkldnn_fc_passes_cpu_pass_strategy) {
CpuPassStrategy cpuPassStrategy;
cpuPassStrategy.EnableMKLDNN();
const std::vector<std::string> fc_passes_to_erase(
{"fc_mkldnn_pass",
"fc_act_mkldnn_fuse_pass",
"fc_elementwise_add_mkldnn_fuse_pass"});
{"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"});
for (const auto& pass : fc_passes_to_erase) {
ASSERT_NE(cpuPassStrategy.GetPassIndex(pass), (size_t)-1);
}
2 changes: 1 addition & 1 deletion test/cpp/inference/api/analyzer_ernie_int8_tester.cc
@@ -34,7 +34,7 @@ void SetInt8Config(AnalysisConfig *cfg,
pass_builder->DeletePass("constant_folding_pass");
auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(data);
cfg->mkldnn_quantizer_config()->SetEnabledOpTypes(
{"elementwise_add", "matmul", "matmul_v2", "fused_matmul"});
{"matmul", "matmul_v2", "fused_matmul"});
// Exclusion of several matmuls that should not be quantized because they
// reduce the accuracy of the model
cfg->mkldnn_quantizer_config()->SetExcludedOpIds(
1 change: 0 additions & 1 deletion test/cpp/inference/api/analyzer_vit_ocr_tester.cc
@@ -104,7 +104,6 @@ TEST(Analyzer_vit_ocr, fuse_status) {

CHECK_EQ(fuse_statis.at("fc_mkldnn_pass"), 33);
CHECK_EQ(fuse_statis.at("fused_conv2d_gelu_mkldnn_fuse_pass"), 2);
CHECK_EQ(fuse_statis.at("fc_elementwise_add_mkldnn_fuse"), 16);
}
#endif

2 changes: 0 additions & 2 deletions test/ir/inference/CMakeLists.txt
@@ -302,8 +302,6 @@ if(WITH_GPU AND TENSORRT_FOUND)
set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300)
set_tests_properties(test_onednn_fc_activation_fuse_pass PROPERTIES TIMEOUT
300)
set_tests_properties(test_onednn_fc_elementwise_add_fuse_pass
PROPERTIES TIMEOUT 120)
set_tests_properties(test_mkldnn_conv_affine_channel_fuse_pass
PROPERTIES TIMEOUT 60)
endif()