From 9a62e9138051c6ae9cecb3ef22a557a73c04323a Mon Sep 17 00:00:00 2001
From: aooxin
Date: Thu, 18 Jul 2024 12:49:37 +0000
Subject: [PATCH 01/13] register for auto_mixed_precision_pass.cc

---
 paddle/fluid/inference/api/analysis_predictor.cc         | 6 +++++-
 paddle/fluid/inference/api/paddle_pass_builder.cc        | 1 +
 .../pir/transforms/general/auto_mixed_precision_pass.cc  | 3 +++
 paddle/fluid/pir/transforms/passes.h                     | 1 +
 4 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 65a9e9a1eafbd..4c5fc5d05206d 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -912,6 +912,10 @@ void AnalysisPredictor::OptimizeInferencePirProgram() {
     // set attr
     for (const auto &pass : pass_pm.passes()) {
       pass->SetNotOwned(pir::Pass::kParamScopeAttr, sub_scope_);
+      pass->SetNotOwned(pir::Pass::kPlaceAttr, &place_);
+      if (pass->name() == "auto_mixed_precision_pass") {
+        pass->Set("__mixed_precision_mode__", new phi::DataType(paddle::ConvertPrecision(config_.mixed_precision_mode_)));
+      }
       if (pass->name() == "matmul_add_act_fuse_pass" ||
           pass->name() == "conv2d_add_act_fuse_pass" ||
           pass->name() == "conv2d_add_fuse_pass") {
@@ -2200,7 +2204,7 @@ void AnalysisPredictor::PrepareArgument() {
       pass_builder->AppendPass("is_test_pass");
       pass_builder->AppendPass("constant_folding_pass");
     }
-    pass_builder->AppendPass("auto_mixed_precision_pass");
+    // pass_builder->AppendPass("auto_mixed_precision_pass");
     if (!config_.new_ir_enabled()) {
       pass_builder->AppendPass("inplace_op_var_pass");
     }
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 5ebbff362b80a..feaa3c3eedb58 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -615,6 +615,7 @@ const std::vector<std::string> kPirGpuPasses{
     "transpose_flatten_concat_fuse_pass",
     "remove_redundant_transpose_pass",
     "transfer_layout_pass",
+    "auto_mixed_precision_pass",
 };
 
 const std::vector<std::string> kPirXpuPasses{
diff --git a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
index 001988f20d5fc..ffe5c972094c8 100644
--- a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
@@ -50,6 +50,7 @@
 #include "paddle/pir/include/pattern_rewrite/frozen_rewrite_pattern_set.h"
 #include "paddle/pir/include/pattern_rewrite/pattern_match.h"
 #include "paddle/pir/include/pattern_rewrite/pattern_rewrite_driver.h"
+#include "paddle/pir/include/pass/pass_registry.h"
 
 namespace {
 
@@ -677,3 +678,5 @@ std::unique_ptr<Pass> CreateAutoMixedPrecisionPass() {
 }
 
 }  // namespace pir
+
+REGISTER_IR_PASS(auto_mixed_precision_pass, AutoMixedPrecisionPass);
diff --git a/paddle/fluid/pir/transforms/passes.h b/paddle/fluid/pir/transforms/passes.h
index ce13547980d6d..6cda5f5f7e8e1 100644
--- a/paddle/fluid/pir/transforms/passes.h
+++ b/paddle/fluid/pir/transforms/passes.h
@@ -46,6 +46,7 @@ USE_PIR_PASS(delete_weight_dequant_linear_op_pass);
 USE_PIR_PASS(delete_quant_dequant_linear_op_pass);
 USE_PIR_PASS(transfer_layout_pass);
 USE_PIR_PASS(fused_rotary_position_embedding_pass);
+USE_PIR_PASS(auto_mixed_precision_pass);
 
 #ifdef PADDLE_WITH_DNNL
 USE_PIR_PASS(depthwise_conv_onednn_pass);
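Taken together, the three hunks above form the standard PIR pass registration pattern: REGISTER_IR_PASS in the pass's .cc file creates the factory entry, USE_PIR_PASS in passes.h pulls the symbol in at link time, and the "auto_mixed_precision_pass" string in kPirGpuPasses schedules it in the GPU pipeline. A minimal sketch of a pass that plugs into the same machinery — MyToyPass and "my_toy_pass" are hypothetical names, and the pir::Pass interface is inferred from the diffs in this series:

    // Sketch only; not part of the series.
    #include "paddle/pir/include/pass/pass.h"
    #include "paddle/pir/include/pass/pass_registry.h"

    namespace {

    class MyToyPass : public pir::Pass {
     public:
      // The second constructor argument is the opt level the pass runs at.
      MyToyPass() : pir::Pass("my_toy_pass", 2) {}

      void Run(pir::Operation* op) override {
        // Walk the op's regions/blocks here and rewrite what is needed.
      }

      bool CanApplyOn(pir::Operation* op) const override {
        // Typically restricted to module-level ops that own a region.
        return op->num_regions() > 0;
      }
    };

    }  // namespace

    // Makes the pass constructible by name from a pass manager or pass list.
    REGISTER_IR_PASS(my_toy_pass, MyToyPass);

A matching USE_PIR_PASS(my_toy_pass); declaration and a "my_toy_pass" entry in the pass list would mirror the other two hunks of this patch.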
From 06f8cbed3a052e5c2a6b8132f48c48b9b42a46e6 Mon Sep 17 00:00:00 2001
From: AO-XIN
Date: Wed, 28 Aug 2024 20:44:10 +0800
Subject: [PATCH 02/13] modified:
 paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc

---
 .../general/auto_mixed_precision_pass.cc | 48 ++++++++++++++++---
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
index ffe5c972094c8..12a263f177a49 100644
--- a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
@@ -47,10 +47,10 @@
 #include "paddle/pir/include/core/parameter.h"
 #include "paddle/pir/include/core/program.h"
 #include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
 #include "paddle/pir/include/pattern_rewrite/frozen_rewrite_pattern_set.h"
 #include "paddle/pir/include/pattern_rewrite/pattern_match.h"
 #include "paddle/pir/include/pattern_rewrite/pattern_rewrite_driver.h"
-#include "paddle/pir/include/pass/pass_registry.h"
 
 namespace {
 
@@ -135,6 +135,7 @@ class AutoMixedPrecisionPass : public pir::Pass {
         paddle::dialect::SumOp::name(),
         paddle::dialect::SigmoidCrossEntropyWithLogitsOp::name(),
         paddle::dialect::CrossEntropyWithSoftmax_Op::name(),
+        "pd_op.array_to_tensor",
     });
   }
 
@@ -165,6 +166,10 @@ class AutoMixedPrecisionPass : public pir::Pass {
       auto backend = ConvertPlaceToBackend(place_);
       support_low_precision =
           OpSupportPrecision(op_type, backend, precision_mode_);
+      if (op_name == "pd_op.scale" && !OpHasFloatResult(op)) {
+        support_low_precision = false;
+        op_should_not_handle_.insert(op);
+      }
     } else {  // pd op without float result
       support_low_precision = false;
       op_should_not_handle_.insert(op);
@@ -481,6 +486,9 @@ class AutoMixedPrecisionPass : public pir::Pass {
     return operand.type() && operand.type().isa<paddle::dialect::DenseTensorType>();
   }
 
+  bool IsOperandHasDenseTensorVectorType(pir::OpOperand operand) const {
+    return operand.type() && operand.type().isa<pir::VectorType>();
+  }
 
   void DoInsertCastOp(pir::Operation* op,
                       pir::OpOperand operand,
@@ -586,7 +594,6 @@ class AutoMixedPrecisionPass : public pir::Pass {
       SetResultDataType(op->result(0), precision_mode_, builder.ir_context());
       return;
     }
-
     // Other pd ops
     if (OpRunLowPrecision(op)) {
       auto phi_kernel =
@@ -659,11 +666,38 @@ class AutoMixedPrecisionPass : public pir::Pass {
     auto phi_dtype = phi::DataType::FLOAT32;
     for (size_t i = 0; i < op->num_operands(); i++) {
       auto operand = op->operand(i);
-      if (!IsOperandHasDenseTensorType(operand)) continue;
-      auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand);
-      if (IsPhiDataTypeFloat(operand_phi_dtype) &&
-          operand_phi_dtype == precision_mode_) {
-        DoInsertCastOp(op, operand, phi_dtype, builder);
+      if (IsOperandHasDenseTensorType(operand)) {
+        auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand);
+        if (IsPhiDataTypeFloat(operand_phi_dtype) &&
+            operand_phi_dtype == precision_mode_) {
+          DoInsertCastOp(op, operand, phi_dtype, builder);
+        }
+      } else if (IsOperandHasDenseTensorVectorType(operand)) {
+        LOG(INFO) << "IsOperandHasDenseTensorVectorType(operand)";
+        LOG(INFO) << operand.source().defining_op()->name();
+        auto defining_op_ = operand.source().defining_op();
+        if (defining_op_->isa<pir::CombineOp>()) {
+          auto input_num = defining_op_->num_operands();
+          for (size_t i = 0; i < input_num; ++i) {
+            auto operand = defining_op_->operand(i);
+            auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand);
+            if (IsPhiDataTypeFloat(operand_phi_dtype) &&
+                operand_phi_dtype != phi::DataType::FLOAT32) {
+              DoInsertCastOp(
+                  defining_op_, operand, phi::DataType::FLOAT32, builder);
+              LOG(INFO) << "DoInsertCastOp";
+            }
+          }
+          std::vector<pir::Type> inputs_type(input_num);
+          for (size_t idx = 0; idx < input_num; ++idx) {
+            inputs_type[idx] = defining_op_->operand(idx).type();
+          }
+          auto new_vec_type =
+              pir::VectorType::get(builder.ir_context(), inputs_type);
+          defining_op_->result(0).set_type(new_vec_type);
+        }
+      } else {
+        continue;
       }
     }
   }
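The heart of this patch is the new else-if branch: when an operand is a vector of dense tensors (as produced by the combine op feeding pd_op.array_to_tensor), casting the combine's inputs back to FP32 is not enough — the defining op's pir::VectorType result must be recomputed from the new operand types, or the program becomes type-inconsistent. A condensed sketch of that rebuild step, using only calls that appear in the hunk (the free-function packaging is mine):

    // Recompute a combine-like op's VectorType result after cast ops may have
    // changed its operand types.
    void RebuildVectorResultType(pir::Operation* defining_op, pir::IrContext* ctx) {
      std::vector<pir::Type> inputs_type(defining_op->num_operands());
      for (size_t idx = 0; idx < defining_op->num_operands(); ++idx) {
        inputs_type[idx] = defining_op->operand(idx).type();
      }
      // Consumers (array_to_tensor here) now see the post-cast element types.
      defining_op->result(0).set_type(pir::VectorType::get(ctx, inputs_type));
    }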
From 7b3d121a0721289db7a23c35200aa1ddf54dfa59 Mon Sep 17 00:00:00 2001
From: AO-XIN
Date: Mon, 9 Sep 2024 19:59:53 +0800
Subject: [PATCH 03/13] fix some bugs for amp pass

---
 .../fluid/inference/api/analysis_predictor.cc | 21 +++++++++++--------
 .../general/auto_mixed_precision_pass.cc      |  2 ++
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 79b1e62e361d9..efd6b90dfd202 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -115,6 +115,7 @@
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/fluid/pir/dialect/operator/utils/utils.h"
 #include "paddle/fluid/pir/serialize_deserialize/include/interface.h"
+#include "paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.h"
 #include "paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.h"
 #include "paddle/fluid/pir/transforms/general/constant_folding_pass.h"
 #include "paddle/fluid/pir/transforms/general/dead_code_elimination_pass.h"
@@ -924,7 +925,9 @@ void AnalysisPredictor::OptimizeInferencePirProgram() {
       pass->SetNotOwned(pir::Pass::kParamScopeAttr, sub_scope_);
       pass->SetNotOwned(pir::Pass::kPlaceAttr, &place_);
       if (pass->name() == "auto_mixed_precision_pass") {
-        pass->Set("__mixed_precision_mode__", new phi::DataType(paddle::ConvertPrecision(config_.mixed_precision_mode_)));
+        pass->Set("__mixed_precision_mode__",
+                  new phi::DataType(
+                      paddle::ConvertPrecision(config_.mixed_precision_mode_)));
       }
       if (pass->name() == "matmul_add_act_fuse_pass" ||
           pass->name() == "conv2d_add_act_fuse_pass" ||
@@ -956,14 +959,14 @@ void AnalysisPredictor::OptimizeInferencePirProgram() {
   // Apply some basic passes required by the framework
   ::pir::PassManager basic_pass_pm(::pir::IrContext::Instance(),
                                    config_.pm_opt_level_);
-  auto common_subexpression_elimination_pass =
-      ::pir::CreateCommonSubexpressionEliminationPass();
-  if (std::find(config_.deleted_passes_.begin(),
-                config_.deleted_passes_.end(),
-                common_subexpression_elimination_pass->name()) ==
-      config_.deleted_passes_.end()) {
-    basic_pass_pm.AddPass(std::move(common_subexpression_elimination_pass));
-  }
+  // auto common_subexpression_elimination_pass =
+  //     ::pir::CreateCommonSubexpressionEliminationPass();
+  // if (std::find(config_.deleted_passes_.begin(),
+  //               config_.deleted_passes_.end(),
+  //               common_subexpression_elimination_pass->name()) ==
+  //     config_.deleted_passes_.end()) {
+  //   basic_pass_pm.AddPass(std::move(common_subexpression_elimination_pass));
+  // }
   auto params_sync_among_devices_pass =
       ::pir::CreateParamsSyncAmongDevicesPass();
   if (std::find(config_.deleted_passes_.begin(),
diff --git a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
index d27b2aaa208a5..f6aa62c3a77e7 100644
--- a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
@@ -442,8 +442,10 @@ class AutoMixedPrecisionPass : public pir::Pass {
       if (result.type().isa<paddle::dialect::DenseTensorType>() &&
           IsDenseTensorTypeFloat(
               result.type().dyn_cast<paddle::dialect::DenseTensorType>())) {
+        return true;
       } else if (result.type().isa<pir::VectorType>() &&
                  IsVectorTypeFloat(result.type().dyn_cast<pir::VectorType>())) {
+        return true;
       }
     }
     return false;
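The two added return true statements are the whole fix: before this patch OpHasFloatResult fell through both branches and always returned false, so every op looked like it produced no float results and was excluded from low precision. Read back out of the diff, the predicate now behaves like this — the loop header and the null-type guard are assumptions, since the hunk shows only the branch bodies:

    bool OpHasFloatResult(pir::Operation* op) const {
      for (size_t i = 0; i < op->num_results(); ++i) {
        auto result = op->result(i);
        if (!result.type()) continue;  // assumed guard for typeless results
        if (result.type().isa<paddle::dialect::DenseTensorType>() &&
            IsDenseTensorTypeFloat(
                result.type().dyn_cast<paddle::dialect::DenseTensorType>())) {
          return true;  // added by this patch
        } else if (result.type().isa<pir::VectorType>() &&
                   IsVectorTypeFloat(result.type().dyn_cast<pir::VectorType>())) {
          return true;  // added by this patch
        }
      }
      return false;
    }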
From 7c2438232b62953152ccbdfe5aee0393169aea15 Mon Sep 17 00:00:00 2001
From: AO-XIN
Date: Tue, 10 Sep 2024 00:58:17 +0800
Subject: [PATCH 04/13] fix conflict for cse and amp pass

---
 .../fluid/inference/api/analysis_predictor.cc | 31 +++++++++++--------
 .../inference/api/paddle_pass_builder.cc      |  1 -
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index efd6b90dfd202..157c8dab0ad5a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -924,11 +924,6 @@ void AnalysisPredictor::OptimizeInferencePirProgram() {
     for (const auto &pass : pass_pm.passes()) {
       pass->SetNotOwned(pir::Pass::kParamScopeAttr, sub_scope_);
       pass->SetNotOwned(pir::Pass::kPlaceAttr, &place_);
-      if (pass->name() == "auto_mixed_precision_pass") {
-        pass->Set("__mixed_precision_mode__",
-                  new phi::DataType(
-                      paddle::ConvertPrecision(config_.mixed_precision_mode_)));
-      }
       if (pass->name() == "matmul_add_act_fuse_pass" ||
           pass->name() == "conv2d_add_act_fuse_pass" ||
           pass->name() == "conv2d_add_fuse_pass") {
@@ -959,14 +954,24 @@ void AnalysisPredictor::OptimizeInferencePirProgram() {
   // Apply some basic passes required by the framework
   ::pir::PassManager basic_pass_pm(::pir::IrContext::Instance(),
                                    config_.pm_opt_level_);
-  // auto common_subexpression_elimination_pass =
-  //     ::pir::CreateCommonSubexpressionEliminationPass();
-  // if (std::find(config_.deleted_passes_.begin(),
-  //               config_.deleted_passes_.end(),
-  //               common_subexpression_elimination_pass->name()) ==
-  //     config_.deleted_passes_.end()) {
-  //   basic_pass_pm.AddPass(std::move(common_subexpression_elimination_pass));
-  // }
+  auto common_subexpression_elimination_pass =
+      ::pir::CreateCommonSubexpressionEliminationPass();
+  if (std::find(config_.deleted_passes_.begin(),
+                config_.deleted_passes_.end(),
+                common_subexpression_elimination_pass->name()) ==
+      config_.deleted_passes_.end()) {
+    basic_pass_pm.AddPass(std::move(common_subexpression_elimination_pass));
+  }
+  auto auto_mixed_precision_pass =
+      ::pir::CreateAutoMixedPrecisionPass();
+  if (std::find(config_.deleted_passes_.begin(),
+                config_.deleted_passes_.end(),
+                auto_mixed_precision_pass->name()) ==
+      config_.deleted_passes_.end()) {
+    auto_mixed_precision_pass->SetNotOwned(pir::Pass::kPlaceAttr, &place_);
+    auto_mixed_precision_pass->SetNotOwned("__mixed_precision_mode__", new phi::DataType(paddle::ConvertPrecision(config_.mixed_precision_mode_)));
+    basic_pass_pm.AddPass(std::move(auto_mixed_precision_pass));
+  }
   auto params_sync_among_devices_pass =
       ::pir::CreateParamsSyncAmongDevicesPass();
   if (std::find(config_.deleted_passes_.begin(),
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 2609bf5a3868e..5249eb3805897 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -604,7 +604,6 @@ const std::vector<std::string> kPirGpuPasses{
     "transpose_flatten_concat_fuse_pass",
     "remove_redundant_transpose_pass",
     "transfer_layout_pass",
-    "auto_mixed_precision_pass",
     "horizontal_fuse_pass",
 };
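One detail worth noting in this hunk: Set transfers ownership of a heap-allocated attribute to the pass, while SetNotOwned only stores a borrowed pointer that the caller must keep alive. Handing SetNotOwned a fresh new phi::DataType, as done here, therefore leaks the object — patch 06 below switches that call to Set. The two shapes side by side (a sketch; pass is any configured pir::Pass):

    // Borrowed: the predictor owns place_ for at least the pass's lifetime.
    pass->SetNotOwned(pir::Pass::kPlaceAttr, &place_);

    // Owned: the pass is responsible for deleting the DataType.
    pass->Set("__mixed_precision_mode__",
              new phi::DataType(
                  paddle::ConvertPrecision(config_.mixed_precision_mode_)));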
From efe2c0e261cfaa0d47e488db8eff69ba2f8b434b Mon Sep 17 00:00:00 2001
From: AO-XIN
Date: Mon, 23 Sep 2024 14:36:59 +0800
Subject: [PATCH 05/13] fix amp pass for
 en_table_structure,mask_rcnn_r50_1x_coco,rec_mtb_nrtr in
 auto_mixed_precision_pass.cc

---
 paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
index f6aa62c3a77e7..2a450e4592bff 100644
--- a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
@@ -103,6 +103,7 @@ class AutoMixedPrecisionPass : public pir::Pass {
         ProcessBlock(&block, builder);
       }
     }
+    cached_cast_ops_.clear();
   }
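The one-line fix above matters because a registered pass object can be reused across programs: cached_cast_ops_ memoizes inserted cast operations by pointer, and anything cached during one Run would dangle by the next. The bug class in miniature (self-contained C++, not Paddle API):

    #include <string>
    #include <unordered_map>

    struct Program {};  // stand-in for an IR program

    class Pass {
     public:
      void Run(Program* p) {
        // ... rewrite *p, memoizing pointers into p's ops in cache_ ...
        cache_.clear();  // the fix: per-program pointers must not outlive Run()
      }

     private:
      std::unordered_map<std::string, void*> cache_;  // illustrative cache type
    };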
From 2f875edc562ec061c4bdc9225d66d9bc9e31f89b Mon Sep 17 00:00:00 2001
From: yuanlehome
Date: Tue, 24 Sep 2024 12:10:03 +0000
Subject: [PATCH 06/13] refine pir amp_pass

---
 .../fluid/inference/api/analysis_predictor.cc | 19 +++++--
 .../general/auto_mixed_precision_pass.cc      | 52 ++++++++++++++++---
 paddle/fluid/pir/transforms/passes.h          |  1 +
 3 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 9777c6003c1eb..5048b4ec4d5a9 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -115,6 +115,7 @@
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/fluid/pir/dialect/operator/utils/utils.h"
 #include "paddle/fluid/pir/serialize_deserialize/include/interface.h"
+#include "paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.h"
 #include "paddle/fluid/pir/transforms/general/common_subexpression_elimination_pass.h"
 #include "paddle/fluid/pir/transforms/general/constant_folding_pass.h"
 #include "paddle/fluid/pir/transforms/general/dead_code_elimination_pass.h"
@@ -922,6 +923,7 @@ void AnalysisPredictor::OptimizeInferencePirProgram() {
     // set attr
     for (const auto &pass : pass_pm.passes()) {
       pass->SetNotOwned(pir::Pass::kParamScopeAttr, sub_scope_);
+      pass->SetNotOwned(pir::Pass::kPlaceAttr, &place_);
       if (pass->name() == "matmul_add_act_fuse_pass" ||
           pass->name() == "conv2d_add_act_fuse_pass" ||
           pass->name() == "conv2d_add_fuse_pass") {
@@ -960,6 +962,19 @@ void AnalysisPredictor::OptimizeInferencePirProgram() {
       config_.deleted_passes_.end()) {
     basic_pass_pm.AddPass(std::move(common_subexpression_elimination_pass));
   }
+  if (config_.enable_gpu_mixed_) {
+    auto auto_mixed_precision_pass = ::pir::CreateAutoMixedPrecisionPass();
+    if (std::find(config_.deleted_passes_.begin(),
+                  config_.deleted_passes_.end(),
+                  auto_mixed_precision_pass->name()) ==
+        config_.deleted_passes_.end()) {
+      auto_mixed_precision_pass->SetNotOwned(pir::Pass::kPlaceAttr, &place_);
+      auto_mixed_precision_pass->Set("__mixed_precision_mode__",
+                                     new phi::DataType(paddle::ConvertPrecision(
+                                         config_.mixed_precision_mode_)));
+      basic_pass_pm.AddPass(std::move(auto_mixed_precision_pass));
+    }
+  }
   auto params_sync_among_devices_pass =
       ::pir::CreateParamsSyncAmongDevicesPass();
   if (std::find(config_.deleted_passes_.begin(),
@@ -2227,9 +2242,7 @@ void AnalysisPredictor::PrepareArgument() {
         pass_builder->AppendPass("simplify_with_basic_ops_pass");
         pass_builder->AppendPass("is_test_pass");
         pass_builder->AppendPass("constant_folding_pass");
-      }
-      pass_builder->AppendPass("auto_mixed_precision_pass");
-      if (!config_.new_ir_enabled()) {
+        pass_builder->AppendPass("auto_mixed_precision_pass");
         pass_builder->AppendPass("inplace_op_var_pass");
       }
       LOG(INFO) << "This model run in GPU mixed precision mode with no ir "
diff --git a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
index 4ce136e78ec95..2a450e4592bff 100644
--- a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
@@ -47,6 +47,7 @@
 #include "paddle/pir/include/core/parameter.h"
 #include "paddle/pir/include/core/program.h"
 #include "paddle/pir/include/pass/pass.h"
+#include "paddle/pir/include/pass/pass_registry.h"
 #include "paddle/pir/include/pattern_rewrite/frozen_rewrite_pattern_set.h"
 #include "paddle/pir/include/pattern_rewrite/pattern_match.h"
 #include "paddle/pir/include/pattern_rewrite/pattern_rewrite_driver.h"
@@ -102,6 +103,7 @@ class AutoMixedPrecisionPass : public pir::Pass {
         ProcessBlock(&block, builder);
       }
     }
+    cached_cast_ops_.clear();
   }
 
   bool CanApplyOn(pir::Operation* op) const override {
@@ -134,6 +136,7 @@ class AutoMixedPrecisionPass : public pir::Pass {
         paddle::dialect::SumOp::name(),
         paddle::dialect::SigmoidCrossEntropyWithLogitsOp::name(),
         paddle::dialect::CrossEntropyWithSoftmax_Op::name(),
+        "pd_op.array_to_tensor",
     });
   }
 
@@ -164,6 +167,10 @@ class AutoMixedPrecisionPass : public pir::Pass {
       auto backend = ConvertPlaceToBackend(place_);
       support_low_precision =
           OpSupportPrecision(op_type, backend, precision_mode_);
+      if (op_name == "pd_op.scale" && !OpHasFloatResult(op)) {
+        support_low_precision = false;
+        op_should_not_handle_.insert(op);
+      }
     } else {  // pd op without float result
       support_low_precision = false;
       op_should_not_handle_.insert(op);
@@ -436,8 +443,10 @@ class AutoMixedPrecisionPass : public pir::Pass {
       if (result.type().isa<paddle::dialect::DenseTensorType>() &&
           IsDenseTensorTypeFloat(
              result.type().dyn_cast<paddle::dialect::DenseTensorType>())) {
+        return true;
       } else if (result.type().isa<pir::VectorType>() &&
                  IsVectorTypeFloat(result.type().dyn_cast<pir::VectorType>())) {
+        return true;
       }
     }
     return false;
@@ -480,6 +489,9 @@ class AutoMixedPrecisionPass : public pir::Pass {
     return operand.type() && operand.type().isa<paddle::dialect::DenseTensorType>();
   }
 
+  bool IsOperandHasDenseTensorVectorType(pir::OpOperand operand) const {
+    return operand.type() && operand.type().isa<pir::VectorType>();
+  }
 
   void DoInsertCastOp(pir::Operation* op,
                       pir::OpOperand operand,
@@ -585,7 +597,6 @@ class AutoMixedPrecisionPass : public pir::Pass {
       SetResultDataType(op->result(0), precision_mode_, builder.ir_context());
       return;
     }
-
     // Other pd ops
     if (OpRunLowPrecision(op)) {
      auto phi_kernel =
@@ -658,11 +669,38 @@ class AutoMixedPrecisionPass : public pir::Pass {
     auto phi_dtype = phi::DataType::FLOAT32;
     for (size_t i = 0; i < op->num_operands(); i++) {
       auto operand = op->operand(i);
-      if (!IsOperandHasDenseTensorType(operand)) continue;
-      auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand);
-      if (IsPhiDataTypeFloat(operand_phi_dtype) &&
-          operand_phi_dtype == precision_mode_) {
-        DoInsertCastOp(op, operand, phi_dtype, builder);
+      if (IsOperandHasDenseTensorType(operand)) {
+        auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand);
+        if (IsPhiDataTypeFloat(operand_phi_dtype) &&
+            operand_phi_dtype == precision_mode_) {
+          DoInsertCastOp(op, operand, phi_dtype, builder);
+        }
+      } else if (IsOperandHasDenseTensorVectorType(operand)) {
+        LOG(INFO) << "IsOperandHasDenseTensorVectorType(operand)";
+        LOG(INFO) << operand.source().defining_op()->name();
+        auto defining_op_ = operand.source().defining_op();
+        if (defining_op_->isa<pir::CombineOp>()) {
+          auto input_num = defining_op_->num_operands();
+          for (size_t i = 0; i < input_num; ++i) {
+            auto operand = defining_op_->operand(i);
+            auto operand_phi_dtype = GetPhiDataTypeFromOpOperand(operand);
+            if (IsPhiDataTypeFloat(operand_phi_dtype) &&
+                operand_phi_dtype != phi::DataType::FLOAT32) {
+              DoInsertCastOp(
+                  defining_op_, operand, phi::DataType::FLOAT32, builder);
+              LOG(INFO) << "DoInsertCastOp";
+            }
+          }
+          std::vector<pir::Type> inputs_type(input_num);
+          for (size_t idx = 0; idx < input_num; ++idx) {
+            inputs_type[idx] = defining_op_->operand(idx).type();
+          }
+          auto new_vec_type =
+              pir::VectorType::get(builder.ir_context(), inputs_type);
+          defining_op_->result(0).set_type(new_vec_type);
+        }
+      } else {
+        continue;
       }
     }
   }
@@ -677,3 +715,5 @@ std::unique_ptr<Pass> CreateAutoMixedPrecisionPass() {
 }
 
 }  // namespace pir
+
+REGISTER_IR_PASS(auto_mixed_precision_pass, AutoMixedPrecisionPass);
diff --git a/paddle/fluid/pir/transforms/passes.h b/paddle/fluid/pir/transforms/passes.h
index 3d04309b9cddf..2b3b98a663b28 100644
--- a/paddle/fluid/pir/transforms/passes.h
+++ b/paddle/fluid/pir/transforms/passes.h
@@ -46,6 +46,7 @@ USE_PIR_PASS(delete_weight_dequant_linear_op_pass);
 USE_PIR_PASS(delete_quant_dequant_linear_op_pass);
 USE_PIR_PASS(transfer_layout_pass);
 USE_PIR_PASS(fused_rotary_position_embedding_pass);
+USE_PIR_PASS(auto_mixed_precision_pass);
 USE_PIR_PASS(horizontal_fuse_pass);
 USE_PIR_PASS(common_subexpression_elimination_pass);
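With the new config_.enable_gpu_mixed_ gate, the pass is only scheduled when the user asked for a reduced-precision GPU run. That flag comes from the precision argument of the public EnableUseGpu overload; a hedged sketch of the user-side trigger (header path and model file names are placeholders):

    #include <memory>
    #include "paddle_inference_api.h"  // placeholder include path

    std::shared_ptr<paddle_infer::Predictor> BuildHalfPrecisionPredictor() {
      paddle_infer::Config config("model.pdmodel", "model.pdiparams");
      // kHalf makes enable_gpu_mixed_ true, which in turn schedules
      // auto_mixed_precision_pass in OptimizeInferencePirProgram().
      config.EnableUseGpu(/*memory_pool_init_size_mb=*/512,
                          /*device_id=*/0,
                          paddle_infer::PrecisionType::kHalf);
      return paddle_infer::CreatePredictor(config);
    }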
From 24bf9beff5bf7210f7c00122211f6fb886f60b59 Mon Sep 17 00:00:00 2001
From: yuanlehome
Date: Tue, 24 Sep 2024 12:14:45 +0000
Subject: [PATCH 07/13] update

---
 .../pir/transforms/general/auto_mixed_precision_pass.cc | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
index 2a450e4592bff..7b40ae4c3a2b8 100644
--- a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
@@ -25,6 +25,7 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/pir/dialect/operator/ir/control_flow_op.h"
+#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
@@ -136,7 +137,7 @@ class AutoMixedPrecisionPass : public pir::Pass {
         paddle::dialect::SumOp::name(),
         paddle::dialect::SigmoidCrossEntropyWithLogitsOp::name(),
         paddle::dialect::CrossEntropyWithSoftmax_Op::name(),
-        "pd_op.array_to_tensor",
+        paddle::dialect::ArrayToTensorOp::name(),
     });
   }
 
@@ -167,7 +168,7 @@ class AutoMixedPrecisionPass : public pir::Pass {
       auto backend = ConvertPlaceToBackend(place_);
       support_low_precision =
           OpSupportPrecision(op_type, backend, precision_mode_);
-      if (op_name == "pd_op.scale" && !OpHasFloatResult(op)) {
+      if (op->isa<paddle::dialect::ScaleOp>() && !OpHasFloatResult(op)) {
         support_low_precision = false;
         op_should_not_handle_.insert(op);
       }
@@ -676,8 +677,6 @@ class AutoMixedPrecisionPass : public pir::Pass {
           DoInsertCastOp(op, operand, phi_dtype, builder);
         }
       } else if (IsOperandHasDenseTensorVectorType(operand)) {
-        LOG(INFO) << "IsOperandHasDenseTensorVectorType(operand)";
-        LOG(INFO) << operand.source().defining_op()->name();
         auto defining_op_ = operand.source().defining_op();
         if (defining_op_->isa<pir::CombineOp>()) {
           auto input_num = defining_op_->num_operands();
@@ -688,7 +687,6 @@ class AutoMixedPrecisionPass : public pir::Pass {
                 operand_phi_dtype != phi::DataType::FLOAT32) {
               DoInsertCastOp(
                   defining_op_, operand, phi::DataType::FLOAT32, builder);
-              LOG(INFO) << "DoInsertCastOp";
             }
           }
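This cleanup swaps stringly-typed checks for the dialect's own helpers: op->isa<paddle::dialect::ScaleOp>() is verified at compile time, and ArrayToTensorOp::name() keeps the blacklist entry in sync with the op definition instead of hard-coding "pd_op.array_to_tensor". In sketch form (op is any pir::Operation*; the set is illustrative):

    // Before: brittle comparison against the op's runtime name string.
    bool is_scale = op->name() == "pd_op.scale";

    // After: typed query plus the op class's own name constant.
    bool is_scale_typed = op->isa<paddle::dialect::ScaleOp>();

    std::unordered_set<std::string> blacklist;
    blacklist.insert(paddle::dialect::ArrayToTensorOp::name());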
From 380142ed2d9d5a1a06e41744271cce1df12926e6 Mon Sep 17 00:00:00 2001
From: AO-XIN
Date: Thu, 26 Sep 2024 18:51:50 +0800
Subject: [PATCH 08/13] fix amp bugs about SetOptimizationLevel(3)

---
 paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
index 7b40ae4c3a2b8..ecfa376ec00af 100644
--- a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
@@ -138,6 +138,7 @@ class AutoMixedPrecisionPass : public pir::Pass {
         paddle::dialect::SigmoidCrossEntropyWithLogitsOp::name(),
         paddle::dialect::CrossEntropyWithSoftmax_Op::name(),
         paddle::dialect::ArrayToTensorOp::name(),
+        "pd_op.fused_bias_residual_layernorm",
     });
   }
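Blacklisting "pd_op.fused_bias_residual_layernorm" pins that fused op to FP32 regardless of kernel support; patch 10 later fixes the kernel's FP32-only inputs properly and drops the entry again. My paraphrase of the list semantics as a decision function — illustrative, not the pass's exact code:

    // Illustrative precedence: the black list pins an op to FP32, the white
    // list (added in patch 09) force-allows it, and otherwise the kernel
    // library's declared support decides.
    bool RunLowPrecision(const std::string& op_name) const {
      if (black_list_.count(op_name) != 0) return false;
      if (white_list_.count(op_name) != 0) return true;
      return OpSupportPrecision(op_name, backend_, precision_mode_);
    }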
From 01bc0b2a93bbbb1f7195c6e3ec0f45c2b0786616 Mon Sep 17 00:00:00 2001
From: AO-XIN
Date: Fri, 27 Sep 2024 22:45:40 +0800
Subject: [PATCH 09/13] add some config settings for AutoMixedPrecisionPass

---
 paddle/fluid/inference/api/analysis_config.cc | 20 +++++++++++
 .../fluid/inference/api/analysis_predictor.cc |  6 ++++
 .../inference/api/paddle_analysis_config.h    | 14 ++++++++
 .../general/auto_mixed_precision_pass.cc      | 33 +++++++++++++++++--
 4 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index c7cc03e552333..ae48dfd47d430 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -806,6 +806,26 @@ void AnalysisConfig::EnableLowPrecisionIO(bool x) {
   enable_low_precision_io_ = x;
 }
 
+void AnalysisConfig::SetMixedBlackList(const std::unordered_set<std::string>& black_list_){
+  PADDLE_ENFORCE_EQ(
+      enable_gpu_mixed_,
+      true,
+      common::errors::InvalidArgument(
+          "To enable low precision io, please call EnableUseGPU() to specify "
+          "precision mode as low precision."));
+  mixed_black_list_=black_list_;
+}
+
+void AnalysisConfig::SetMixedWhiteList(const std::unordered_set<std::string>& white_list_){
+  PADDLE_ENFORCE_EQ(
+      enable_gpu_mixed_,
+      true,
+      common::errors::InvalidArgument(
+          "To enable low precision io, please call EnableUseGPU() to specify "
+          "precision mode as low precision."));
+  mixed_white_list_=white_list_;
+}
+
 void AnalysisConfig::SetTRTDynamicShapeInfo(
     std::map<std::string, std::vector<int>> min_input_shape,
     std::map<std::string, std::vector<int>> max_input_shape,
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 5048b4ec4d5a9..02cbc8c1ac01a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -972,6 +972,12 @@ void AnalysisPredictor::OptimizeInferencePirProgram() {
       auto_mixed_precision_pass->Set("__mixed_precision_mode__",
                                      new phi::DataType(paddle::ConvertPrecision(
                                          config_.mixed_precision_mode_)));
+      auto_mixed_precision_pass->Set("__enable_low_precision_io__",
+                                     new bool(config_.enable_low_precision_io_));
+      auto_mixed_precision_pass->Set("__mixed_black_list__",
+                                     new std::unordered_set<std::string>(config_.mixed_black_list_));
+      auto_mixed_precision_pass->Set("__mixed_white_list__",
+                                     new std::unordered_set<std::string>(config_.mixed_white_list_));
       basic_pass_pm.AddPass(std::move(auto_mixed_precision_pass));
     }
   }
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index dab3a66dcab32..cea01d8267feb 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -596,6 +596,20 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   void EnableLowPrecisionIO(bool x = true);
 
+  ///
+  /// \brief Set MixedBlackList for low precision.
+  ///
+  /// \param black_list_ unordered_set for AutoMixedPrecisionPass black_list.
+  ///
+  void SetMixedBlackList(const std::unordered_set<std::string>& black_list_ = {});
+
+  ///
+  /// \brief Set MixedWhiteList for low precision.
+  ///
+  /// \param white_list_ unordered_set for AutoMixedPrecisionPass white_list.
+  ///
+  void SetMixedWhiteList(const std::unordered_set<std::string>& white_list_ = {});
+
   ///
   /// \brief Control whether to specify the inputs' names.
   /// The ZeroCopyTensor type has a name member, assign it with the
diff --git a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
index ecfa376ec00af..a4cd3a6f80603 100644
--- a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
@@ -63,8 +63,6 @@ class AutoMixedPrecisionPass : public pir::Pass {
         precision_mode_(phi::DataType::FLOAT16),
         enable_low_precision_io_(false),
         context_(nullptr),
-        black_list_(),
-        white_list_(),
         op_run_low_precision_(),
         op_should_not_handle_(),
         cached_cast_ops_() {}
@@ -86,10 +84,39 @@ class AutoMixedPrecisionPass : public pir::Pass {
             "required!"
             "Use Set method to set the scope attribute."));
 
+    PADDLE_ENFORCE_EQ(
+        Has("__enable_low_precision_io__"),
+        true,
+        common::errors::InvalidArgument(
+            "Pass initialize failed."
+            "When using AutoMixedPrecisionPass, enable_low_precision_io attribute is "
+            "required!"
+            "Use Set method to set the scope attribute."));
+
+    PADDLE_ENFORCE_EQ(
+        Has("__mixed_black_list__"),
+        true,
+        common::errors::InvalidArgument(
+            "Pass initialize failed."
+            "When using AutoMixedPrecisionPass, mixed_black_list attribute is "
+            "required!"
+            "Use Set method to set the scope attribute."));
+
+    PADDLE_ENFORCE_EQ(
+        Has("__mixed_white_list__"),
+        true,
+        common::errors::InvalidArgument(
+            "Pass initialize failed."
+            "When using AutoMixedPrecisionPass, mixed_white_list attribute is "
+            "required!"
+            "Use Set method to set the scope attribute."));
+
     place_ = Get<phi::Place>(pir::Pass::kPlaceAttr);
     precision_mode_ = Get<phi::DataType>("__mixed_precision_mode__");
     context_ = context;
-    enable_low_precision_io_ = false;
+    enable_low_precision_io_ = Get<bool>("__enable_low_precision_io__");
+    black_list_ = Get<std::unordered_set<std::string>>("__mixed_black_list__");
+    white_list_ = Get<std::unordered_set<std::string>>("__mixed_white_list__");
     SetDefaultBlacklist();
     return true;
   }
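The Initialize changes establish a fail-fast contract: every externally injected attribute is asserted with PADDLE_ENFORCE_EQ(Has(...), true, ...) before it is read, so a mis-wired predictor aborts with a readable message instead of crashing inside the pass. The guard-then-read pairing, reduced to a single attribute (error text shortened; otherwise the calls are as in the hunk):

    bool Initialize(pir::IrContext* context) override {
      PADDLE_ENFORCE_EQ(Has("__mixed_black_list__"),
                        true,
                        common::errors::InvalidArgument(
                            "auto_mixed_precision_pass requires the "
                            "__mixed_black_list__ attribute; set it via Set()."));
      black_list_ = Get<std::unordered_set<std::string>>("__mixed_black_list__");
      return true;
    }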
From 71936168bbe9ee323588185ee060f063c309648d Mon Sep 17 00:00:00 2001
From: AO-XIN
Date: Sun, 29 Sep 2024 19:19:35 +0800
Subject: [PATCH 10/13] fix fused_bias_residual_layernorm op bugs for amp
 pass.

---
 .../pir/transforms/general/auto_mixed_precision_pass.cc | 1 -
 paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu | 6 ++++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
index a4cd3a6f80603..fec41890d0af1 100644
--- a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
@@ -165,7 +165,6 @@ class AutoMixedPrecisionPass : public pir::Pass {
         paddle::dialect::SigmoidCrossEntropyWithLogitsOp::name(),
         paddle::dialect::CrossEntropyWithSoftmax_Op::name(),
         paddle::dialect::ArrayToTensorOp::name(),
-        "pd_op.fused_bias_residual_layernorm",
     });
   }
 
diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
index 11aefc193f0be..75ba47b2bd3d9 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
@@ -1226,6 +1226,8 @@ PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
                    float,
                    phi::dtype::float16,
                    phi::dtype::bfloat16) {
+  kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32);
+  kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32);
   kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
   kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
   kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
@@ -1237,6 +1239,8 @@ PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
                    phi::fusion::FusedLayerNormKernel,
                    float,
                    phi::dtype::float16) {
+  kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32);
+  kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32);
   kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
   kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
   kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
@@ -1249,6 +1253,8 @@ PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
                    phi::fusion::FusedLayerNormKernel,
                    float,
                    phi::dtype::float16) {
+  kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32);
+  kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32);
   kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
   kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
   kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
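The kernel-side change carries the general lesson: a mixed-precision kernel can pin individual arguments to FP32 in its registration block, and those per-argument data types then override the kernel's selected precision, so the AMP machinery knows the tensors must not be cast. Condensed from the first registration hunk above — the GPU/ALL_LAYOUT arguments are assumed from the macro's usual form, and per the kernel's argument order, inputs 3 and 4 are the norm weight and bias:

    PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
                       GPU,
                       ALL_LAYOUT,
                       phi::fusion::FusedLayerNormKernel,
                       float,
                       phi::dtype::float16,
                       phi::dtype::bfloat16) {
      kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32);   // norm weight
      kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32);   // norm bias
      kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
      kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);  // mean
      kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);  // variance
    }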
From 0eea6d09eb44d97c9c06ced3a51940ca74421ebe Mon Sep 17 00:00:00 2001
From: AO-XIN
Date: Wed, 9 Oct 2024 19:00:04 +0800
Subject: [PATCH 11/13] delete repeat func.

---
 paddle/fluid/inference/api/analysis_config.cc | 20 -------------------
 .../inference/api/paddle_analysis_config.h    | 14 --------------
 2 files changed, 34 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index ae48dfd47d430..c7cc03e552333 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -806,26 +806,6 @@ void AnalysisConfig::EnableLowPrecisionIO(bool x) {
   enable_low_precision_io_ = x;
 }
 
-void AnalysisConfig::SetMixedBlackList(const std::unordered_set<std::string>& black_list_){
-  PADDLE_ENFORCE_EQ(
-      enable_gpu_mixed_,
-      true,
-      common::errors::InvalidArgument(
-          "To enable low precision io, please call EnableUseGPU() to specify "
-          "precision mode as low precision."));
-  mixed_black_list_=black_list_;
-}
-
-void AnalysisConfig::SetMixedWhiteList(const std::unordered_set<std::string>& white_list_){
-  PADDLE_ENFORCE_EQ(
-      enable_gpu_mixed_,
-      true,
-      common::errors::InvalidArgument(
-          "To enable low precision io, please call EnableUseGPU() to specify "
-          "precision mode as low precision."));
-  mixed_white_list_=white_list_;
-}
-
 void AnalysisConfig::SetTRTDynamicShapeInfo(
     std::map<std::string, std::vector<int>> min_input_shape,
     std::map<std::string, std::vector<int>> max_input_shape,
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index cea01d8267feb..dab3a66dcab32 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -596,20 +596,6 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   void EnableLowPrecisionIO(bool x = true);
 
-  ///
-  /// \brief Set MixedBlackList for low precision.
-  ///
-  /// \param black_list_ unordered_set for AutoMixedPrecisionPass black_list.
-  ///
-  void SetMixedBlackList(const std::unordered_set<std::string>& black_list_ = {});
-
-  ///
-  /// \brief Set MixedWhiteList for low precision.
-  ///
-  /// \param white_list_ unordered_set for AutoMixedPrecisionPass white_list.
-  ///
-  void SetMixedWhiteList(const std::unordered_set<std::string>& white_list_ = {});
-
   ///
   /// \brief Control whether to specify the inputs' names.
   /// The ZeroCopyTensor type has a name member, assign it with the
From 656bb2fe6ed28a55d5f869eff4111e353ebc8099 Mon Sep 17 00:00:00 2001
From: AO-XIN
Date: Wed, 9 Oct 2024 19:28:26 +0800
Subject: [PATCH 12/13] code style

---
 paddle/fluid/inference/api/analysis_predictor.cc | 15 +++++++++------
 .../general/auto_mixed_precision_pass.cc         | 16 ++++++++--------
 .../kernels/fusion/gpu/fused_layernorm_kernel.cu | 12 ++++++------
 3 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index c347caf6a209a..a20bca11a02c9 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -972,12 +972,15 @@ void AnalysisPredictor::OptimizeInferencePirProgram() {
       auto_mixed_precision_pass->Set("__mixed_precision_mode__",
                                      new phi::DataType(paddle::ConvertPrecision(
                                          config_.mixed_precision_mode_)));
-      auto_mixed_precision_pass->Set("__enable_low_precision_io__",
-                                     new bool(config_.enable_low_precision_io_));
-      auto_mixed_precision_pass->Set("__mixed_black_list__",
-                                     new std::unordered_set<std::string>(config_.mixed_black_list_));
-      auto_mixed_precision_pass->Set("__mixed_white_list__",
-                                     new std::unordered_set<std::string>(config_.mixed_white_list_));
+      auto_mixed_precision_pass->Set(
+          "__enable_low_precision_io__",
+          new bool(config_.enable_low_precision_io_));
+      auto_mixed_precision_pass->Set(
+          "__mixed_black_list__",
+          new std::unordered_set<std::string>(config_.mixed_black_list_));
+      auto_mixed_precision_pass->Set(
+          "__mixed_white_list__",
+          new std::unordered_set<std::string>(config_.mixed_white_list_));
       basic_pass_pm.AddPass(std::move(auto_mixed_precision_pass));
     }
   }
diff --git a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
index fec41890d0af1..e6f8d509f4397 100644
--- a/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/pir/transforms/general/auto_mixed_precision_pass.cc
@@ -84,14 +84,14 @@ class AutoMixedPrecisionPass : public pir::Pass {
             "required!"
             "Use Set method to set the scope attribute."));
 
-    PADDLE_ENFORCE_EQ(
-        Has("__enable_low_precision_io__"),
-        true,
-        common::errors::InvalidArgument(
-            "Pass initialize failed."
-            "When using AutoMixedPrecisionPass, enable_low_precision_io attribute is "
-            "required!"
-            "Use Set method to set the scope attribute."));
+    PADDLE_ENFORCE_EQ(Has("__enable_low_precision_io__"),
+                      true,
+                      common::errors::InvalidArgument(
+                          "Pass initialize failed."
+                          "When using AutoMixedPrecisionPass, "
+                          "enable_low_precision_io attribute is "
+                          "required!"
+                          "Use Set method to set the scope attribute."));
 
     PADDLE_ENFORCE_EQ(
         Has("__mixed_black_list__"),
diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
index 75ba47b2bd3d9..45c1c5624911b 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
@@ -575,10 +575,10 @@ struct TryDispatchLayerNormBlockSMemImplPackSize {
                   const int64_t rows,
                   const int64_t cols,
                   const double epsilon,
-                  ComputeType* mean,
-                  ComputeType* inv_variance,
-                  ComputeType col_divisor,
-                  bool* success) {
+                  ComputeType *
+                      mean,
+                  ComputeType *
+                      inv_variance, ComputeType col_divisor, bool* success) {
     if (cols % 4 == 0 && CanPackAs(load, 4) && CanPackAs(store, 4)) {
       return TryDispatchLayerNormBlockSMemImplBlockSize(load, 4) && CanPackAs(store, 4)) { return LaunchLayerNormBlockUncachedImpl(
+ "Use Set method to set the scope attribute.")); PADDLE_ENFORCE_EQ( Has("__mixed_black_list__"), diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu index 75ba47b2bd3d9..45c1c5624911b 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu @@ -575,10 +575,10 @@ struct TryDispatchLayerNormBlockSMemImplPackSize { const int64_t rows, const int64_t cols, const double epsilon, - ComputeType* mean, - ComputeType* inv_variance, - ComputeType col_divisor, - bool* success) { + ComputeType * + mean, + ComputeType * + inv_variance, ComputeType col_divisor, bool* success) { if (cols % 4 == 0 && CanPackAs(load, 4) && CanPackAs(store, 4)) { return TryDispatchLayerNormBlockSMemImplBlockSize(load, 4) && CanPackAs(store, 4)) { return LaunchLayerNormBlockUncachedImpl( From ad916bfee6dbefa6bb6d6d69605cae4fc1e7ad6a Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 9 Oct 2024 11:59:27 +0000 Subject: [PATCH 13/13] fix code style --- .../phi/kernels/fusion/gpu/fused_layernorm_kernel.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu index 45c1c5624911b..75ba47b2bd3d9 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu @@ -575,10 +575,10 @@ struct TryDispatchLayerNormBlockSMemImplPackSize { const int64_t rows, const int64_t cols, const double epsilon, - ComputeType * - mean, - ComputeType * - inv_variance, ComputeType col_divisor, bool* success) { + ComputeType* mean, + ComputeType* inv_variance, + ComputeType col_divisor, + bool* success) { if (cols % 4 == 0 && CanPackAs(load, 4) && CanPackAs(store, 4)) { return TryDispatchLayerNormBlockSMemImplBlockSize(load, 4) && CanPackAs(store, 4)) { return LaunchLayerNormBlockUncachedImpl(