From d65b004a1bab5636d4395f33a19ca11629336255 Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Fri, 1 Mar 2024 18:48:04 +0800
Subject: [PATCH 01/15] [PIR] Set NCHW as default Layout for IrTensor (#62254)

* fix

* fix bug

* fix
---
 paddle/fluid/pir/dialect/operator/ir/ir_tensor.h |  2 +-
 paddle/phi/core/kernel_factory.cc                | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/pir/dialect/operator/ir/ir_tensor.h b/paddle/fluid/pir/dialect/operator/ir/ir_tensor.h
index e2c3229b04df05..21d8a9fdd7ae54 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ir_tensor.h
+++ b/paddle/fluid/pir/dialect/operator/ir/ir_tensor.h
@@ -81,7 +81,7 @@ class IrTensor : public phi::TensorBase,
 private:
  phi::DDim dims_;
  phi::DataType dtype_{phi::DataType::FLOAT32};
-  phi::DataLayout layout_{phi::DataLayout::ANY};
+  phi::DataLayout layout_{phi::DataLayout::NCHW};
  LoD lod_;
  size_t offset_{0};
 };
diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc
index 35ac9e1e0db956..7f1ee799824e8b 100644
--- a/paddle/phi/core/kernel_factory.cc
+++ b/paddle/phi/core/kernel_factory.cc
@@ -177,6 +177,22 @@ bool KernelFactory::HasKernel(const std::string& kernel_name,
       phi::errors::NotFound("The kernel `%s` is not registered.", kernel_name));
 
   auto kernel_iter = iter->second.find(kernel_key);
+  if (kernel_iter == iter->second.end() &&
+      kernel_key.layout() != phi::DataLayout::ALL_LAYOUT) {
+    phi::KernelKey any_layout_kernel_key(
+        kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, kernel_key.dtype());
+    kernel_iter = iter->second.find(any_layout_kernel_key);
+  }
+
+#if defined(PADDLE_WITH_CUSTOM_DEVICE)
+  if (kernel_iter == iter->second.end() &&
+      kernel_key.backend() > phi::Backend::NUM_BACKENDS) {
+    kernel_iter = iter->second.find({phi::Backend::CUSTOM,
+                                     phi::DataLayout::ALL_LAYOUT,
+                                     kernel_key.dtype()});
+  }
+#endif
+
   if (kernel_iter == iter->second.end()) {
     return false;
   }

From 0cb9bf687a3372cf851089fd5508f4d7fafc1295 Mon Sep 17 00:00:00 2001
From: Sunny-bot1 <68891411+Sunny-bot1@users.noreply.github.com>
Date: Fri, 1 Mar 2024 19:29:08 +0800
Subject: [PATCH 02/15] [Inference] Add a config api to use PIR (#61968)

* add a config api for pir

* fix comment

* fix the enable failure

* fix bug

* fix bug
---
 paddle/fluid/inference/analysis/argument.h     |  1 +
 .../passes/inference_op_replace_pass.cc        |  4 +---
 .../ir_params_sync_among_devices_pass.cc       |  5 ++---
 paddle/fluid/inference/api/analysis_config.cc  |  1 +
 .../fluid/inference/api/analysis_predictor.cc  | 15 ++++++-------
 .../inference/api/demo_ci/custom_op_demo.cc    |  1 +
 paddle/fluid/inference/api/demo_ci/run.sh      |  2 +-
 paddle/fluid/inference/api/helper.cc           |  6 ++----
 paddle/fluid/inference/api/helper.h            |  2 +-
 .../inference/api/paddle_analysis_config.h     | 14 +++++++++++++
 paddle/fluid/pybind/inference_api.cc           |  2 ++
 .../cpp/inference/analysis/analyzer_tester.cc  |  2 ++
 test/custom_op/test_inference_inplace.py       | 13 +++++-------
 test/ir/inference/auto_scan_test.py            |  4 ++--
 test/ir/inference/program_config.py            |  1 -
 .../inference/test_inference_predictor_run.py  | 13 +++++-------
 .../test_decomp_inference_predictor_run.py     | 21 ++++++++-----------
 17 files changed, 57 insertions(+), 50 deletions(-)

diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index a87c919bbe2c1f..1407a8f875a297 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -227,6 +227,7 @@ struct Argument {
   DECL_ARGUMENT_FIELD(use_cutlass, UseCutlass, bool);
   DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool);
   DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
+  DECL_ARGUMENT_FIELD(use_pir, UsePIR, bool);
 
   // Usually use for trt dynamic shape.
   // TRT will select the best kernel according to opt shape
diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
index b422dea840af5f..993ab2e8618f47 100644
--- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc
@@ -16,14 +16,12 @@
 
 #include "paddle/fluid/inference/analysis/argument.h"
 
-COMMON_DECLARE_bool(enable_pir_in_executor);
-
 namespace paddle {
 namespace inference {
 namespace analysis {
 
 void InferenceOpReplacePass::RunImpl(Argument* argument) {
-  if (FLAGS_enable_pir_in_executor) {
+  if (argument->use_pir()) {
     return;
   }
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 2961d5c66f9f49..2e722f9a7e6e9e 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -32,8 +32,6 @@ PD_DEFINE_bool(  // NOLINT
     false,
     "Keep old mode for developers, the model is saved on cpu not device.");
 
-COMMON_DECLARE_bool(enable_pir_in_executor);
-
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -208,9 +206,10 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToXpu(Argument *argument) {
 #endif
 
 void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
-  if (FLAGS_enable_pir_in_executor) {
+  if (argument->use_pir()) {
     return;
   }
+
   PADDLE_ENFORCE_EQ(
       argument->scope_valid(),
       true,
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 5987483220b8ad..888e2cbe080c95 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -581,6 +581,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(skip_load_params_);
 
   CP_MEMBER(use_new_executor_);
+  CP_MEMBER(use_pir_);
 
   if (use_gpu_) {
     PADDLE_ENFORCE_EQ(use_xpu_,
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 9b05b9f78572e4..1cc723cd7913e8 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -134,7 +134,6 @@
 #include "paddle/fluid/pir/transforms/shape_optimization_pass.h"
 #include "paddle/pir/include/pass/pass_manager.h"
 
-COMMON_DECLARE_bool(enable_pir_in_executor);
 COMMON_DECLARE_bool(pir_apply_inplace_pass);
 
 namespace paddle {
@@ -376,7 +375,7 @@ AnalysisPredictor::AnalysisPredictor(const AnalysisConfig &config)
   }
   if (config_.new_executor_enabled()) {
     config_.EnableMemoryOptim(false);
-    if (FLAGS_enable_pir_in_executor) {
+    if (config_.new_ir_enabled()) {
       config_.SwitchIrOptim(false);
     }
   }
@@ -893,7 +892,7 @@ bool AnalysisPredictor::PrepareExecutor() {
     auto output_names = GetOutputNames();
     execution_config.skip_gc_vars.insert(output_names.begin(),
                                          output_names.end());
-    if (FLAGS_enable_pir_in_executor) {
+    if (config_.new_ir_enabled()) {
       pir_program_ = std::move(
           paddle::TranslateLegacyProgramToProgram(*inference_program_));
 
@@ -1715,6 +1714,7 @@ void AnalysisPredictor::PrepareArgument() {
   argument_->SetEnableIrOptim(config_.enable_ir_optim_);
   argument_->SetEnableMemoryOptim(config_.enable_memory_optim());
   argument_->SetModelFromMemory(config_.model_from_memory_);
+  argument_->SetUsePIR(config_.new_ir_enabled());
   // Analyze inference_program
   argument_->SetPredictorID(predictor_id_);
   argument_->SetRootPredictorID(root_predictor_id_);
@@ -1953,14 +1953,14 @@ void AnalysisPredictor::PrepareArgument() {
         model_precision_ == phi::DataType::FLOAT32) {
       argument_->SetEnableIrOptim(true);
       pass_builder->ClearPasses();
-      if (!FLAGS_enable_pir_in_executor) {
+      if (!config_.new_ir_enabled()) {
         pass_builder->AppendPass("map_op_to_another_pass");
         pass_builder->AppendPass("simplify_with_basic_ops_pass");
         pass_builder->AppendPass("is_test_pass");
         pass_builder->AppendPass("constant_folding_pass");
       }
       pass_builder->AppendPass("auto_mixed_precision_pass");
-      if (!FLAGS_enable_pir_in_executor) {
+      if (!config_.new_ir_enabled()) {
         pass_builder->AppendPass("inplace_op_var_pass");
       }
       LOG(INFO) << "This model run in GPU mixed precision mode with no ir "
@@ -2083,8 +2083,9 @@ CreatePaddlePredictor(
   // Register custom operators compiled by the user.
   // This function can only be executed once per process.
   static std::once_flag custom_operators_registered;
-  std::call_once(custom_operators_registered,
-                 []() { inference::RegisterAllCustomOperator(); });
+  std::call_once(custom_operators_registered, [config]() {
+    inference::RegisterAllCustomOperator(config.new_ir_enabled());
+  });
 
   auto SetGflags = [](const AnalysisConfig &config) {
     auto SetGflag = [](const char *name, const char *value) {
diff --git a/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc b/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc
index b4c8cccb8e7906..ec44238f008dc4 100644
--- a/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/custom_op_demo.cc
@@ -52,6 +52,7 @@ int main(int argc, char **argv) {
   config.SetModel(FLAGS_modeldir + "/custom_relu.pdmodel",
                   FLAGS_modeldir + "/custom_relu.pdiparams");
   config.EnableNewExecutor(true);
+  config.EnableNewIR(true);
   auto predictor{paddle_infer::CreatePredictor(config)};
   std::vector<int> input_shape = {1, 1, 28, 28};
   std::vector<float> input_data(1 * 1 * 28 * 28, 1);
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 795b414258b560..3de4fd3d0335ac 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -301,7 +301,7 @@ for WITH_STATIC_LIB in ON OFF; do
           -DCUSTOM_OPERATOR_FILES=$CUSTOM_OPERATOR_FILES \
           -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME
     make -j$(nproc)
-    FLAGS_enable_pir_in_executor=1 ./custom_op_demo \
+    ./custom_op_demo \
       --modeldir=$DATA_DIR/custom_op/custom_relu_infer_model
     if [ $? -ne 0 ]; then
       echo "custom_op_demo runs failed " >> ${current_dir}/test_summary.txt
diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc
index e9eb090a771d25..80429055465ebb 100644
--- a/paddle/fluid/inference/api/helper.cc
+++ b/paddle/fluid/inference/api/helper.cc
@@ -22,8 +22,6 @@
 #include "paddle/phi/api/ext/op_meta_info.h"
 #include "paddle/pir/include/core/ir_context.h"
 
-COMMON_DECLARE_bool(enable_pir_in_executor);
-
 namespace paddle {
 namespace inference {
 
@@ -50,11 +48,11 @@ std::string to_string<std::vector<std::vector<float>>>(
   return ss.str();
 }
 
-void RegisterAllCustomOperator() {
+void RegisterAllCustomOperator(bool use_pir) {
   auto &op_meta_info_map = OpMetaInfoMap::Instance();
   const auto &meta_info_map = op_meta_info_map.GetMap();
   for (auto &pair : meta_info_map) {
-    if (FLAGS_enable_pir_in_executor) {
+    if (use_pir) {
       ::pir::IrContext *ctx = ::pir::IrContext::Instance();
       auto *custom_dialect =
           ctx->GetOrRegisterDialect<paddle::dialect::CustomOpDialect>();
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 22a5319bb0dbc4..17ec8852b61df7 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -431,7 +431,7 @@ static bool IsFileExists(const std::string &path) {
   return exists;
 }
 
-void RegisterAllCustomOperator();
+void RegisterAllCustomOperator(bool use_pir);
 
 void InitGflagsFromEnv();
 
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 134c0799ec663d..64b2de0eba3d4a 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -879,10 +879,22 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   int tensorrt_optimization_level() { return trt_optimization_level_; }
 
+  /// \brief A boolean state telling whether to use new executor.
+  ///
+  /// \return bool whether to use new executor.
+  ///
   void EnableNewExecutor(bool x = true) { use_new_executor_ = x; }
 
   bool new_executor_enabled() const { return use_new_executor_; }
 
+  /// \brief A boolean state telling whether to use new IR.
+  ///
+  /// \return bool whether to use new IR.
+  ///
+  void EnableNewIR(bool x = true) { use_pir_ = x; }
+
+  bool new_ir_enabled() const { return use_pir_; }
+
   ///
   /// \brief Control whether to use optimized model to inference.
   ///
@@ -1425,6 +1437,8 @@ struct PD_INFER_DECL AnalysisConfig {
   // PrepareProgram(). So we add this flag to control the process.
   bool apply_optim_{false};
   bool skip_load_params_{false};
+
+  bool use_pir_{false};
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 268806509031e2..708866b0bac348 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -869,6 +869,8 @@ void BindAnalysisConfig(py::module *m) {
       .def("enable_new_executor",
           &AnalysisConfig::EnableNewExecutor,
           py::arg("x") = true)
+      .def("enable_new_ir", &AnalysisConfig::EnableNewIR, py::arg("x") = true)
+      .def("new_ir_enabled", &AnalysisConfig::new_ir_enabled)
      .def("enable_profile", &AnalysisConfig::EnableProfile)
      .def("disable_glog_info", &AnalysisConfig::DisableGlogInfo)
      .def("glog_info_disabled", &AnalysisConfig::glog_info_disabled)
diff --git a/test/cpp/inference/analysis/analyzer_tester.cc b/test/cpp/inference/analysis/analyzer_tester.cc
index 611fd757c2bcf6..f4a8a0f7669b03 100644
--- a/test/cpp/inference/analysis/analyzer_tester.cc
+++ b/test/cpp/inference/analysis/analyzer_tester.cc
@@ -33,6 +33,7 @@ TEST(Analyzer, analysis_without_tensorrt) {
   argument.SetModelDir(FLAGS_inference_model_dir);
   argument.SetEnableIrOptim(false);
   argument.SetUseGPU(false);
+  argument.SetUsePIR(false);
   argument.SetAnalysisPasses({"ir_graph_build_pass",
                               "ir_analysis_pass",
                               "ir_params_sync_among_devices_pass"});
@@ -49,6 +50,7 @@ TEST(Analyzer, analysis_with_tensorrt) {
   argument.SetTensorRtWorkspaceSize(1 << 20);
   argument.SetModelDir(FLAGS_inference_model_dir);
   argument.SetUseGPU(false);
+  argument.SetUsePIR(false);
   argument.SetAnalysisPasses({"ir_graph_build_pass",
                               "ir_analysis_pass",
                               "ir_params_sync_among_devices_pass"});
diff --git a/test/custom_op/test_inference_inplace.py b/test/custom_op/test_inference_inplace.py
index 303b2b21d15dc8..64219d8e148d00 100644
--- a/test/custom_op/test_inference_inplace.py
+++ b/test/custom_op/test_inference_inplace.py
@@ -83,10 +83,7 @@ def setUp(self):
     def tearDown(self):
         self.temp_dir.cleanup()
 
-    def enable_pir(self, flag: bool):
-        paddle.set_flags({'FLAGS_enable_pir_in_executor': flag})
-
-    def init_predictor(self):
+    def init_predictor(self, use_pir: bool):
         config = Config(
             os.path.join(
                 self.temp_dir.name,
@@ -100,6 +97,8 @@ def init_predictor(self):
         config.enable_use_gpu(256, 0)
         config.switch_ir_optim(False)
         config.enable_new_executor()
+        if use_pir:
+            config.enable_new_ir()
         predictor = create_predictor(config)
         return predictor
 
@@ -123,11 +122,9 @@ def get_outputs(self, predictor):
         return outputs[0]
 
     def test_output(self):
-        self.enable_pir(True)
-        pir_predictor = self.init_predictor()
+        pir_predictor = self.init_predictor(True)
         pir_output = self.get_outputs(pir_predictor)
-        self.enable_pir(False)
-        predictor = self.init_predictor()
+        predictor = self.init_predictor(False)
         output = self.get_outputs(predictor)
         np.testing.assert_allclose(
             output.numpy().flatten(), pir_output.numpy().flatten()
diff --git a/test/ir/inference/auto_scan_test.py b/test/ir/inference/auto_scan_test.py
index b26725314fb1f9..02bd28d7139f97 100755
--- a/test/ir/inference/auto_scan_test.py
+++ b/test/ir/inference/auto_scan_test.py
@@ -352,13 +352,13 @@ def run_test_config(
         """
         Test a single case.
""" - paddle.set_flags({'FLAGS_enable_pir_in_executor': True}) + pred_config.enable_new_ir(True) pred_config.switch_ir_optim(False) pred_config.enable_new_executor() result = super().run_test_config( model, params, prog_config, pred_config, feed_data ) - paddle.set_flags({'FLAGS_enable_pir_in_executor': False}) + pred_config.enable_new_ir(False) return result diff --git a/test/ir/inference/program_config.py b/test/ir/inference/program_config.py index f3d44361260f94..f64335fc4379e8 100644 --- a/test/ir/inference/program_config.py +++ b/test/ir/inference/program_config.py @@ -346,7 +346,6 @@ def _cast(self) -> None: def create_fake_model(program_config): '''Create a Paddle model(in memory) according to the given config.''' - paddle.set_flags({'FLAGS_enable_pir_in_executor': False}) program_config = copy.deepcopy(program_config) program_config._cast() paddle.enable_static() diff --git a/test/ir/inference/test_inference_predictor_run.py b/test/ir/inference/test_inference_predictor_run.py index 1d8abc174f1cf1..21b095d7974426 100644 --- a/test/ir/inference/test_inference_predictor_run.py +++ b/test/ir/inference/test_inference_predictor_run.py @@ -62,10 +62,7 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() - def enable_pir(self, flag: bool): - paddle.set_flags({'FLAGS_enable_pir_in_executor': flag}) - - def init_predictor(self): + def init_predictor(self, use_pir: bool): config = Config( os.path.join( self.temp_dir.name, @@ -80,6 +77,8 @@ def init_predictor(self): config.switch_ir_optim(False) # config.enable_memory_optim() config.enable_new_executor() + if use_pir: + config.enable_new_ir() predictor = create_predictor(config) return predictor @@ -117,11 +116,9 @@ def get_inorder_output(self, predictor): return outputs[0] def test_output(self): - self.enable_pir(False) - predictor = self.init_predictor() + predictor = self.init_predictor(False) output = self.get_inorder_output(predictor) - self.enable_pir(True) - pir_predictor = self.init_predictor() + pir_predictor = self.init_predictor(True) pir_output = self.get_disorder_output(pir_predictor) np.testing.assert_allclose( diff --git a/test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py b/test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py index 0a9c091f05ee7f..517cd7083288a9 100644 --- a/test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py +++ b/test/ir/pir/cinn/symbolic/test_decomp_inference_predictor_run.py @@ -68,10 +68,7 @@ def setUp(self): def tearDown(self): self.temp_dir.cleanup() - def enable_pir(self, flag: bool): - paddle.set_flags({'FLAGS_enable_pir_in_executor': flag}) - - def init_predictor(self): + def init_predictor(self, use_pir: bool): config = Config( os.path.join( self.temp_dir.name, @@ -86,6 +83,8 @@ def init_predictor(self): config.enable_use_gpu(256, 0) config.switch_ir_optim(False) config.enable_new_executor() + if use_pir: + config.enable_new_ir() predictor = create_predictor(config) return predictor @@ -118,12 +117,11 @@ def get_inorder_output(self, predictor): return outputs[0] def test_output_prim_inorder(self): - self.enable_pir(False) - predictor = self.init_predictor() + predictor = self.init_predictor(False) output = self.get_inorder_output(predictor) - self.enable_pir(True) + paddle.set_flags({'FLAGS_enable_pir_in_executor': True}) paddle.core._set_prim_all_enabled(True) - pir_predictor = self.init_predictor() + pir_predictor = self.init_predictor(True) pir_output = self.get_inorder_output(pir_predictor) paddle.core._set_prim_all_enabled(False) @@ -135,12 
+133,11 @@ def test_output_prim_inorder(self): ) def test_output_prim_disorder(self): - self.enable_pir(False) - predictor = self.init_predictor() + predictor = self.init_predictor(False) output = self.get_disorder_output(predictor) - self.enable_pir(True) + paddle.set_flags({'FLAGS_enable_pir_in_executor': True}) paddle.core._set_prim_all_enabled(True) - pir_predictor = self.init_predictor() + pir_predictor = self.init_predictor(True) pir_output = self.get_disorder_output(pir_predictor) paddle.core._set_prim_all_enabled(False) From a77172c4dae94550a27d4e620f77b7222556ac31 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 1 Mar 2024 23:12:35 +0800 Subject: [PATCH 03/15] Fix tensor_comsumer tensor_consumer,etc (#62213) --- paddle/fluid/pir/drr/src/attr_type_uilts.h | 6 ++--- .../fluid/pir/drr/src/ir_operation_factory.cc | 24 +++++++++---------- paddle/fluid/pir/drr/src/pattern_graph.cc | 20 ++++++++-------- paddle/fluid/pir/drr/src/rewrite_pattern.cc | 2 +- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/pir/drr/src/attr_type_uilts.h b/paddle/fluid/pir/drr/src/attr_type_uilts.h index 02f5a4defc1551..a48ed382a7d19d 100644 --- a/paddle/fluid/pir/drr/src/attr_type_uilts.h +++ b/paddle/fluid/pir/drr/src/attr_type_uilts.h @@ -48,7 +48,7 @@ PD_SPECIALIZE_CppTypeToIrAttribute(phi::IntArray, paddle::dialect::IntArrayAttribute); template -struct IrAttrbuteCreator { +struct IrAttributeCreator { typename CppTypeToIrAttribute::type operator()(T obj) const { return CppTypeToIrAttribute::type::template get( pir::IrContext::Instance(), obj); @@ -56,7 +56,7 @@ struct IrAttrbuteCreator { }; template <> -struct IrAttrbuteCreator> { +struct IrAttributeCreator> { pir::ArrayAttribute operator()(std::vector obj) const { std::vector attr_vec; attr_vec.reserve(obj.size()); @@ -69,7 +69,7 @@ struct IrAttrbuteCreator> { }; template <> -struct IrAttrbuteCreator> { +struct IrAttributeCreator> { pir::ArrayAttribute operator()(std::vector obj) const { std::vector attr_vec; attr_vec.reserve(obj.size()); diff --git a/paddle/fluid/pir/drr/src/ir_operation_factory.cc b/paddle/fluid/pir/drr/src/ir_operation_factory.cc index 61c12c281e1398..bfe97d45592f72 100644 --- a/paddle/fluid/pir/drr/src/ir_operation_factory.cc +++ b/paddle/fluid/pir/drr/src/ir_operation_factory.cc @@ -65,33 +65,33 @@ void OperationFactory::RegisterManualOpCreator() { pir::Attribute CreateIrAttribute(const std::any& obj) { if (obj.type() == typeid(bool)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(int32_t)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(int64_t)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(float)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(std::string)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(const char*)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(phi::DataType)) { - return IrAttrbuteCreator()( + return IrAttributeCreator()( std::any_cast(obj)); } else if (obj.type() == typeid(phi::Place)) { - return IrAttrbuteCreator()(std::any_cast(obj)); + return 
IrAttributeCreator()(std::any_cast(obj)); } else if (obj.type() == typeid(std::vector)) { // NOLINT - return IrAttrbuteCreator>()( + return IrAttributeCreator>()( std::any_cast>(obj)); } else if (obj.type() == typeid(std::vector)) { - return IrAttrbuteCreator>()( + return IrAttributeCreator>()( std::any_cast>(obj)); } else if (obj.type() == typeid(std::vector)) { - return IrAttrbuteCreator>()( + return IrAttributeCreator>()( std::any_cast>(obj)); } else if (obj.type() == typeid(phi::IntArray)) { - return IrAttrbuteCreator()( + return IrAttributeCreator()( std::any_cast(obj)); } else { PADDLE_THROW( diff --git a/paddle/fluid/pir/drr/src/pattern_graph.cc b/paddle/fluid/pir/drr/src/pattern_graph.cc index eccbb30dea8906..be57150ed8ffdd 100644 --- a/paddle/fluid/pir/drr/src/pattern_graph.cc +++ b/paddle/fluid/pir/drr/src/pattern_graph.cc @@ -148,7 +148,7 @@ void GraphTopo::WalkGraphNodesTopoOrder( graph_->input_tensors(); const std::unordered_map> &id2owned_tensor = graph_->id2owned_tensor(); - const std::vector> &owend_opcall = + const std::vector> &owned_opcall = graph_->owned_op_call(); std::queue opcall_queue; @@ -156,7 +156,7 @@ void GraphTopo::WalkGraphNodesTopoOrder( opcall_dependent; // init opcall_dependent - for (const std::shared_ptr &opcall_sptr : owend_opcall) { + for (const std::shared_ptr &opcall_sptr : owned_opcall) { if (opcall_sptr.get()->inputs().empty()) { // opcall inputs is empty opcall_queue.push(opcall_sptr.get()); } else { @@ -174,11 +174,11 @@ void GraphTopo::WalkGraphNodesTopoOrder( "The input tensor [%s] must exists " "in pattern graph to be obtained.", tensor_name)); - for (const auto &tensor_comsumer : + for (const auto &tensor_consumer : id2owned_tensor.at(tensor_name).get()->consumers()) { - opcall_dependent[tensor_comsumer].erase(tensor_name); - if (opcall_dependent[tensor_comsumer].empty()) { - opcall_queue.push(tensor_comsumer); + opcall_dependent[tensor_consumer].erase(tensor_name); + if (opcall_dependent[tensor_consumer].empty()) { + opcall_queue.push(tensor_consumer); } } } @@ -190,10 +190,10 @@ void GraphTopo::WalkGraphNodesTopoOrder( // update opcall_dependent for (const auto &output_tensor : opcall->outputs()) { - for (const auto &tensor_comsumer : output_tensor->consumers()) { - opcall_dependent[tensor_comsumer].erase(output_tensor->name()); - if (opcall_dependent[tensor_comsumer].empty()) { - opcall_queue.push(tensor_comsumer); + for (const auto &tensor_consumer : output_tensor->consumers()) { + opcall_dependent[tensor_consumer].erase(output_tensor->name()); + if (opcall_dependent[tensor_consumer].empty()) { + opcall_queue.push(tensor_consumer); } } } diff --git a/paddle/fluid/pir/drr/src/rewrite_pattern.cc b/paddle/fluid/pir/drr/src/rewrite_pattern.cc index 04390126ddddf8..46b034aca85582 100644 --- a/paddle/fluid/pir/drr/src/rewrite_pattern.cc +++ b/paddle/fluid/pir/drr/src/rewrite_pattern.cc @@ -59,7 +59,7 @@ bool DrrRewritePattern::MatchAndRewrite( if (PatternGraphMatch(op, src_match_ctx.get())) { VLOG(4) << "DRR pattern (" << pattern_name_ << ") is matched in program."; PatternGraphRewrite(*src_match_ctx, rewriter); - VLOG(4) << "DRR pattern (" << pattern_name_ << ") is rewrited in program."; + VLOG(4) << "DRR pattern (" << pattern_name_ << ") is rewritten in program."; return true; } return false; From 78254af04977586d0be32f8129236feefb9663c9 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 1 Mar 2024 23:13:54 +0800 Subject: [PATCH 04/15] Fix Unexpceted Unexpected, etc (#62260) --- .../fast_threaded_ssa_graph_executor.cc | 4 ++-- 
 .../framework/details/fetch_op_handle.cc      |  2 +-
 paddle/fluid/framework/operator.cc            | 10 +++++-----
 paddle/fluid/framework/parallel_executor.cc   | 10 +++++-----
 paddle/fluid/framework/tensor_util.cc         |  8 +++++---
 paddle/fluid/framework/trainer_factory.cc     |  4 ++--
 paddle/fluid/operators/cvm_op.cc              |  2 +-
 paddle/fluid/platform/float16_test.cu         |  2 +-
 .../fluid/prim/api/manual_prim/utils/utils.h  |  6 +++---
 paddle/phi/kernels/prior_box_kernel.h         | 20 +++++++++----------
 10 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 19cf30d24db406..66c62085faed2b 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -49,8 +49,8 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
       /*disable_setting_default_stream_for_allocator=*/true,
       /*stream_priority=*/0);
   if (ir::IsTopologySortOperationsUnique(*graph_)) {
-    VLOG(10)
-        << "Change thread number to 1 because the toposort order is unique";
+    VLOG(10) << "Change thread number to 1 because the topology sort order is "
+                "unique";
     strategy_.num_threads_ = 1;
     traced_ops_.clear();
     for (auto *op_node : TopologySortOperations(*graph_)) {
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 27be4b77176350..25108148af3494 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -39,7 +39,7 @@ FetchOpHandle::~FetchOpHandle() = default;
 
 void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
   PADDLE_THROW(platform::errors::PermissionDenied(
-      "No nodes need to wait FetchOp. Unexpceted Error."));
+      "No nodes need to wait FetchOp. Unexpected Error."));
 }
 
 static void CheckDims(const framework::DDim &tensor_dims,
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 55fc19ad2be1c3..afe442c0a7c6f3 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -2038,7 +2038,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   phi::KernelContext phi_kernel_context;
   if (enable_cache_runtime_context_ && !need_prepare_phi_data_ &&
       !need_prepare_data_) {
-    // TODO(inference): Now we only suppor dense_tensor cache, we may be
+    // TODO(inference): Now we only support dense_tensor cache, we may be
     // support ScalarTensor, SparseTensor in future.
     bool all_dense_tensor_input_{true};
     for (auto& iter : Inputs()) {
@@ -2573,7 +2573,7 @@ Scope* OperatorWithKernel::PrepareData(
         // for some situation like InferShape().
         // In this situation We cannot skip Var analysis, as
         // oneDNN shape of Var may differ from kNHWC Var
-        // In such situation corressponding resized Var
+        // In such situation corresponding resized Var
         // has to be created and registered
         if ((tensor_in->layout() == DataLayout::ONEDNN) &&
             (var->IsType<phi::DenseTensor>() == true) &&
@@ -3193,7 +3193,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
 
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto it = ctx.inputs.find(input_names[i]);
-    // calcute the start and end index of the input tensors
+    // calculate the start and end index of the input tensors
     size_t start_idx =
         (i == 0 ? 0 : phi_kernel_context->InputRangeAt(i - 1).second);
 
    // deal with optional here
@@ -3399,7 +3399,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
         attr_iter,
         Attrs().end(),
         platform::errors::NotFound("(%s) is not found in AttributeMap when "
-                                   "buildind static KernelContext.",
+                                   "building static KernelContext.",
                                    attr_names[i]));
     switch (AttrTypeID(attr_iter->second)) {
       case proto::AttrType::INTS: {
@@ -3473,7 +3473,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
                         RuntimeAttrs().end(),
                         platform::errors::NotFound(
                             "(%s) is not found in AttributeMap when "
-                            "buildind static KernelContext.",
+                            "building static KernelContext.",
                             attr_names[i]));
     }
 
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 897e520813809c..c2b6c37e7dd6e6 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -639,15 +639,15 @@ void InitP2P(const std::vector<platform::Place> &places) {
     for (int i = 0; i < count; ++i) {
       for (int j = 0; j < count; ++j) {
         if (devices[i] == devices[j]) continue;
-        int can_acess = -1;
+        int can_access = -1;
 #ifdef PADDLE_WITH_HIP
         hipError_t ret =
-            hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
-        if (ret != hipSuccess || can_acess != 1) {
+            hipDeviceCanAccessPeer(&can_access, devices[i], devices[j]);
+        if (ret != hipSuccess || can_access != 1) {
 #else
         cudaError_t ret =
-            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
-        if (ret != cudaSuccess || can_acess != 1) {
+            cudaDeviceCanAccessPeer(&can_access, devices[i], devices[j]);
+        if (ret != cudaSuccess || can_access != 1) {
 #endif
           LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
                        << " to " << devices[j];
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index fafde716b7bba7..bd869a05880671 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -710,8 +710,9 @@ void TensorFromStream(std::istream& is,
       PADDLE_THROW(platform::errors::Unimplemented(
           "XPUPlace is not supported when not compiled with XPU"));
     } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "CutomPlace is not supported when not compiled with CustomDevice"));
+      PADDLE_THROW(
+          platform::errors::Unimplemented("CustomPlace is not supported when "
+                                          "not compiled with CustomDevice"));
     }
 #endif
   } else {
@@ -887,7 +888,8 @@ std::ostream& print_tensor(std::ostream& os, const phi::DenseTensor& tensor) {
   auto element_num = tensor.numel();
 
   os << "  - data: [";
-  // Note: int8_t && uint8_t is typedf of char, ostream unable to print properly
+  // Note: int8_t && uint8_t is typedef of char, ostream unable to print
+  // properly
   if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
     if (element_num > 0) {
       os << signed(inspect[0]);
diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc
index ba5dac4830aa18..81b2df6efc723d 100644
--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
@@ -26,8 +26,8 @@ namespace framework {
 
 class TrainerBase;
 
-typedef std::shared_ptr<TrainerBase> (*CreatetrainerFunction)();
-typedef std::unordered_map<std::string, CreatetrainerFunction> trainerMap;
+typedef std::shared_ptr<TrainerBase> (*CreateTrainerFunction)();
+typedef std::unordered_map<std::string, CreateTrainerFunction> trainerMap;
 trainerMap g_trainer_map;
 
 #define REGISTER_TRAINER_CLASS(trainer_class) \
diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc
index 578a59130495ac..1e414ff217c2f1 100644
--- a/paddle/fluid/operators/cvm_op.cc
+++ b/paddle/fluid/operators/cvm_op.cc
@@ -127,7 +127,7 @@ class CVMOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("X",
              "(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
             "[N x D],"
-             " where N is the batch size and D is the emebdding dim. ");
+             " where N is the batch size and D is the embedding dim. ");
     AddInput("CVM",
             "(Tensor),  a 2-D Tensor with shape [N x 2], where N is the batch "
             "size, 2 is show and click.");
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 4575b54d48c9bf..555f83d61675ef 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -282,7 +282,7 @@ TEST(float16, compound_on_gpu) {
   TestDivAssign(6, 2, 3);
 }
 
-TEST(float16, comparision_on_gpu) {
+TEST(float16, comparison_on_gpu) {
   TestEqual(1, 1, true);
   TestEqual(1, 2, false);
   TestNotEqual(2, 3, true);
diff --git a/paddle/fluid/prim/api/manual_prim/utils/utils.h b/paddle/fluid/prim/api/manual_prim/utils/utils.h
index 90a25f8bf1e1fd..f3b21169e57f1a 100644
--- a/paddle/fluid/prim/api/manual_prim/utils/utils.h
+++ b/paddle/fluid/prim/api/manual_prim/utils/utils.h
@@ -29,7 +29,7 @@ namespace prim {
 // We put some api like utils here
 template <typename T>
 Tensor empty(const paddle::experimental::IntArray& shape,
-             phi::DataType dype,
+             phi::DataType dtype,
             const paddle::Place& place);
 
 template <typename T>
 Tensor empty_like(const Tensor& x,
                   phi::DataType dtype,
                   const paddle::Place& place);
 
-// copy tensor for output ptr, in static need use assigh op
+// copy tensor for output ptr, in static need use assign op
 template <typename T>
 void by_pass(const Tensor& x, Tensor* out);
 
@@ -114,7 +114,7 @@ static std::vector<int64_t> unsafe_vector_cast(const std::vector<IN_T>& src) {
   return dst;
 }
 
-// This fucction compute unsqueeze dims for reshape to replace unsqueeze.
+// This function compute unsqueeze dims for reshape to replace unsqueeze.
 static std::vector<int64_t> get_unsqueeze_dims(
     const Tensor& origin, const std::vector<int64_t>& axis) {
   auto origin_dims = origin.shape();
diff --git a/paddle/phi/kernels/prior_box_kernel.h b/paddle/phi/kernels/prior_box_kernel.h
index 45a741c7a3a72b..132efb7b6cc722 100644
--- a/paddle/phi/kernels/prior_box_kernel.h
+++ b/paddle/phi/kernels/prior_box_kernel.h
@@ -35,25 +35,25 @@ void PriorBoxKernel(const Context& ctx,
                     DenseTensor* out,
                     DenseTensor* var);
 
-inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
+inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratio,
                                bool flip,
-                               std::vector<float>* output_aspect_ratior) {
+                               std::vector<float>* output_aspect_ratio) {
   constexpr float epsilon = 1e-6;
-  output_aspect_ratior->clear();
-  output_aspect_ratior->push_back(1.0f);
-  for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
-    float ar = input_aspect_ratior[i];
+  output_aspect_ratio->clear();
+  output_aspect_ratio->push_back(1.0f);
+  for (size_t i = 0; i < input_aspect_ratio.size(); ++i) {
+    float ar = input_aspect_ratio[i];
     bool already_exist = false;
-    for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
-      if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
+    for (size_t j = 0; j < output_aspect_ratio->size(); ++j) {
+      if (fabs(ar - output_aspect_ratio->at(j)) < epsilon) {
         already_exist = true;
         break;
       }
     }
     if (!already_exist) {
-      output_aspect_ratior->push_back(ar);
+      output_aspect_ratio->push_back(ar);
       if (flip) {
-        output_aspect_ratior->push_back(1.0f / ar);
+        output_aspect_ratio->push_back(1.0f / ar);
       }
     }
   }

From 317fad13a6d7cfcebd69405ad8a9c5561b117daf Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 1 Mar 2024 23:15:22 +0800
Subject: [PATCH 05/15] Fix maxinum maximum, etc (#62290)

---
 paddle/phi/kernels/bmm_kernel.h                |  2 +-
 .../kernels/xpu/instance_norm_grad_kernel.cc   |  2 +-
 paddle/phi/kernels/xpu/inverse_kernel.cc       |  2 +-
 .../phi/kernels/xpu/multiclass_nms3_kernel.cc  |  2 +-
 paddle/phi/kernels/xpu/prelu_grad_kernel.cc    |  4 +--
 .../phi/kernels/xpu/reduce_max_grad_kernel.cc  | 30 +++++++++----------
 .../phi/kernels/xpu/reduce_min_grad_kernel.cc  | 30 +++++++++----------
 paddle/phi/kernels/xpu/rnn_util.h              |  2 +-
 .../phi/kernels/xpu/set_value_grad_kernel.cc   |  2 +-
 paddle/phi/kernels/xpu/set_value_kernel.cc     |  2 +-
 10 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/paddle/phi/kernels/bmm_kernel.h b/paddle/phi/kernels/bmm_kernel.h
index 09e7f9647b68eb..6d3733bf750d3f 100644
--- a/paddle/phi/kernels/bmm_kernel.h
+++ b/paddle/phi/kernels/bmm_kernel.h
@@ -22,7 +22,7 @@ namespace phi {
  * @brief Bmm Kernel.
  *        Applies batched matrix multiplication to two tensors.
  *
- *        Both of the two input tensors must be three-dementional
+ *        Both of the two input tensors must be three-dimensional
  *        and share the same batch size.
  *        if x is a (b, m, k) tensor, y is a (b, k, n) tensor,
 *         the output will be a (b, m, n) tensor.
diff --git a/paddle/phi/kernels/xpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/instance_norm_grad_kernel.cc index dba0e2ccfd7651..f1a217ed81ad35 100644 --- a/paddle/phi/kernels/xpu/instance_norm_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/instance_norm_grad_kernel.cc @@ -39,7 +39,7 @@ void InstanceNormGradKernel(const Context& dev_ctx, true, phi::errors::InvalidArgument( "The size of input's dimensions should be less equal than 5", - "and the dimension of D should be eaual to 1", + "and the dimension of D should be equal to 1", "But received: the size of input's dimensions is [%d]", x_dims.size())); diff --git a/paddle/phi/kernels/xpu/inverse_kernel.cc b/paddle/phi/kernels/xpu/inverse_kernel.cc index a48baa508ade00..966fcc97e0ab09 100644 --- a/paddle/phi/kernels/xpu/inverse_kernel.cc +++ b/paddle/phi/kernels/xpu/inverse_kernel.cc @@ -41,7 +41,7 @@ void InverseKernel(const Context& dev_ctx, 8192, phi::errors::InvalidArgument( "The size of a single matrix (%d bytes) exceeds the " - "maxinum numbers of bytes xpu supports (8192).", + "maximum numbers of bytes xpu supports (8192).", n * n * sizeof(T))); auto RAII_GUARD = xpu::ctx_guard(dev_ctx.x_context()); auto* info_xpu = RAII_GUARD.alloc_l3_or_gm(batch); diff --git a/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc b/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc index 17746e4eeff0af..2f343ccc6b494e 100644 --- a/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc +++ b/paddle/phi/kernels/xpu/multiclass_nms3_kernel.cc @@ -90,7 +90,7 @@ void MultiClassNMSKernel(const Context& ctx, PADDLE_ENFORCE_EQ( boxes_count == score_dims[0], true, - phi::errors::InvalidArgument("boxes_count shuold equal score_dims[0].", + phi::errors::InvalidArgument("boxes_count should equal score_dims[0].", "But received: (%d) and (%d)", boxes_count, score_dims[0])); diff --git a/paddle/phi/kernels/xpu/prelu_grad_kernel.cc b/paddle/phi/kernels/xpu/prelu_grad_kernel.cc index fa43c908837664..b7c2157d55f43e 100644 --- a/paddle/phi/kernels/xpu/prelu_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/prelu_grad_kernel.cc @@ -60,9 +60,9 @@ void PReluGradKernel(const Context& dev_ctx, } } - // mode = 0: channel_nchw, slope_shape = {c}, default. meanwhile, xhsape = {n, + // mode = 0: channel_nchw, slope_shape = {c}, default. 
meanwhile, xshape = {n, // c, h, w} - // mode = 1, channel_nhwc, slope_shape = {c}, meanwhile, xhsape = {n, h, w, c} + // mode = 1, channel_nhwc, slope_shape = {c}, meanwhile, xshape = {n, h, w, c} // mode = 2, elementwise, slope_shape = {c*h*w} // mode = 3, single slope, slope_shape = {1} diff --git a/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc index 846250c0677406..aa8736d84b71f0 100644 --- a/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_max_grad_kernel.cc @@ -60,23 +60,23 @@ void ReduceMaxGradKernel(const Context& dev_ctx, } } - T* brocast1 = nullptr; - T* brocast2 = nullptr; + T* broadcast1 = nullptr; + T* broadcast2 = nullptr; bool* equal = nullptr; xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - brocast1 = RAII_GUARD.alloc_l3_or_gm(x.numel()); + broadcast1 = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( - brocast1, errors::ResourceExhausted("XPU has no enough memory")); + broadcast1, errors::ResourceExhausted("XPU has no enough memory")); equal = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( equal, errors::ResourceExhausted("XPU has no enough memory")); - brocast2 = RAII_GUARD.alloc_l3_or_gm(x.numel()); + broadcast2 = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( - brocast2, errors::ResourceExhausted("XPU has no enough memory")); + broadcast2, errors::ResourceExhausted("XPU has no enough memory")); // use [1] to replace [], because xpu not support [] if (xdims.size() == 0) { @@ -86,25 +86,25 @@ void ReduceMaxGradKernel(const Context& dev_ctx, ydims = std::vector({1}); } - // step 1. brocast out and out_grad - int r = - xpu::broadcast(dev_ctx.x_context(), out_data, brocast1, ydims, xdims); + // step 1. broadcast out and out_grad + int r = xpu::broadcast( + dev_ctx.x_context(), out_data, broadcast1, ydims, xdims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast"); r = xpu::broadcast( - dev_ctx.x_context(), out_grad_data, brocast2, ydims, xdims); + dev_ctx.x_context(), out_grad_data, broadcast2, ydims, xdims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast"); - // step 2. comparse out_brocast and x - r = xpu::equal(dev_ctx.x_context(), x_data, brocast1, equal, x.numel()); + // step 2. compare out_broadcast and x + r = xpu::equal(dev_ctx.x_context(), x_data, broadcast1, equal, x.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "equal"); // step 3. 
get x_grad - r = xpu::constant(dev_ctx.x_context(), brocast1, x.numel(), 0); + r = xpu::constant(dev_ctx.x_context(), broadcast1, x.numel(), 0); PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); r = xpu::select(dev_ctx.x_context(), equal, - brocast2, - brocast1, + broadcast2, + broadcast1, x_grad_data, xdims, xdims); diff --git a/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc index 9019cb0834d72e..aefcc74b450919 100644 --- a/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_min_grad_kernel.cc @@ -60,23 +60,23 @@ void ReduceMinGradKernel(const Context& dev_ctx, } } - T* brocast1 = nullptr; - T* brocast2 = nullptr; + T* broadcast1 = nullptr; + T* broadcast2 = nullptr; bool* equal = nullptr; xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - brocast1 = RAII_GUARD.alloc_l3_or_gm(x.numel()); + broadcast1 = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( - brocast1, errors::ResourceExhausted("XPU has no enough memory")); + broadcast1, errors::ResourceExhausted("XPU has no enough memory")); equal = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( equal, errors::ResourceExhausted("XPU has no enough memory")); - brocast2 = RAII_GUARD.alloc_l3_or_gm(x.numel()); + broadcast2 = RAII_GUARD.alloc_l3_or_gm(x.numel()); PADDLE_ENFORCE_NOT_NULL( - brocast2, errors::ResourceExhausted("XPU has no enough memory")); + broadcast2, errors::ResourceExhausted("XPU has no enough memory")); // use [1] to replace [], because xpu not support [] if (xdims.size() == 0) { @@ -86,25 +86,25 @@ void ReduceMinGradKernel(const Context& dev_ctx, ydims = std::vector({1}); } - // step 1. brocast out and out_grad - int r = - xpu::broadcast(dev_ctx.x_context(), out_data, brocast1, ydims, xdims); + // step 1. broadcast out and out_grad + int r = xpu::broadcast( + dev_ctx.x_context(), out_data, broadcast1, ydims, xdims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast"); r = xpu::broadcast( - dev_ctx.x_context(), out_grad_data, brocast2, ydims, xdims); + dev_ctx.x_context(), out_grad_data, broadcast2, ydims, xdims); PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast"); - // step 2. comparse out_brocast and x - r = xpu::equal(dev_ctx.x_context(), x_data, brocast1, equal, x.numel()); + // step 2. compare out_broadcast and x + r = xpu::equal(dev_ctx.x_context(), x_data, broadcast1, equal, x.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "equal"); // step 3. get x_grad - r = xpu::constant(dev_ctx.x_context(), brocast1, x.numel(), 0); + r = xpu::constant(dev_ctx.x_context(), broadcast1, x.numel(), 0); PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); r = xpu::select(dev_ctx.x_context(), equal, - brocast2, - brocast1, + broadcast2, + broadcast1, x_grad_data, xdims, xdims); diff --git a/paddle/phi/kernels/xpu/rnn_util.h b/paddle/phi/kernels/xpu/rnn_util.h index 5310b35e64dc36..7948bb2defa0ca 100644 --- a/paddle/phi/kernels/xpu/rnn_util.h +++ b/paddle/phi/kernels/xpu/rnn_util.h @@ -23,7 +23,7 @@ void ResetParameterVector(const std::vector& raw_params_vec, const int& num_layers, const bool& is_bidirec, std::vector>* params_vec) { - // the parameter raw seuquence is [FWhi, FWhh, BWhi, BWhh] * num_layers + // the parameter raw sequence is [FWhi, FWhh, BWhi, BWhh] * num_layers // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers const int& direction_num = is_bidirec ? 
2 : 1; diff --git a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc index c5d33ae4ac8d06..227d6b39c9f281 100644 --- a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc @@ -203,7 +203,7 @@ void SetValueGradImpl(const Context& dev_ctx, auto value_grad_dims = value_grad->dims(); auto fake_value_grad_dims = out_dims; - // Create an extented shape according to the rules of broadcast. + // Create an extended shape according to the rules of broadcast. auto value_grad_dims_size = value_grad_dims.size(); int num_decrease = 0; diff --git a/paddle/phi/kernels/xpu/set_value_kernel.cc b/paddle/phi/kernels/xpu/set_value_kernel.cc index c457a6d21fd8a1..60b0fff7d9d7c8 100644 --- a/paddle/phi/kernels/xpu/set_value_kernel.cc +++ b/paddle/phi/kernels/xpu/set_value_kernel.cc @@ -263,7 +263,7 @@ void SetValueKernelImpl(const Context& dev_ctx, const std::vector& decrease_axes, const std::vector& none_axes, DenseTensor* out) { - // rank是xtensor的维度信息 + // rank是x tensor的维度信息 const int rank = x.dims().size(); switch (rank) { From 13d74009555434d6327a00a01aee68fc111c14bb Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 1 Mar 2024 23:17:04 +0800 Subject: [PATCH 06/15] Update kernel_backward.h (#62288) --- .../fusion/cutlass/memory_efficient_attention/kernel_backward.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h index 31ce0bd3574ee9..2bd3ac2db5f5b7 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/kernel_backward.h @@ -492,8 +492,6 @@ struct AttentionBackwardKernel { scalar_t, // ElementC accum_t // ElementAccumulator >; - static constexpr auto kOptimalAlignement = - std::max(DefaultConfig::kAlignmentA, DefaultConfig::kAlignmentB); static constexpr auto kMinimumAlignment = GemmType::kMinimumAlignment; struct MatmulQK { From 06d3a5de0321e2d23787a1a6ea1e4572e294585b Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Sat, 2 Mar 2024 04:32:36 +0800 Subject: [PATCH 07/15] Fix copy *.h on paddle/pir dir introduced from PR#61863 (#62293) --- python/setup.py.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py.in b/python/setup.py.in index 9fd352ddd26be0..3ba1dc05e4976d 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -876,7 +876,7 @@ headers = ( # init headers list(find_files('init_phi.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform')) + # phi init headers # init headers - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pir/include')) + # pir init headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pir/include', recursive=True)) + # pir init headers # init headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/pir/drr/include')) + # drr init headers # init headers From cbe8810bbea29c28cc99ccd764134dd30fb61e84 Mon Sep 17 00:00:00 2001 From: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Date: Sat, 2 Mar 2024 08:19:07 +0800 Subject: [PATCH 08/15] [PIR][DynamicShape] Fix bug in slice op's InferSymbolicShape (#62247) * Fix bug in slice op's InferSymbolicShape * add more tests * fix ci --- .../infer_symbolic_shape/infer_sym_utils.cc | 11 + .../infer_symbolic_shape/infer_sym_utils.h | 8 + .../paddle_op_infer_sym.cc | 241 +++++++++++------- .../shape_dialect/shape_optimization_test.cc | 8 
+- .../cinn/symbolic/test_op_infer_sym_shape.py | 58 +++++ 5 files changed, 231 insertions(+), 95 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc index 4e5f5df08732a0..5675429b5c65f2 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc @@ -16,6 +16,17 @@ namespace paddle::dialect::details { +std::optional> VecExpr2Int64(const ExprVec &expr_vec) { + std::vector int64vec; + for (auto item : expr_vec) { + if (!item.isa()) { + return std::nullopt; + } + int64vec.push_back(item.Get()); + } + return int64vec; +} + bool ReduceInferDim(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis, const std::vector &axis, diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h index 8a14e40e6337af..d2d508ff5890db 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h @@ -17,6 +17,12 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" +// To make codes shorter +using ExprVec = std::vector; +using ShapeOrData = symbol::ShapeOrDataDimExprs; +using TensorExprs = symbol::TensorShapeOrDataDimExprs; +using TensorListExprs = symbol::TensorListShapeOrDataDimExprs; + namespace paddle::dialect::details { template struct AttributeTrait; @@ -60,6 +66,8 @@ std::vector GetVectorAttr(const ::pir::Operation *op, return vec_res; } +std::optional> VecExpr2Int64(const ExprVec &expr_vec); + bool ReduceInferDim(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis, const std::vector &axis, diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index d95f1095635184..1be26c82f4c21a 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -19,11 +19,6 @@ namespace paddle::dialect { -// To make codes shorter -using ShapeOrData = symbol::ShapeOrDataDimExprs; -using TensorExprs = symbol::TensorShapeOrDataDimExprs; -using TensorListExprs = symbol::TensorListShapeOrDataDimExprs; - bool DataOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { const auto &attributes = op->attributes(); @@ -270,9 +265,104 @@ bool FullIntArrayOpInferSymbolicShape( return true; } +inline void CheckAndUpdateSliceAttrs( + const ExprVec &in_dims, + const std::vector &axes, + ExprVec *starts_p, + ExprVec *ends_p, + std::vector *infer_flags = nullptr) { + auto vec_int64 = details::VecExpr2Int64(*starts_p); + IR_ENFORCE(vec_int64.has_value(), + "for slice op, all the elements in `starts` must be int64_t"); + std::vector starts_int = vec_int64.value(); + + vec_int64 = details::VecExpr2Int64(*ends_p); + IR_ENFORCE(vec_int64.has_value(), + "for slice op, all the elements in `ends` must be int64_t"); + std::vector ends_int = vec_int64.value(); + + ExprVec &starts = *starts_p; + ExprVec &ends = *ends_p; + auto IsMaxInt = [](const 
symbol::DimExpr &expr) {
+    return expr.isa<int64_t>() &&
+           expr.Get<int64_t>() ==
+               static_cast<int64_t>(std::numeric_limits<int64_t>::max());
+  };
+
+  for (size_t i = 0; i < axes.size(); ++i) {
+    int64_t axis = axes[i];
+
+    if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
+      PADDLE_THROW(
+          phi::errors::Unimplemented("SliceOpInferSymbolicShape CAN NOT "
+                                     "deal with -1 in infer_flags now"));
+    }
+
+    // Since both start and end can be negative or positive, we need to
+    // handle the following different arrangements.
+    ends[i] = IsMaxInt(ends[i]) ? in_dims[axis] : ends[i];
+
+    bool both_negative_or_positive =
+        (starts_int[i] >= 0 && ends_int[i] >= 0) ||
+        (starts_int[i] <= 0 && ends_int[i] <= 0);
+    bool start_negative_end_positive = starts_int[i] <= 0 && ends_int[i] >= 0;
+    bool start_positive_end_negative = starts_int[i] >= 0 && ends_int[i] <= 0;
+
+    if (both_negative_or_positive) {
+      continue;
+    } else if (start_negative_end_positive) {
+      starts[i] = starts[i] + in_dims[axis];
+    } else if (start_positive_end_negative) {
+      starts[i] = starts[i] - in_dims[axis];
+    } else {
+      LOG(FATAL) << "Dead code";
+    }
+  }
+}
+// e.g. for in_dims = [S0], axes = [0], starts = [-3], ends = [3], the start
+// is normalized to S0 - 3 here, and GetSliceDims below yields 3 - (S0 - 3).
+
+inline ExprVec GetSliceDims(const ExprVec &in_dims,
+                            const std::vector<int64_t> &axes,
+                            const ExprVec &starts,
+                            const ExprVec &ends,
+                            std::vector<int64_t> *infer_flags = nullptr) {
+  ExprVec slice_dims(in_dims);
+
+  for (size_t i = 0; i < axes.size(); ++i) {
+    int64_t axis = axes[i];
+
+    if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
+      PADDLE_THROW(
+          phi::errors::Unimplemented("SliceOpInferSymbolicShape CAN NOT "
+                                     "deal with -1 in infer_flags now"));
+    }
+
+    slice_dims[axis] = ends[i] - starts[i];
+  }
+
+  return slice_dims;
+}
+
+inline ExprVec GetDecreasedDims(const ExprVec &slice_dims,
+                                const std::vector<int64_t> &decrease_axes) {
+  ExprVec decreased_dims(slice_dims);
+  std::vector<uint8_t> decrease_flag(slice_dims.size(), 0);
+  if (decrease_axes.size() > 0) {
+    for (size_t i = 0; i < decrease_axes.size(); ++i) {
+      int64_t axis = decrease_axes[i];
+      decrease_flag[axis] = 1;
+    }
+    ExprVec new_shape;
+    for (size_t i = 0; i < slice_dims.size(); ++i) {
+      if (decrease_flag[i] == 0) {
+        new_shape.emplace_back(slice_dims[i]);
+      }
+    }
+    decreased_dims = new_shape;
+  }
+  return decreased_dims;
+}
+
 bool SliceOpInferSymbolicShape(pir::Operation *op,
                                pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  // TODO(zhangbopd): Not implemented yet.
   pir::Value operand_source = op->operand_source(0);
   pir::Value operand_starts = op->operand_source(1);
   pir::Value operand_ends = op->operand_source(2);
@@ -285,107 +375,76 @@ bool SliceOpInferSymbolicShape(pir::Operation *op,
   const symbol::ShapeOrDataDimExprs &ends_shape_data =
       shape_analysis->GetShapeOrDataForValue(operand_ends);
 
-  // Currently, we DO NOT support the case that any element in `axes` `starts`
-  // or `ends` is a Symbol.
   const std::vector<int64_t> axes = [&] {
-    const auto &attributes = op->attributes();
-    pir::Attribute attr_axes = attributes.at("axes");
-
-    const auto &axes_vec = attr_axes.dyn_cast<pir::ArrayAttribute>().AsVector();
-    std::vector<int64_t> axes;
+    std::vector<int64_t> axes_vec = details::GetVectorAttr<int64_t>(op, "axes");
     int64_t rank = int64_t(operand_shape_or_data.shape().size());
-    for (auto item : axes_vec) {
-      int64_t axis = item.dyn_cast<pir::Int64Attribute>().data();
-      axes.emplace_back(axis >= 0 ? axis : std::max(int64_t(0), axis + rank));
+    for (size_t i = 0; i < axes_vec.size(); i++) {
+      int64_t axis = axes_vec[i];
+      axes_vec[i] = axis >= 0 ? axis : std::max(int64_t(0), axis + rank);
     }
-    return axes;
+    return axes_vec;
   }();
 
-  const std::vector<int64_t> starts = [&] {
-    std::vector<int64_t> starts;
-    for (auto item : starts_shape_data.data().value()) {
-      IR_ENFORCE(item.isa<int64_t>(),
-                 "Currently, we DO NOT support the case that any element in "
-                 "`starts` is a Symbol.");
-      starts.push_back(item.Get<int64_t>());
-    }
-    return starts;
-  }();
+  // Currently, we DO NOT support the case where any element in `starts` is a
+  // Symbol.
+  ExprVec starts = starts_shape_data.data().value();
+  ExprVec ends = ends_shape_data.data().value();
 
-  const std::vector<int64_t> ends = [&] {
-    std::vector<int64_t> ends;
-    for (auto item : ends_shape_data.data().value()) {
-      IR_ENFORCE(item.isa<int64_t>(),
-                 "Currently, we DO NOT support the case that any element in "
-                 "`ends` is a Symbol.");
-      ends.push_back(item.Get<int64_t>());
+  std::vector<int64_t> infer_flags = [op, &axes] {
+    std::vector<int64_t> infer_flags_t =
+        details::GetVectorAttr<int64_t>(op, "infer_flags");
+    if (infer_flags_t.empty()) {
+      infer_flags_t = std::vector<int64_t>(axes.size(), 1);
     }
-    return ends;
+    return infer_flags_t;
   }();
 
-  // When `pd.slice` is operating on a tensor which is produced by a `pd.shape`
-  // op, the reseult should be written into data.
-  const auto &GetDataDimExprs = [&]() -> symbol::ShapeOrDataDimExprs {
-    const std::vector<symbol::DimExpr> out_data = [&] {
-      std::vector<symbol::DimExpr> out_data;
-      const int64_t start =
-          starts[0] < 0
-              ? starts[0] + operand_shape_or_data.data().value().size()
-              : starts[0];
-      const int64_t end =
-          static_cast<int64_t>(std::numeric_limits<int64_t>::max()) == ends[0]
-              ? operand_shape_or_data.data().value().size()
-              : ends[0];
-
-      for (int64_t i = start; i < end; i++) {
-        out_data.push_back(operand_shape_or_data.data().value()[i]);
-      }
-      return out_data;
-    }();
-    const std::vector<symbol::DimExpr> shape{std::int64_t(out_data.size())};
-    return symbol::ShapeOrDataDimExprs{
-        symbol::TensorShapeOrDataDimExprs(shape, out_data)};
-  };
+  const std::vector<int64_t> decrease_axis =
+      details::GetVectorAttr<int64_t>(op, "decrease_axis");
 
-  // Othewise, the reseult should be written into the shape.
   const auto &GetShapeDimExprs = [&]() -> symbol::ShapeOrDataDimExprs {
-    std::vector<symbol::DimExpr> out_shape = operand_shape_or_data.shape();
+    const ExprVec &in_dims = operand_shape_or_data.shape();
+    CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &infer_flags);
+    ExprVec slice_dims =
+        GetSliceDims(in_dims, axes, starts, ends, &infer_flags);
+    ExprVec out_dims = GetDecreasedDims(slice_dims, decrease_axis);
 
-    const std::vector<symbol::DimExpr> &dim_expr_starts =
-        starts_shape_data.data().value();
-    const std::vector<symbol::DimExpr> &dim_expr_ends =
-        ends_shape_data.data().value();
+    return symbol::ShapeOrDataDimExprs{
+        symbol::TensorShapeOrDataDimExprs(out_dims)};
+  };
 
-    // For both start and end can be negtive or positive, we need to handle the
-    // following different arrangements.
-    auto IsMaxInt = [](const symbol::DimExpr &expr) {
-      return expr.isa<int64_t>() &&
-             expr.Get<int64_t>() ==
-                 static_cast<int64_t>(std::numeric_limits<int64_t>::max());
-    };
-    for (size_t i = 0; i < axes.size(); ++i) {
-      const int64_t axis = axes[i];
-      auto end =
-          IsMaxInt(dim_expr_ends[i]) ? out_shape[axis] : dim_expr_ends[i];
-
-      bool both_negative_or_positive =
-          (starts[i] >= 0 && ends[i] >= 0) || (starts[i] <= 0 && ends[i] <= 0);
-      bool start_negative_end_positive = starts[i] <= 0 && ends[i] >= 0;
-      bool start_positive_end_negative = starts[i] >= 0 && ends[i] <= 0;
-
-      if (both_negative_or_positive) {
-        out_shape[axis] = end - dim_expr_starts[i];
-      } else if (start_negative_end_positive) {
-        out_shape[axis] = end - dim_expr_starts[i] - out_shape[axis];
-      } else if (start_positive_end_negative) {
-        out_shape[axis] = out_shape[axis] - dim_expr_starts[i] + end;
-      } else {
-        LOG(FATAL) << "Dead code";
-      }
+  // When `pd.slice` is operating on a tensor which is produced by a `pd.shape`
+  // op, the result should be written into data.
+  const auto &GetDataDimExprs = [&]() -> symbol::ShapeOrDataDimExprs {
+    std::vector<symbol::DimExpr> out_data;
+
+    // Currently, we DO NOT support the case that any element in `axes`,
+    // `starts`, or `ends` is a Symbol.
+    auto vec_int64 = details::VecExpr2Int64(starts);
+    IR_ENFORCE(vec_int64.has_value(),
+               "for slice op, all the elements in `starts` must be int64_t");
+    std::vector<int64_t> starts_int = vec_int64.value();
+
+    vec_int64 = details::VecExpr2Int64(ends);
+    IR_ENFORCE(vec_int64.has_value(),
+               "for slice op, all the elements in `ends` must be int64_t");
+    std::vector<int64_t> ends_int = vec_int64.value();
+
+    const int64_t start =
+        starts_int[0] < 0
+            ? starts_int[0] + operand_shape_or_data.data().value().size()
+            : starts_int[0];
+    const int64_t end =
+        static_cast<int64_t>(std::numeric_limits<int64_t>::max()) == ends_int[0]
+            ? operand_shape_or_data.data().value().size()
+            : ends_int[0];
+
+    for (int64_t i = start; i < end; i++) {
+      out_data.push_back(operand_shape_or_data.data().value()[i]);
     }
 
+    const std::vector<symbol::DimExpr> shape{std::int64_t(out_data.size())};
     return symbol::ShapeOrDataDimExprs{
-        symbol::TensorShapeOrDataDimExprs(out_shape)};
+        symbol::TensorShapeOrDataDimExprs(shape, out_data)};
   };
 
   symbol::ShapeOrDataDimExprs shape_data =
diff --git a/test/cpp/pir/shape_dialect/shape_optimization_test.cc b/test/cpp/pir/shape_dialect/shape_optimization_test.cc
index b48f84db4d1b80..faefec6e7ec416 100644
--- a/test/cpp/pir/shape_dialect/shape_optimization_test.cc
+++ b/test/cpp/pir/shape_dialect/shape_optimization_test.cc
@@ -122,10 +122,10 @@ TEST(shape_optimization, shape_optimization_pass) {
             "Mul(Mul(Mul(Mul(1, S1), 128), 32), 1 / (128))");
   EXPECT_EQ(cast_res.shape()[3], 2);
 
-  EXPECT_EQ(symbol::ToString(relu_res.shape()[0]), "Add(Add(S2, -2), -2)");
-  EXPECT_EQ(symbol::ToString(relu_res.shape()[1]), "Add(Add(S3, -2), -2)");
-  EXPECT_EQ(symbol::ToString(relu_res.shape()[2]), "Add(Add(S4, -2), -2)");
-  EXPECT_EQ(symbol::ToString(relu_res.shape()[3]), "Add(Add(S5, -2), -2)");
+  EXPECT_EQ(symbol::ToString(relu_res.shape()[0]), "Add(-2, -Add(2, -S2))");
+  EXPECT_EQ(symbol::ToString(relu_res.shape()[1]), "Add(-2, -Add(2, -S3))");
+  EXPECT_EQ(symbol::ToString(relu_res.shape()[2]), "Add(-2, -Add(2, -S4))");
+  EXPECT_EQ(symbol::ToString(relu_res.shape()[3]), "Add(-2, -Add(2, -S5))");
 
   EXPECT_EQ(subtract_res.shape()[0], 1);
   EXPECT_EQ(subtract_res.shape()[1], 64);
diff --git a/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py b/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py
index 61ca48f19d797c..4ab27bf657eac9 100644
--- a/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py
+++ b/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py
@@ -459,5 +459,63 @@ def test_eval_symbolic(self):
         return True
 
 
+class SliceNet(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, 
x): + out = x[:, -1, :] + out = x[1:3, 0:2, 2:4] + + axes = [0, 1, 2] + starts = [-3, 0, 2] + ends = [3, 2, 4] + out = paddle.slice(x, axes=axes, starts=starts, ends=ends) + + return out + + +class TestSliceOpInferSymbolicShape(TestBase): + def prepare_data(self): + self.cases = [np.random.rand(4, 5, 6)] + + self.expected = [ + [ + 'shape[S0, S2], data[NULL]', + 'shape[2, 2, 2], data[NULL]', + 'shape[Add(3, -Add(-3, S0)), 2, 2]', + ] + ] + + def test_eval_symbolic(self): + net = SliceNet() + + for i in range(len(self.cases)): + x = self.cases[i] + x_spec = InputSpec( + shape=[None for index in range(len(x.shape))], dtype='float32' + ) + + input_spec = [x_spec] + net = apply_to_static(net, False, input_spec) + net.eval() + + # check the infer result + sym_shape_str_list = get_sym_shape_str_for_op( + net, input_spec, 'pd_op.slice' + ) + np.testing.assert_equal( + len(sym_shape_str_list), len(self.expected[i]) + ) + for j in range(len(sym_shape_str_list)): + np.testing.assert_equal( + sym_shape_str_list[j].find(self.expected[i][j]), + 0, + f'in case i,j = {i},{j}: output shape ({sym_shape_str_list[0]}) is not expected {(self.expected[i][j])}', + ) + + return True + + if __name__ == '__main__': unittest.main() From f445bd8d31a8dc283d63dc282dc09082bf77a059 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Sat, 2 Mar 2024 08:48:30 +0800 Subject: [PATCH 09/15] [DRR]Fix SegmentFault for BlockArgument while applying pass in Llama2 infer (#62283) * [DRR]Fix SegmentFault for BlockArgument while applying pass in Llama2 infer * fix typo --- paddle/fluid/pir/drr/src/rewrite_pattern.cc | 137 ++++++++++++-------- 1 file changed, 85 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/pir/drr/src/rewrite_pattern.cc b/paddle/fluid/pir/drr/src/rewrite_pattern.cc index 46b034aca85582..e19d5ae224c7d3 100644 --- a/paddle/fluid/pir/drr/src/rewrite_pattern.cc +++ b/paddle/fluid/pir/drr/src/rewrite_pattern.cc @@ -258,95 +258,128 @@ bool DrrRewritePattern::MatchFromOutputToInput( std::unordered_set ir_visited; std::queue drr_q; std::queue ir_q; - bool matched = true; - size_t step = 0; - for (auto it = output_op_map.begin(); it != output_op_map.end(); ++it) { - VLOG(6) << "match (" << it->first->name() << " @" << it->first << " : @" - << it->second << ") in source_pattern_graph "; - drr_q.push(it->first); - drr_visited.insert(it->first); - ir_q.push(it->second); - ir_visited.insert(it->second); - } - while (!drr_q.empty()) { - if (!matched) break; - auto* drr_node = drr_q.front(); - auto* ir_node = ir_q.front(); - drr_q.pop(); - ir_q.pop(); + // Initialize DRR matched queue. + const auto& InitDrrQueue = [&]() -> void { + for (auto it = output_op_map.begin(); it != output_op_map.end(); ++it) { + VLOG(6) << "match (" << it->first->name() << " @" << it->first << " : @" + << it->second << ") in source_pattern_graph "; + drr_q.push(it->first); + drr_visited.insert(it->first); + ir_q.push(it->second); + ir_visited.insert(it->second); + } + }; + // Check whether DrrNode and Operation have the same Operands and Results + // information. 
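+  // (These cheap name/arity checks let the matcher fail fast before the
+  // more expensive operand-by-operand walk further below.)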
+  const auto& IsSameOperandsAndResults =
+      [](const OpCall* drr_node, const pir::Operation* ir_node) -> bool {
     if (drr_node->name() != ir_node->name()) {
-      matched = false;
       VLOG(8) << "Match failed: drr_node(" << drr_node->name()
               << ") != pir_node(" << ir_node->name() << ").";
-      break;
+      return false;
     }
     const auto& drr_input_tensors = drr_node->inputs();
     auto ir_input_value_size = ir_node->num_operands();
     if (drr_input_tensors.size() != ir_input_value_size) {
-      matched = false;
       VLOG(8) << drr_node->name() << " Match failed: drr input tensors("
               << drr_input_tensors.size() << ") != pir input tensors("
               << ir_input_value_size << ").";
-      break;
+      return false;
     }
     if (drr_node->outputs().size() != ir_node->num_results()) {
-      matched = false;
       VLOG(8) << drr_node->name() << " Match failed: drr output tensors("
               << drr_node->outputs().size() << ") != pir output tensors("
               << ir_node->num_results() << ").";
+      return false;
+    }
+    return true;
+  };
+  // Check whether source_pattern_match_ctx has already visited the
+  // Operation's operands.
+  const auto& HasVisitedOperands = [&](const Tensor* drr_input_tensor,
+                                       pir::Value ir_value) -> bool {
+    const auto& tensor_name = drr_input_tensor->name();
+    if (ir_value.isa<pir::BlockArgument>()) {
+      VLOG(8) << "Match Attention! Found BlockArgument as input of "
+              << tensor_name;
+    }
+    return source_pattern_match_ctx->tensor_map().count(tensor_name) != 0 &&
+           ir_value != source_pattern_match_ctx->tensor_map().at(tensor_name);
+  };
+  // Update drr_q and the visited sets. Return false if the update fails.
+  const auto& TryUpdateDrrQueue = [&](const OpCall* drr_producer_op,
+                                      pir::Operation* ir_producer_op) -> bool {
+    // Still return true if both have already been visited.
+    if (drr_visited.count(drr_producer_op) &&
+        ir_visited.count(ir_producer_op)) {
+      return true;
+    }
+    // Push and mark both if neither has been visited yet.
+    if (!drr_visited.count(drr_producer_op) &&
+        !ir_visited.count(ir_producer_op)) {
+      drr_q.push(drr_producer_op);
+      ir_q.push(ir_producer_op);
+      drr_visited.insert(drr_producer_op);
+      ir_visited.insert(ir_producer_op);
+      return true;
+    }
+    return false;
+  };
+
+  // Initialize the DRR matched queue.
+  bool matched = true;
+  size_t step = 0;
+  InitDrrQueue();
+
+  while (!drr_q.empty()) {
+    if (!matched) break;
+    auto* drr_node = drr_q.front();
+    auto* ir_node = ir_q.front();
+    drr_q.pop();
+    ir_q.pop();
+    if (!IsSameOperandsAndResults(drr_node, ir_node)) {
+      matched = false;
       break;
     }
 
+    // Step 1: Bind Operation of current op to match_ctx.
     source_pattern_match_ctx->BindIrOperation(drr_node, ir_node);
 
-    // binding input_tensor of current_op
+    // Step 2: Bind input_tensor of current op to match_ctx.
+    const auto& drr_input_tensors = drr_node->inputs();
+    auto ir_input_values = ir_node->operands_source();
     for (size_t i = 0; i < drr_input_tensors.size(); ++i) {
-      if (source_pattern_match_ctx->tensor_map().count(
-              drr_input_tensors[i]->name()) != 0 &&
-          ir_node->operand(i).source() !=
-              source_pattern_match_ctx->tensor_map().at(
-                  drr_input_tensors[i]->name())) {
+      if (HasVisitedOperands(drr_input_tensors[i], ir_input_values[i])) {
         matched = false;
         VLOG(8) << " tensor_map key[" << drr_input_tensors[i]->name()
                 << "] already exists,but value is different!";
         break;
-      } else {
-        source_pattern_match_ctx->BindIrValue(drr_input_tensors[i]->name(),
-                                              ir_node->operand(i).source());
-      }
-
-      if (ir_node->operand_source(i).isa<pir::BlockArgument>()) {
-        VLOG(8) << "Match Attention! Found BlockArgument as input of "
-                << drr_node->name();
       }
-
+      source_pattern_match_ctx->BindIrValue(drr_input_tensors[i]->name(),
+                                            ir_input_values[i]);
+      // Skip it when drr_producer_op is nullptr, i.e. we reached the
+      // pattern boundary.
       auto* drr_producer_op = drr_input_tensors[i]->producer();
       if (drr_producer_op == nullptr) {
         continue;
       }
-
+      // Check whether tensor and value have the same use_count.
       if (drr_input_tensors[i]->consumers().size() !=
-          ir_node->operand(i).source().use_count()) {
+          ir_input_values[i].use_count()) {
         matched = false;
         VLOG(8) << drr_node->name() << " Match failed: consumers of drr intput["
                 << i << "] { " << drr_node->outputs().size()
                 << " } != consumers of pir intput[" << i << "] { "
-                << ir_node->operand(i).source().use_count() << " }.";
+                << ir_input_values[i].use_count() << " }.";
         break;
       }
-      auto* ir_producer_op = ir_node->operand_source(i).defining_op();
-      // bfs producer_op of current_op
-      if (drr_visited.count(drr_producer_op) &&
-          ir_visited.count(ir_producer_op)) {
-        continue;
+      auto* ir_producer_op = ir_input_values[i].defining_op();
+      // Trigger early stop when the operand is a BlockArgument, i.e.
+      // ir_producer_op == nullptr.
+      if (drr_producer_op && ir_producer_op == nullptr) {
+        matched = false;
+        break;
       }
-
-      if (!drr_visited.count(drr_producer_op) &&
-          !ir_visited.count(ir_producer_op)) {
-        drr_q.push(drr_producer_op);
-        ir_q.push(ir_producer_op);
-        drr_visited.insert(drr_producer_op);
-        ir_visited.insert(ir_producer_op);
-      } else {
+      // bfs producer_op of current_op
+      if (!TryUpdateDrrQueue(drr_producer_op, ir_producer_op)) {
         matched = false;
         VLOG(8) << "Match failed: status of visiting for" << drr_node->name()
                 << " is different.";

From 98f48ba2947739636c18e986f5fadfa8f5041cf5 Mon Sep 17 00:00:00 2001
From: feifei-111 <2364819892@qq.com>
Date: Sat, 2 Mar 2024 10:16:32 +0800
Subject: [PATCH 10/15] [SOT] fix bug in llm stable diffusion (#62257)

---
 .../executor/opcode_executor.py               | 19 ++++-
 .../executor/variables/__init__.py            |  2 +-
 .../executor/variables/callable.py            |  6 +-
 .../instruction_utils/opcode_analysis.py      | 74 ++++++++++++-------
 .../paddle/jit/sot/utils/paddle_api_config.py |  1 -
 test/sot/test_break_graph.py                  | 15 ++++
 6 files changed, 82 insertions(+), 35 deletions(-)

diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py
index 3dfa9fb1b733b3..7f28346922d918 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py
@@ -88,6 +88,7 @@
     TensorVariable,
     TupleVariable,
     UserDefinedFunctionVariable,
+    UserDefinedGeneratorFunctionVariable,
     VariableBase,
     VariableFactory,
 )
@@ -1318,11 +1319,21 @@ def g(z=x):
             default_args,
             closure,
         )
-        self.stack.push(
-            UserDefinedFunctionVariable(
-                new_fn, self._graph, DummyTracker(related_list)
+        # new_fn is created with Variables bound into it, so
+        # new_fn.__module__ is a ConstantVariable and we
+        # can not use VariableFactory.from_value
+        if inspect.isgeneratorfunction(new_fn):
+            self.stack.push(
+                UserDefinedGeneratorFunctionVariable(
+                    new_fn, self._graph, DummyTracker(related_list)
+                )
+            )
+        else:
+            self.stack.push(
+                UserDefinedFunctionVariable(
+                    new_fn, self._graph, DummyTracker(related_list)
+                )
             )
-        )
 
     def GET_ITER(self, instr: Instruction):
         source_obj = self.stack.pop()
diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py b/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py
index 
989c23e110abd0..3d53d1fce93dc3 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/variables/__init__.py
@@ -44,7 +44,7 @@
     PaddleApiVariable,
     PaddleLayerVariable,
     UserDefinedFunctionVariable,
-    UserDefinedGeneratorVariable,
+    UserDefinedGeneratorFunctionVariable,
     UserDefinedLayerVariable,
 )
 from .container import (  # noqa: F401
diff --git a/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py b/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py
index 0e6ba7ec1e33fd..1648ebcf79b4d8 100644
--- a/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py
+++ b/python/paddle/jit/sot/opcode_translator/executor/variables/callable.py
@@ -681,9 +681,9 @@ def main_info(self) -> dict[str, Any]:
     }
 
 
-class UserDefinedGeneratorVariable(FunctionVariable):
+class UserDefinedGeneratorFunctionVariable(FunctionVariable):
     """
-    UserDefinedGeneratorVariable is a subclass of FunctionVariable used to wrap a user-defined generator.
+    UserDefinedGeneratorFunctionVariable is a subclass of FunctionVariable used to wrap a user-defined generator.
     Args:
         fn (Callable[..., Any]): The user-defined generator to be wrapped.
         graph(FunctionGraph): The FunctionGraph object that this variable is associated with.
@@ -711,7 +711,7 @@ def main_info(self) -> dict[str, Any]:
     )
     def from_value(value: Any, graph: FunctionGraph, tracker: Tracker):
         if inspect.isgeneratorfunction(value):
-            return UserDefinedGeneratorVariable(value, graph, tracker)
+            return UserDefinedGeneratorFunctionVariable(value, graph, tracker)
         return None
 
 
diff --git a/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py b/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py
index 93722f42c9602a..3d7c1cb7d1f46c 100644
--- a/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py
+++ b/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py
@@ -23,21 +23,19 @@
     ALL_JUMP,
     HAS_FREE,
     HAS_LOCAL,
-    RETURN,
     UNCONDITIONAL_JUMP,
 )
 
 
 @dataclasses.dataclass
-class State:
+class NameRecorder:
     reads: OrderedSet[str]
     writes: OrderedSet[str]
-    visited: OrderedSet[int]
 
     def __or__(self, other):
         reads = self.reads | other.reads
         writes = self.writes | other.writes
-        return State(reads, writes, OrderedSet())
+        return NameRecorder(reads, writes)
 
 
 def is_read_opcode(opname):
@@ -90,46 +88,70 @@ def analysis_used_names(
     Returns:
         State: The analysis result.
     """
-    root_state = State(OrderedSet(), OrderedSet(), OrderedSet())
-
-    def fork(state: State, start: int, jump: bool, jump_target: int) -> State:
+    name_recorder = NameRecorder(OrderedSet(), OrderedSet())
+
+    # The start idx and the writes set decide the analysis result below,
+    # so just check the pair of (idx, writes) to skip repeated simulation
+    # (writes decide whether a name should be added to reads).
+    # One idx can keep multiple writes sets when none is a subset of another;
+    # if A is a subset of B, we just record A, since simulating A might add
+    # more reads.
+    visited_states = {}
+
+    def check_and_update_visited_states(idx, writes):
+        writes = set(writes)
+
+        if idx in visited_states:
+            history = visited_states[idx]
+            for record in history:
+                if record.issubset(writes):
+                    return True
+                elif writes.issubset(record):
+                    history.remove(record)
+            history.append(writes)
+            return False
+        else:
+            visited_states[idx] = [writes]
+
+        return False
+
+    def fork(
+        name_recorder: NameRecorder, start: int, jump: bool, jump_target: int
+    ) -> NameRecorder:
         new_start = start + 1 if not jump else jump_target
-        new_state = State(
-            OrderedSet(state.reads),
-            OrderedSet(state.writes),
-            OrderedSet(state.visited),
+        new_state = NameRecorder(
+            OrderedSet(name_recorder.reads),
+            OrderedSet(name_recorder.writes),
         )
         return walk(new_state, new_start)
 
-    def walk(state: State, start: int) -> State:
+    def walk(name_recorder: NameRecorder, start: int) -> NameRecorder:
         end = len(instructions) if stop_instr_idx is None else stop_instr_idx
         for i in range(start, end):
-            if i in state.visited:
-                return state
-            state.visited.add(i)
+            if check_and_update_visited_states(i, name_recorder.writes):
+                return name_recorder
             instr = instructions[i]
             if instr.opname in HAS_LOCAL | HAS_FREE:
                 if is_read_opcode(instr.opname) and instr.argval not in (
-                    state.writes
+                    name_recorder.writes
                 ):
-                    state.reads.add(instr.argval)
+                    name_recorder.reads.add(instr.argval)
                 elif is_write_opcode(instr.opname):
-                    state.writes.add(instr.argval)
+                    name_recorder.writes.add(instr.argval)
             elif instr.opname in ALL_JUMP:
                 assert instr.jump_to is not None
                 target_idx = instructions.index(instr.jump_to)
                 # Fork to two branches, jump or not
-                jump_branch = fork(state, i, True, target_idx)
+                jump_branch = fork(name_recorder, i, True, target_idx)
                 not_jump_branch = (
-                    fork(state, i, False, target_idx)
+                    fork(name_recorder, i, False, target_idx)
                     if instr.opname not in UNCONDITIONAL_JUMP
-                    else State(OrderedSet(), OrderedSet(), OrderedSet())
+                    else NameRecorder(OrderedSet(), OrderedSet())
                 )
                 return jump_branch | not_jump_branch
-            elif instr.opname in RETURN:
-                return state
-        return state
+            elif instr.opname == "RETURN_VALUE":
+                return name_recorder
+        return name_recorder
 
-    state = walk(root_state, current_instr_idx)
-    return state.reads, state.writes
+    name_recorder = walk(name_recorder, current_instr_idx)
+    return name_recorder.reads, name_recorder.writes
diff --git a/python/paddle/jit/sot/utils/paddle_api_config.py b/python/paddle/jit/sot/utils/paddle_api_config.py
index 8a5cde9e657160..24b58bda9b83b4 100644
--- a/python/paddle/jit/sot/utils/paddle_api_config.py
+++ b/python/paddle/jit/sot/utils/paddle_api_config.py
@@ -82,7 +82,6 @@ def get_paddle_api():
 # considered as paddle module?
paddle_api_module_prefix = { "paddle.nn.functional", - "paddle.nn.layer.activation", } break_graph_set = set() diff --git a/test/sot/test_break_graph.py b/test/sot/test_break_graph.py index b6908f4d229b57..58cab6d48b0a3e 100644 --- a/test/sot/test_break_graph.py +++ b/test/sot/test_break_graph.py @@ -185,5 +185,20 @@ def test_break_graph_in_layer(self): self.assert_results(net.forward, x) +def dummy(*args): + return None + + +def break_graph_call_generator_function(x): + return dummy(y for y in x) + + +class TestBreakGraphCallGeneratorFunction(TestCaseBase): + def test_break_graph_when_call_generator_function(self): + x = paddle.rand([1], dtype=paddle.float32) + y = paddle.rand([1], dtype=paddle.float32) + self.assert_results(break_graph_call_generator_function, [x, y]) + + if __name__ == "__main__": unittest.main() From eabf863247fef18d5d7912817c9a1a95d3ddf23f Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Sat, 2 Mar 2024 11:02:44 +0800 Subject: [PATCH 11/15] [Dy2St][PIR] Add view op to inplace info (#62300) --- paddle/fluid/pybind/pir.cc | 5 ++ test/dygraph_to_static/test_deal_inplace.py | 53 +++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 test/dygraph_to_static/test_deal_inplace.py diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 45fe7263e692c5..d28b2743482016 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -1360,7 +1360,12 @@ std::map GetOpInplaceInfo(const pir::Operation *op) { const std::string &inplace_name = yaml_parser.InplaceName(value_name); inplace_info[i] = yaml_parser.InputName2Id().at(inplace_name); } + if (yaml_parser.HasView(value_name)) { + const std::string &view_name = yaml_parser.ViewName(value_name); + inplace_info[i] = yaml_parser.InputName2Id().at(view_name); + } } + return inplace_info; } diff --git a/test/dygraph_to_static/test_deal_inplace.py b/test/dygraph_to_static/test_deal_inplace.py new file mode 100644 index 00000000000000..3984dd729db0a2 --- /dev/null +++ b/test/dygraph_to_static/test_deal_inplace.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
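+
+# These tests check that GetOpInplaceInfo also records view ops (the
+# batch_norm case) in addition to regular inplace ops (the sigmoid case).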
+ +import unittest + +import numpy as np +from dygraph_to_static_utils import ( + Dy2StTestBase, + test_pir_only, +) + +import paddle + + +def fn_with_inplace_op(inplace_op, x): + y = inplace_op(x) + z = inplace_op(x) + return y + z + + +class TestDealInplace(Dy2StTestBase): + def run_test(self, dygraph_fn, *inputs): + dygraph_out = dygraph_fn(*inputs) + static_fn = paddle.jit.to_static(dygraph_fn) + static_out = static_fn(*inputs) + np.testing.assert_allclose(dygraph_out.numpy(), static_out.numpy()) + + @test_pir_only + def test_deal_view(self): + bn_layer = paddle.nn.BatchNorm2D(10) + x = paddle.to_tensor(np.random.random((2, 10, 3, 3)).astype('float32')) + self.run_test(fn_with_inplace_op, bn_layer, x) + + @test_pir_only + def test_deal_inplace(self): + sigmoid_layer = paddle.nn.Sigmoid() + x = paddle.to_tensor(np.random.random((2, 10, 3, 3)).astype('float32')) + self.run_test(fn_with_inplace_op, sigmoid_layer, x) + + +if __name__ == '__main__': + unittest.main() From 6f608ca9d2c84db75e7bff4ce7a9be9a321a1fba Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Sat, 2 Mar 2024 12:31:30 +0800 Subject: [PATCH 12/15] [PT] Set NCHW as default Layout for type translator (#62263) * [PT] Set NCHW as default Layout for type translator * fix randint * fix typo * fix delt --- .../ir_adaptor/translator/op_translator.cc | 2 +- .../ir_adaptor/translator/type_translator.cc | 89 +++++++++---------- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index bf5acda9c1bbd3..3466c074ed9948 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -2746,7 +2746,7 @@ struct RandIntOpTranscriber : public OpTranscriber { paddle::dialect::DenseTensorTypeStorage::Dim dim = common::make_ddim(var->GetShape()); paddle::dialect::DenseTensorTypeStorage::DataLayout layout = - paddle::dialect::DenseTensorTypeStorage::DataLayout::UNDEFINED; + paddle::dialect::DenseTensorTypeStorage::DataLayout::NCHW; paddle::dialect::DenseTensorTypeStorage::LoD lod = {}; size_t offset = 0; pir::Type translated_var_type = paddle::dialect::DenseTensorType::get( diff --git a/paddle/fluid/ir_adaptor/translator/type_translator.cc b/paddle/fluid/ir_adaptor/translator/type_translator.cc index 7cd297cf46b62d..4378ef5285ceb0 100644 --- a/paddle/fluid/ir_adaptor/translator/type_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/type_translator.cc @@ -30,8 +30,48 @@ using DenseTensorType = paddle::dialect::DenseTensorType; using DenseTensorTypeStorage = paddle::dialect::DenseTensorTypeStorage; using SelectedRowsType = paddle::dialect::SelectedRowsType; using SelectedRowsTypeStorage = paddle::dialect::SelectedRowsTypeStorage; +using DataLayout = DenseTensorTypeStorage::DataLayout; +using LoD = DenseTensorTypeStorage::LoD; TypeTranslator::TypeTranslator() { + const auto& HandleTensor = [&](pir::IrContext* ctx, + const VarDesc& var_desc) -> pir::Type { + VLOG(10) << "[vartype translating]" + << "[" << var_desc.Name() << "] from LOD_TENSOR"; + const pir::Type dtype = + this->operator[](var_desc.GetDataType())(ctx, var_desc); + const auto dim = common::make_ddim(var_desc.GetShape()); + const auto layout = DataLayout::NCHW; + const LoD lod = {}; + const size_t offset = 0; + return DenseTensorType::get(ctx, dtype, dim, layout, lod, offset); + }; + const auto& HandleTensorArray = [&](pir::IrContext* ctx, + const VarDesc& var_desc) -> pir::Type { + VLOG(10) << "[vartype 
translating]"
+             << "[" << var_desc.Name() << "] from LOD_TENSOR_ARRAY";
+    const pir::Type dtype =
+        this->operator[](var_desc.GetDataType())(ctx, var_desc);
+    const auto dims = common::make_ddim(var_desc.GetShape());
+    const auto layout = DataLayout::NCHW;
+    return paddle::dialect::DenseTensorArrayType::get(ctx, dtype, dims, layout);
+  };
+
+  const auto& HandleSelectedRows = [&](pir::IrContext* ctx,
+                                       const VarDesc& var_desc) -> pir::Type {
+    VLOG(10) << "[vartype translating]"
+             << "[" << var_desc.Name() << "] from SELECTED_ROWS";
+    const pir::Type dtype =
+        this->operator[](var_desc.GetDataType())(ctx, var_desc);
+    const auto dim = common::make_ddim(var_desc.GetShape());
+    const auto layout = DataLayout::NCHW;
+    const LoD lod = {};
+    const size_t offset = 0;
+    pir::Type SelectedRows =
+        SelectedRowsType::get(ctx, dtype, dim, layout, lod, offset);
+    return SelectedRows;
+  };
+
   handlers = {
       {VarType::BOOL,
        [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
@@ -81,52 +121,9 @@ TypeTranslator::TypeTranslator() {
        [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
          return pir::Complex128Type::get(ctx);
        }},
-      {VarType::LOD_TENSOR,
-       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
-         VLOG(10) << "[vartype translating]"
-                  << "[" << var_desc.Name() << "] from LOD_TENSOR";
-
-         pir::Type dtype =
-             this->operator[](var_desc.GetDataType())(ctx, var_desc);
-         DenseTensorTypeStorage::Dim dim =
-             common::make_ddim(var_desc.GetShape());
-         DenseTensorTypeStorage::DataLayout layout =
-             DenseTensorTypeStorage::DataLayout::UNDEFINED;
-         DenseTensorTypeStorage::LoD lod = {};
-         size_t offset = 0;
-         return DenseTensorType::get(ctx, dtype, dim, layout, lod, offset);
-       }},
-      {VarType::LOD_TENSOR_ARRAY,
-       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
-         VLOG(10) << "[vartype translating]"
-                  << "[" << var_desc.Name() << "] from LOD_TENSOR_ARRAY";
-         pir::Type dtype =
-             this->operator[](var_desc.GetDataType())(ctx, var_desc);
-         phi::DDim dims = common::make_ddim(var_desc.GetShape());
-         DenseTensorTypeStorage::DataLayout layout =
-             DenseTensorTypeStorage::DataLayout::UNDEFINED;
-
-         return paddle::dialect::DenseTensorArrayType::get(
-             ctx, dtype, dims, layout);
-       }},
-      {VarType::SELECTED_ROWS,
-       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
-         VLOG(10) << "[vartype translating]"
-                  << "[" << var_desc.Name() << "] from SELECTED_ROWS";
-
-         pir::Type dtype =
-             this->operator[](var_desc.GetDataType())(ctx, var_desc);
-
-         SelectedRowsTypeStorage::Dim dim =
-             common::make_ddim(var_desc.GetShape());
-         SelectedRowsTypeStorage::DataLayout layout =
-             SelectedRowsTypeStorage::DataLayout::UNDEFINED;
-         SelectedRowsTypeStorage::LoD lod = {};
-         size_t offset = 0;
-         pir::Type SelectedRows =
-             SelectedRowsType::get(ctx, dtype, dim, layout, lod, offset);
-         return SelectedRows;
-       }},
+      {VarType::LOD_TENSOR, HandleTensor},
+      {VarType::LOD_TENSOR_ARRAY, HandleTensorArray},
+      {VarType::SELECTED_ROWS, HandleSelectedRows},
   };
 }
 

From 94018aecdeddb4169232655631f5b1cc762f8c8f Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Sat, 2 Mar 2024 12:38:16 +0800
Subject: [PATCH 13/15] [CINN]Fix group op attribute hash bug (#62309)

* fix group op attribute hash bug

* fix bug

---
 paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h   | 5 +++++
 .../dialect/operator/transforms/cinn_group_cluster_pass.cc | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h 
b/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h index 61a2ae3268e05d..d338dcd84b04d1 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h +++ b/paddle/cinn/hlir/dialect/operator/ir/attribute_storage.h @@ -71,6 +71,11 @@ struct GroupInfoAttributeStorage : public pir::AttributeStorage { static std::size_t HashValue(const ParamKey& key) { size_t hash_value = std::hash{}(key.group_id); + for (auto op : key.ops) { + hash_value = + pir::detail::hash_combine(hash_value, std::hash()(op)); + } + for (auto d : key.loop_ranges) { hash_value = pir::detail::hash_combine(hash_value, std::hash()(d)); diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc index 9f9856004646fc..f0069a55a4cdee 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc @@ -252,7 +252,7 @@ cinn::dialect::GroupInfo BuildGroupInfo( const GroupClusterNode& node, const std::unordered_map<::pir::Operation*, std::vector>& new_align_info) { - cinn::dialect::GroupInfo group_info({}); + cinn::dialect::GroupInfo group_info(vec_new_op_list); group_info.group_id = BuildGroupId(vec_new_op_list); group_info.loop_ranges = node.loop_ranges; group_info.reduce_axis = node.reduce_axis; From 8b4219b0b84b42df40ebb439440ce5445d769884 Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Sat, 2 Mar 2024 15:10:35 +0800 Subject: [PATCH 14/15] add argmax & argmin (#62312) --- .../infer_symbolic_shape/infer_sym_utils.h | 3 + .../infer_symbolic_shape.h | 1 + .../paddle_op_infer_sym.cc | 13 -- .../paddle_op_infer_sym.h | 5 - .../infer_symbolic_shape/unary_infer_sym.cc | 77 ++++++++++++ .../infer_symbolic_shape/unary_infer_sym.h | 26 ++++ .../pir/transforms/shape_optimization_pass.cc | 4 +- .../symbolic/test_unary_op_infer_sym_shape.py | 112 ++++++++++++++++++ 8 files changed, 220 insertions(+), 21 deletions(-) create mode 100644 paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc create mode 100644 paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h create mode 100644 test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h index d2d508ff5890db..f5193b3f7ff5b5 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h @@ -17,6 +17,9 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" +#define GET_BOOL_ATTR(op, str) \ + op->attributes().at(str).dyn_cast().data(); + // To make codes shorter using ExprVec = std::vector; using ShapeOrData = symbol::ShapeOrDataDimExprs; diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h index 4e1946acd75f1a..515eaaca1b3484 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h @@ -18,6 +18,7 @@ #include 
"paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h" +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" // Type inference is currently modelled executionally for operation creation diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index 1be26c82f4c21a..d7ee4fb6781b0f 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -1174,19 +1174,6 @@ bool SplitOpInferSymbolicShape(pir::Operation *op, // Not Impelmented Ops. -bool ArgmaxOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} -bool ArgminOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - bool AsComplexOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h index cf5e650023fa95..f23e84c27f55d5 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h @@ -114,11 +114,6 @@ bool SplitOpInferSymbolicShape(pir::Operation *op, // Not Impelmented Ops. -bool ArgmaxOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis); -bool ArgminOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis); - bool AsComplexOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool AsRealOpInferSymbolicShape(pir::Operation *op, diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc new file mode 100644 index 00000000000000..d82fc12521998a --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h" +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h" +// #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" + +namespace paddle::dialect { + +bool ArgmaxOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + bool flatten = GET_BOOL_ATTR(op, "flatten"); + bool keepdims = GET_BOOL_ATTR(op, "keepdims"); + + const auto &input_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)); + + const auto &axis_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(1)); + int axis = + static_cast(axis_shape_or_data.data().value()[0].Get()); + + const std::vector &input_sym_shape = + input_shape_or_data.data().has_value() + ? input_shape_or_data.data().value() + : input_shape_or_data.shape(); + + int rank = input_sym_shape.size(); + if (axis < 0) axis += rank; + + const auto &out_sym_shape = [&] { + std::vector out_sym_shape; + if (flatten) { + if (keepdims) { + out_sym_shape.emplace_back(std::int64_t(rank)); + } else { + out_sym_shape.emplace_back(std::int64_t(0)); + } + } else { + for (int i = 0; i < axis; i++) { + out_sym_shape.emplace_back(input_sym_shape[i]); + } + if (keepdims) { + out_sym_shape.emplace_back(std::int64_t(1)); + } + + for (int i = axis + 1; i < rank; i++) { + out_sym_shape.emplace_back(input_sym_shape[i]); + } + } + return out_sym_shape; + }(); + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(out_sym_shape)}; + + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + return true; +} + +bool ArgminOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return ArgmaxOpInferSymbolicShape(op, shape_analysis); +} + +} // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h new file mode 100644 index 00000000000000..832a6a7a074c36 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h @@ -0,0 +1,26 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" + +namespace paddle::dialect { + +bool ArgmaxOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); +bool ArgminOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); + +} // namespace paddle::dialect diff --git a/paddle/fluid/pir/transforms/shape_optimization_pass.cc b/paddle/fluid/pir/transforms/shape_optimization_pass.cc index d9cf96f78efe99..85f4a5a5eef498 100644 --- a/paddle/fluid/pir/transforms/shape_optimization_pass.cc +++ b/paddle/fluid/pir/transforms/shape_optimization_pass.cc @@ -23,7 +23,7 @@ COMMON_DECLARE_bool(pir_apply_shape_optimization_pass); -const int vlog_level = 3; +constexpr int vlog_level = 3; namespace pir { namespace { @@ -144,8 +144,6 @@ void InferSymExprForBlock(const Block& block, &op, shape_analysis->GetShapeOrDataForValue(op.result(0))); } } else { - VLOG(vlog_level) << op.name() + - " DOES NOT have InferSymbolicShapeInterface!"; PADDLE_THROW(phi::errors::Unimplemented( op.name() + " DOES NOT have InferSymbolicShapeInterface!")); } diff --git a/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py b/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py new file mode 100644 index 00000000000000..5260475b45f1e8 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
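+
+# Builds a net that calls paddle.argmax / paddle.argmin and checks the
+# symbolic shapes inferred for pd_op.argmax and pd_op.argmin.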
+ +import unittest + +import numpy as np + +import paddle +from paddle.static import InputSpec + + +def get_sym_shape_str_for_op(net, input_spec, op_name='builtin.shadow_output'): + forward_program = net.forward.get_concrete_program(*input_spec)[ + 1 + ].infer_program.forward_program + all_sym_shape_str = [] + for op in forward_program.global_block().ops: + if op.name() == op_name: + all_sym_shape_str.append(op.attrs()['sym_shape_str']) + + return all_sym_shape_str + + +def apply_to_static(net, use_cinn, input_spec=None): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static( + net, + input_spec=input_spec, + build_strategy=build_strategy, + full_graph=True, + ) + + +class TestBase(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + self.prepare_data() + + def prepare_data(self): + pass + + def test_eval_symbolic(self): + pass + + +class ArgMaxMinNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + argmax_out = paddle.argmax(x) + argmin_out = paddle.argmin(x, axis=-1) + return argmax_out, argmin_out + + +class TestArgMaxMinOpInferSymbolicShape(TestBase): + def prepare_data(self): + self.cases = [np.random.rand(4, 5, 6)] + self.expected = [ + [ + 'shape[0], data[NULL]', + 'shape[S0, S1], data[NULL]', + ] + ] + + def test_eval_symbolic(self): + net = ArgMaxMinNet() + + for i in range(len(self.cases)): + x = self.cases[i] + x_spec = InputSpec( + shape=[None for index in range(len(x.shape))], dtype='float32' + ) + + input_spec = [x_spec] + net = apply_to_static(net, False, input_spec) + net.eval() + + # check the infer result + sym_shape_str_list = get_sym_shape_str_for_op( + net, input_spec, 'pd_op.argmax' + ) + sym_shape_str_list += get_sym_shape_str_for_op( + net, input_spec, 'pd_op.argmin' + ) + np.testing.assert_equal( + len(sym_shape_str_list), len(self.expected[i]) + ) + for j in range(len(sym_shape_str_list)): + np.testing.assert_equal( + sym_shape_str_list[j].find(self.expected[i][j]), + 0, + f'in case i,j = {i},{j}: output shape ({sym_shape_str_list[0]}) is not expected {(self.expected[i][j])}', + ) + + return True + + +if __name__ == '__main__': + unittest.main() From 6fccb8f20c283abcbf28d0ed7e82be9c83e7ce45 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Sat, 2 Mar 2024 17:09:09 +0800 Subject: [PATCH 15/15] [CINN] uniform all the 0 and reduce deleted axis (#61608) * uniform all the 0 and reduce deleted axis * remove one shape for keepdim cases. 
* fix by code review * fix some error in 0d format --- paddle/cinn/ast_gen_ius/ast_gen.cc | 86 +++++++++++++++++++++++++----- paddle/cinn/hlir/pe/reduction.cc | 8 +++ paddle/cinn/ir/ir.cc | 5 +- paddle/cinn/ir/ir.h | 15 ++++-- paddle/cinn/lang/compute.cc | 7 +++ paddle/cinn/pybind/ir/ir_api.cc | 1 + paddle/cinn/runtime/flags.cc | 4 ++ 7 files changed, 107 insertions(+), 19 deletions(-) diff --git a/paddle/cinn/ast_gen_ius/ast_gen.cc b/paddle/cinn/ast_gen_ius/ast_gen.cc index 009158d3f9cce2..57b10fb7ca8849 100644 --- a/paddle/cinn/ast_gen_ius/ast_gen.cc +++ b/paddle/cinn/ast_gen_ius/ast_gen.cc @@ -22,6 +22,7 @@ #include "paddle/cinn/optim/replace_var_with_expr.h" PD_DECLARE_bool(cinn_new_group_scheduler); +PD_DECLARE_bool(group_schedule_tiling_first); PD_DECLARE_bool(cinn_bucket_compile); namespace cinn { @@ -93,9 +94,21 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { std::vector iter_values; // reduce body and reduce init schedule block should have different objects // for same axis so we re-create objects + VLOG(4) << "FLAGS_group_schedule_tiling_first = " + << FLAGS_group_schedule_tiling_first; std::vector axis_vars = cinn::common::GenDefaultAxis(axis_len); + const std::vector& reduce_axis = tensor->reduce_axis; + VLOG(4) << "ast gen: tensor init_body is " << init_body; for (int i = 0; i < shape.size(); ++i) { - if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { + bool is_keep_dim = axis[i]->is_keepdim; + if (FLAGS_group_schedule_tiling_first && is_keep_dim) { + // if tiling first, we need to replace the reduce axis with 0, but don't + // deal with the non-reduce axis + optim::ReplaceVarWithExpr(&init_body, axis[i], Expr(0)); + continue; + } + if (!FLAGS_group_schedule_tiling_first && + FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { optim::ReplaceVarWithExpr(&init_body, axis[i], Expr(0)); continue; } @@ -105,21 +118,25 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { /*is_reduce = */ false)); optim::ReplaceVarWithExpr(&init_body, axis[i], block_vars.back()); axis_vars[i]->is_reduce_axis = false; - if (shape[i] == Expr(1)) { + if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) { iter_values.push_back(Expr(0)); } else { iter_values.push_back(axis_vars[i]); } } + VLOG(4) << "iter_value.size() and block_vars.size() is " + << iter_values.size() << " " << block_vars.size(); init_body = ir::ScheduleBlockRealize::Make( iter_values, ir::ScheduleBlock::Make( block_vars, {}, {}, reduce_init_name, init_body)); // For the remaining reduce axis, make reduce body - const std::vector& reduce_axis = tensor->reduce_axis; ir::Expr reduce_body = ConvertReduceBody(tensor->body(), tensor, axis_exprs); + + VLOG(4) << "ast gen: reduce body is " << reduce_body; + // create schedule block itervars, i0,i1... 
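+  // (vars for the kept data axes come first; vars for the reduce axes are
+  // appended right after them below)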
std::vector reduce_block_vars; std::vector reduce_iter_values; @@ -127,7 +144,15 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { // for same axis so we re-create objects std::vector reduce_axis_vars = cinn::common::GenDefaultAxis(axis_len); for (int i = 0; i < shape.size(); ++i) { - if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { + bool is_keep_dim = axis[i]->is_keepdim; + if (FLAGS_group_schedule_tiling_first && is_keep_dim) { + // if tiling first, we need to replace the reduce axis with 0, but don't + // deal with the non-reduce axis + optim::ReplaceVarWithExpr(&reduce_body, axis[i], Expr(0)); + continue; + } + if (!FLAGS_group_schedule_tiling_first && + FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { optim::ReplaceVarWithExpr(&reduce_body, axis[i], Expr(0)); continue; } @@ -136,12 +161,13 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { cinn::UniqName("i" + std::to_string(i)), /*is_reduce = */ false)); reduce_axis_vars[i]->is_reduce_axis = false; - if (shape[i] == Expr(1)) { + if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) { reduce_iter_values.push_back(Expr(0)); } else { reduce_iter_values.push_back(axis_vars[i]); } } + VLOG(4) << "ast gen: reduce body is after replace 0" << reduce_body; for (int i = 0; i < reduce_axis.size(); ++i) { int count = shape.size() + i; reduce_block_vars.push_back( @@ -155,14 +181,43 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { } int non_zero_axis_size = 0; - for (int i = 0; i < axis.size(); ++i) { - if (FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { - continue; + if (FLAGS_group_schedule_tiling_first) { + std::vector non_reduce_axis_vars = [&]() { + std::vector res; + for (int i = 0; i < shape.size(); ++i) { + bool is_keep_dim = axis[i]->is_keepdim; + if (!is_keep_dim) { + res.push_back(axis[i]); + } + } + return res; + }(); + for (int i = 0; i < non_reduce_axis_vars.size(); ++i) { + optim::ReplaceVarWithExpr( + &reduce_body, non_reduce_axis_vars[i], reduce_block_vars[i]); + ++non_zero_axis_size; } - optim::ReplaceVarWithExpr( - &reduce_body, axis[i], reduce_block_vars[non_zero_axis_size]); - ++non_zero_axis_size; + } else { + for (int i = 0; i < axis.size(); ++i) { + if (!FLAGS_group_schedule_tiling_first && + FLAGS_cinn_new_group_scheduler && shape[i] == Expr(1)) { + continue; + } + optim::ReplaceVarWithExpr( + &reduce_body, axis[i], reduce_block_vars[non_zero_axis_size]); + ++non_zero_axis_size; + } + } + + VLOG(4) << "to replace : " << non_zero_axis_size << " " + << reduce_block_vars.size(); + for (auto i = 0; i < reduce_block_vars.size(); i++) { + VLOG(4) << "reduce_block_vars[" << i << "] = " << reduce_block_vars[i]; + } + for (auto i = 0; i < reduce_axis.size(); i++) { + VLOG(4) << "reduce_axis[" << i << "] = " << reduce_axis[i]; } + VLOG(4) << "before replace body: " << reduce_body; for (int i = non_zero_axis_size; i < reduce_block_vars.size(); ++i) { optim::ReplaceVarWithExpr(&reduce_body, reduce_axis[i - non_zero_axis_size], @@ -185,7 +240,12 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { // Put the two parts together ir::Expr body = ir::Block::Make({init_body, reduce_body}); for (int i = static_cast(axis_len) - 1; i >= 0; --i) { - if (!FLAGS_cinn_bucket_compile && shape[i] == Expr(1)) { + bool is_keep_dim = axis[i]->is_keepdim; + if (FLAGS_group_schedule_tiling_first && is_keep_dim) { + continue; + } + if (!FLAGS_group_schedule_tiling_first && 
!FLAGS_cinn_bucket_compile && + shape[i] == Expr(1)) { continue; } ir::Var loop_var = axis[i]; @@ -210,7 +270,7 @@ ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) { Expr(0), shape[i], cinn::UniqName("i" + std::to_string(i)), false)); optim::ReplaceVarWithExpr(&body, axis[i], block_vars[i]); axis_vars[i]->is_reduce_axis = false; - if (shape[i] == Expr(1)) { + if (!FLAGS_group_schedule_tiling_first && shape[i] == Expr(1)) { iter_values.push_back(Expr(0)); } else { iter_values.push_back(axis_vars[i]); diff --git a/paddle/cinn/hlir/pe/reduction.cc b/paddle/cinn/hlir/pe/reduction.cc index 7e33a1475e48b3..605a1b3d6443fe 100644 --- a/paddle/cinn/hlir/pe/reduction.cc +++ b/paddle/cinn/hlir/pe/reduction.cc @@ -166,6 +166,14 @@ Tensor DoReduce(const Tensor& tensor, int indice_cnt = 0; int reduce_cnt = 0; + // Set keepdim flags of indices. + if (tensor->shape.size() == indices.size()) { + for (const auto& i : real_axes) { + VLOG(4) << "Set is_keepdim = true for var(" << i << ")"; + indices[i].as_var_ref()->is_keepdim = true; + } + } + for (size_t i = 0; i < tensor->shape.size(); ++i) { bool squeeze_i = std::find(squeeze_axes.begin(), squeeze_axes.end(), i) != squeeze_axes.end(); diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc index 2e194200d19937..f3c64790551cac 100644 --- a/paddle/cinn/ir/ir.cc +++ b/paddle/cinn/ir/ir.cc @@ -218,11 +218,13 @@ Expr _Var_::Make(Expr lower_bound, Expr upper_bound, const std::string &name, bool is_reduce_axis, - bool is_symbolic_constant) { + bool is_symbolic_constant, + bool is_keepdim) { auto *n = make_shared<_Var_>(); n->lower_bound = lower_bound; n->upper_bound = upper_bound; n->is_reduce_axis = is_reduce_axis; + n->is_keepdim = is_keepdim; n->is_symbolic_constant = is_symbolic_constant; n->name = name; n->set_type(lower_bound.type()); @@ -233,6 +235,7 @@ Expr _Var_::Copy() const { auto *n = make_shared<_Var_>(); n->name = name; n->is_reduce_axis = is_reduce_axis; + n->is_keepdim = is_keepdim; n->lower_bound = lower_bound; n->upper_bound = upper_bound; n->set_type(type()); diff --git a/paddle/cinn/ir/ir.h b/paddle/cinn/ir/ir.h index c02517f9836fc3..5a1f9f6a1f739f 100644 --- a/paddle/cinn/ir/ir.h +++ b/paddle/cinn/ir/ir.h @@ -381,6 +381,7 @@ struct _Var_ : public ExprNode<_Var_> { std::string name; bool is_reduce_axis{false}; + bool is_keepdim{false}; bool is_symbolic_constant{false}; //! Lower bound and upper bound of a axis. 
// @{ @@ -401,7 +402,8 @@ struct _Var_ : public ExprNode<_Var_> { Expr upper_bound, const std::string& name, bool is_reduce, - bool is_symbolic_constant = false); + bool is_symbolic_constant = false, + bool is_keepdim = false); void Verify() const override; @@ -419,12 +421,14 @@ struct Var : public IrNodeRef { Var(Expr lower_bound, Expr upper_bound, const std::string& name, - bool is_reduce = false) - : Var(_Var_::Make(lower_bound, upper_bound, name, is_reduce)) {} + bool is_reduce = false, + bool is_keepdim = false) + : Var(_Var_::Make( + lower_bound, upper_bound, name, is_reduce, false, is_keepdim)) {} Var(int upper_bound, const std::string& name) - : Var(_Var_::Make(Expr(0), Expr(upper_bound), name, false)) {} + : Var(_Var_::Make(Expr(0), Expr(upper_bound), name, false, false)) {} Var(Expr upper_bound, const std::string& name) - : Var(_Var_::Make(Expr(0), upper_bound, name, false)) {} + : Var(_Var_::Make(Expr(0), upper_bound, name, false, false)) {} operator Expr() { return Expr(get()); } operator Expr() const { @@ -977,6 +981,7 @@ struct ScheduleBlock : public ExprNode { std::map attrs; std::string name; Expr body; + int32_t reduce_type{-1}; // 0 for warp reduce, 1 for block reduce static Expr Make(const std::vector& iter_vars, const std::vector& read_buffers, diff --git a/paddle/cinn/lang/compute.cc b/paddle/cinn/lang/compute.cc index 4828eaac64e13c..bd195fd26a6390 100644 --- a/paddle/cinn/lang/compute.cc +++ b/paddle/cinn/lang/compute.cc @@ -187,6 +187,13 @@ ir::Tensor Compute(const std::vector &domain, domain_without_reduce_axis, op, reduce_axis); + const auto set_keep_dim_for_tensor = [&]() { + for (int i = 0; i < _axis.size(); ++i) { + const auto &axis_var = _axis.at(i); + tensor->axis_[i]->is_keepdim = axis_var.as_var_ref()->is_keepdim; + } + }; + set_keep_dim_for_tensor(); return tensor; } diff --git a/paddle/cinn/pybind/ir/ir_api.cc b/paddle/cinn/pybind/ir/ir_api.cc index 56dff498dd7101..efebf1206a8674 100644 --- a/paddle/cinn/pybind/ir/ir_api.cc +++ b/paddle/cinn/pybind/ir/ir_api.cc @@ -383,6 +383,7 @@ void BindIrIr(py::module *m) { ir::Expr, const std::string &, bool, + bool, bool>(&ir::_Var_::Make)) .def("copy", &ir::_Var_::Copy); diff --git a/paddle/cinn/runtime/flags.cc b/paddle/cinn/runtime/flags.cc index 89512913e8fa98..c9f0760d43e80b 100644 --- a/paddle/cinn/runtime/flags.cc +++ b/paddle/cinn/runtime/flags.cc @@ -69,6 +69,10 @@ PD_DEFINE_bool(cinn_bucket_compile, BoolFromEnv("FLAGS_cinn_bucket_compile", false), "Whether to enable bucket compile for dynamic shape."); +PD_DEFINE_bool(group_schedule_tiling_first, + BoolFromEnv("FLAGS_group_schedule_tiling_first", false), + "Whether to enable new group scheduler tiling first strategy."); + PD_DEFINE_bool(cinn_use_common_subexpression_elimination, BoolFromEnv("FLAGS_cinn_use_common_subexpression_elimination", false),